aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/PowerPC/README_ALTIVEC.txt
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/PowerPC/README_ALTIVEC.txt')
-rw-r--r--lib/Target/PowerPC/README_ALTIVEC.txt104
1 files changed, 104 insertions, 0 deletions
diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt
index 1e4c6fb..43d87d3 100644
--- a/lib/Target/PowerPC/README_ALTIVEC.txt
+++ b/lib/Target/PowerPC/README_ALTIVEC.txt
@@ -209,3 +209,107 @@ vector float f(vector float a, vector float b) {
return b;
}
+//===----------------------------------------------------------------------===//
+
+We should do a little better with eliminating dead stores.
+The stores to the stack are dead since %a and %b are not needed
+
+; Function Attrs: nounwind
+define <16 x i8> @test_vpmsumb() #0 {
+ entry:
+ %a = alloca <16 x i8>, align 16
+ %b = alloca <16 x i8>, align 16
+ store <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, <16 x i8>* %a, align 16
+ store <16 x i8> <i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127, i8 112>, <16 x i8>* %b, align 16
+ %0 = load <16 x i8>* %a, align 16
+ %1 = load <16 x i8>* %b, align 16
+ %2 = call <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8> %0, <16 x i8> %1)
+ ret <16 x i8> %2
+}
+
+
+; Function Attrs: nounwind readnone
+declare <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8>, <16 x i8>) #1
+
+
+Produces the following code with -mtriple=powerpc64-unknown-linux-gnu:
+# BB#0: # %entry
+ addis 3, 2, .LCPI0_0@toc@ha
+ addis 4, 2, .LCPI0_1@toc@ha
+ addi 3, 3, .LCPI0_0@toc@l
+ addi 4, 4, .LCPI0_1@toc@l
+ lxvw4x 0, 0, 3
+ addi 3, 1, -16
+ lxvw4x 35, 0, 4
+ stxvw4x 0, 0, 3
+ ori 2, 2, 0
+ lxvw4x 34, 0, 3
+ addi 3, 1, -32
+ stxvw4x 35, 0, 3
+ vpmsumb 2, 2, 3
+ blr
+ .long 0
+ .quad 0
+
+The two stxvw4x instructions are not needed.
+With -mtriple=powerpc64le-unknown-linux-gnu, the associated permutes
+are present too.
+
+//===----------------------------------------------------------------------===//
+
+The following example is found in test/CodeGen/PowerPC/vec_add_sub_doubleword.ll:
+
+define <2 x i64> @increment_by_val(<2 x i64> %x, i64 %val) nounwind {
+ %tmpvec = insertelement <2 x i64> <i64 0, i64 0>, i64 %val, i32 0
+ %tmpvec2 = insertelement <2 x i64> %tmpvec, i64 %val, i32 1
+ %result = add <2 x i64> %x, %tmpvec2
+ ret <2 x i64> %result
+
+This will generate the following instruction sequence:
+ std 5, -8(1)
+ std 5, -16(1)
+ addi 3, 1, -16
+ ori 2, 2, 0
+ lxvd2x 35, 0, 3
+ vaddudm 2, 2, 3
+ blr
+
+This will almost certainly cause a load-hit-store hazard.
+Since val is a value parameter, it should not need to be saved onto
+the stack, unless it's being done set up the vector register. Instead,
+it would be better to splat teh value into a vector register, and then
+remove the (dead) stores to the stack.
+
+//===----------------------------------------------------------------------===//
+
+At the moment we always generate a lxsdx in preference to lfd, or stxsdx in
+preference to stfd. When we have a reg-immediate addressing mode, this is a
+poor choice, since we have to load the address into an index register. This
+should be fixed for P7/P8.
+
+//===----------------------------------------------------------------------===//
+
+Right now, ShuffleKind 0 is supported only on BE, and ShuffleKind 2 only on LE.
+However, we could actually support both kinds on either endianness, if we check
+for the appropriate shufflevector pattern for each case ... this would cause
+some additional shufflevectors to be recognized and implemented via the
+"swapped" form.
+
+//===----------------------------------------------------------------------===//
+
+There is a utility program called PerfectShuffle that generates a table of the
+shortest instruction sequence for implementing a shufflevector operation on
+PowerPC. However, this was designed for big-endian code generation. We could
+modify this program to create a little endian version of the table. The table
+is used in PPCISelLowering.cpp, PPCTargetLowering::LOWERVECTOR_SHUFFLE().
+
+//===----------------------------------------------------------------------===//
+
+Opportunies to use instructions from PPCInstrVSX.td during code gen
+ - Conversion instructions (Sections 7.6.1.5 and 7.6.1.6 of ISA 2.07)
+ - Scalar comparisons (xscmpodp and xscmpudp)
+ - Min and max (xsmaxdp, xsmindp, xvmaxdp, xvmindp, xvmaxsp, xvminsp)
+
+Related to this: we currently do not generate the lxvw4x instruction for either
+v4f32 or v4i32, probably because adding a dag pattern to the recognizer requires
+a single target type. This should probably be addressed in the PPCISelDAGToDAG logic.