Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s')
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s  228
1 file changed, 228 insertions(+), 0 deletions(-)
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
new file mode 100755
index 0000000..babe8ad
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
@@ -0,0 +1,228 @@
+;//
+;//
+;// File Name: armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+ EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+
+DEBUG_ON SETL {FALSE}
+
+ IF CortexA8
+
+ M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11
+
+;// Declare input registers
+pSrc RN 0
+srcStep RN 1
+pDst RN 2
+dstStep RN 3
+
+;// Declare Neon registers
+dCoeff5 DN 30.S16
+dCoeff20 DN 31.S16
+
+qSrcA01 QN 11.U8
+qSrcB01 QN 12.U8
+qSrcC01 QN 13.U8
+qSrcD01 QN 14.U8
+
+dSrcA0 DN 22.U8
+dSrcA1 DN 23.U8
+dSrcB0 DN 24.U8
+dSrcB1 DN 25.U8
+dSrcC0 DN 26.U8
+dSrcC1 DN 27.U8
+dSrcD0 DN 28.U8
+dSrcD1 DN 29.U8
+
+dSrcb DN 12.U8
+dSrce DN 13.U8
+dSrcf DN 10.U8
+
+dSrc0c DN 14.U8
+dSrc1c DN 16.U8
+dSrc2c DN 18.U8
+dSrc3c DN 20.U8
+
+dSrc0d DN 15.U8
+dSrc1d DN 17.U8
+dSrc2d DN 19.U8
+dSrc3d DN 21.U8
+
+qTemp01 QN 4.S16
+qTemp23 QN 6.S16
+dTemp0 DN 8.S16
+dTemp2 DN 12.S16
+
+qRes01 QN 11.S16
+qRes23 QN 12.S16
+qRes45 QN 13.S16
+qRes67 QN 14.S16
+
+dRes0 DN 22.S16
+dRes2 DN 24.S16
+dRes4 DN 26.S16
+dRes6 DN 28.S16
+
+dAcc0 DN 22.U8
+dAcc2 DN 24.U8
+dAcc4 DN 26.U8
+dAcc6 DN 28.U8
+
+dResult0 DN 22.U32
+dResult2 DN 24.U32
+dResult4 DN 26.U32
+dResult6 DN 28.U32
+
+ VLD1 qSrcA01, [pSrc], srcStep ;// Load A register [a0 a1 a2 a3 ..]
+ ;// One cycle stall
+ VEXT dSrcf, dSrcA0, dSrcA1, #5 ;// [f0 f1 f2 f3 ..]
+ VEXT dSrcb, dSrcA0, dSrcA1, #1 ;// [b0 b1 b2 b3 ..]
+; VLD1 qSrcB01, [pSrc], srcStep ;// Load B register [b0 b1 b2 b3 ..]
+ VEXT dSrc0c, dSrcA0, dSrcA1, #2
+ VEXT dSrc0d, dSrcA0, dSrcA1, #3
+ VEXT dSrce, dSrcA0, dSrcA1, #4
+ VADDL qRes01, dSrcA0, dSrcf ;// Acc=a+f
+ VADDL qTemp01, dSrc0c, dSrc0d ;// c+d
+ VADDL qTemp23, dSrcb, dSrce ;// b+e
+
+ VLD1 qSrcB01, [pSrc], srcStep ;// Load B register [b0 b1 b2 b3 ..]
+; VLD1 qSrcC01, [pSrc], srcStep ;// Load C register [c0 c1 c2 c3 ..]
+ VMLA dRes0, dTemp0, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes0, dTemp2, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTemp0, dTemp2, dCoeff5 ;// 5*(b+e); VMLS split into VMUL+VSUB
+
+ VEXT dSrcf, dSrcB0, dSrcB1, #5 ;// [f0 f1 f2 f3 ..]
+ VEXT dSrcb, dSrcB0, dSrcB1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrc1c, dSrcB0, dSrcB1, #2
+ VEXT dSrc1d, dSrcB0, dSrcB1, #3
+ VEXT dSrce, dSrcB0, dSrcB1, #4
+ VADDL qRes23, dSrcB0, dSrcf ;// Acc=a+f
+
+ VSUB dRes0, dRes0, dTemp0 ;// Acc -= 5*(b+e)
+
+ VADDL qTemp01, dSrc1c, dSrc1d ;// c+d
+ VADDL qTemp23, dSrcb, dSrce ;// b+e
+
+ VLD1 qSrcC01, [pSrc], srcStep ;// Load C register [c0 c1 c2 c3 ..]
+; VLD1 qSrcD01, [pSrc], srcStep ;// Load D register [d0 d1 d2 d3 ..]
+
+ VMLA dRes2, dTemp0, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes2, dTemp2, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTemp0, dTemp2, dCoeff5 ;// 5*(b+e); VMLS split into VMUL+VSUB
+
+ VEXT dSrcf, dSrcC0, dSrcC1, #5 ;// [f0 f1 f2 f3 ..]
+ VEXT dSrcb, dSrcC0, dSrcC1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrc2c, dSrcC0, dSrcC1, #2
+ VEXT dSrc2d, dSrcC0, dSrcC1, #3
+ VEXT dSrce, dSrcC0, dSrcC1, #4
+ VADDL qRes45, dSrcC0, dSrcf ;// Acc=a+f
+
+ VSUB dRes2, dRes2, dTemp0 ;// Acc -= 5*(b+e)
+
+ VADDL qTemp01, dSrc2c, dSrc2d ;// c+d
+ VADDL qTemp23, dSrcb, dSrce ;// b+e
+
+ VLD1 qSrcD01, [pSrc], srcStep ;// Load D register [d0 d1 d2 d3 ..]
+
+ VMLA dRes4, dTemp0, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes4, dTemp2, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTemp0, dTemp2, dCoeff5 ;// 5*(b+e); VMLS split into VMUL+VSUB
+
+
+ VEXT dSrcf, dSrcD0, dSrcD1, #5 ;// [f0 f1 f2 f3 ..]
+ VEXT dSrcb, dSrcD0, dSrcD1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrc3c, dSrcD0, dSrcD1, #2
+ VEXT dSrc3d, dSrcD0, dSrcD1, #3
+ VEXT dSrce, dSrcD0, dSrcD1, #4
+ VADDL qRes67, dSrcD0, dSrcf ;// Acc=a+f
+
+ VSUB dRes4, dRes4, dTemp0 ;// Acc -= 5*(b+e)
+
+ VADDL qTemp01, dSrc3c, dSrc3d ;// c+d
+ VADDL qTemp23, dSrcb, dSrce ;// b+e
+ VMLA dRes6, dTemp0, dCoeff20 ;// Acc += 20*(c+d)
+ VMLS dRes6, dTemp2, dCoeff5 ;// Acc -= 5*(b+e)
+
+ VQRSHRUN dAcc0, qRes01, #5 ;// Acc = Sat ((Acc + 16) / 32)
+ VQRSHRUN dAcc2, qRes23, #5 ;// Acc = Sat ((Acc + 16) / 32)
+ VQRSHRUN dAcc4, qRes45, #5 ;// Acc = Sat ((Acc + 16) / 32)
+ VQRSHRUN dAcc6, qRes67, #5 ;// Acc = Sat ((Acc + 16) / 32)
+
+ M_END
+
+ ENDIF
+
+
+ END
+
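For reference, the arithmetic implemented above is the H.264 6-tap half-pel
luma filter: each output sample is Sat((a + f + 20*(c+d) - 5*(b+e) + 16) >> 5),
where a..f are six consecutive source pixels of one row. The scalar C sketch
below shows the same 4x4 horizontal interpolation under two assumptions: pSrc
already points at the first tap (a) of each row, and results are written to
pDst, whereas the "_unsafe" NEON routine actually leaves its four 4-byte
results in d22/d24/d26/d28 for the caller to store. The helper names are
hypothetical and not part of the OpenMAX DL API.

    #include <stdint.h>

    /* Hypothetical scalar reference for the 4x4 half-pel horizontal
     * interpolation computed by the NEON routine above. */
    static uint8_t clip_u8(int v)
    {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void half_hor_4x4_ref(const uint8_t *pSrc, int srcStep,
                                 uint8_t *pDst, int dstStep)
    {
        for (int y = 0; y < 4; y++) {                /* rows A..D */
            for (int x = 0; x < 4; x++) {
                const uint8_t *s = pSrc + y * srcStep + x;
                /* Taps: a=s[0] b=s[1] c=s[2] d=s[3] e=s[4] f=s[5] */
                int acc = s[0] + s[5]                /* VADDL: a+f          */
                        + 20 * (s[2] + s[3])         /* VMLA:  +20*(c+d)    */
                        -  5 * (s[1] + s[4]);        /* VMUL/VSUB: -5*(b+e) */
                /* VQRSHRUN #5: saturating (acc + 16) >> 5 */
                pDst[y * dstStep + x] = clip_u8((acc + 16) >> 5);
            }
        }
    }

The VMUL/VSUB pairs noted in the comments stand in for a single VMLS; splitting
the multiply-subtract this way is a Cortex-A8 scheduling choice, since a
multiply-accumulate that immediately depends on the preceding VMLA would
otherwise stall the NEON pipeline.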