Diffstat:
 media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s | 313 +++++++++++++++++++
 1 file changed, 313 insertions(+), 0 deletions(-)
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
new file mode 100755
index 0000000..d1684cb
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
@@ -0,0 +1,313 @@
+;//
+;//
+;// File Name: armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+
+ M_VARIANTS CortexA8
+
+ IF CortexA8
+
+ M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11
+
+;// Declare input registers
+pSrc RN 0
+srcStep RN 1
+pDst RN 2
+dstStep RN 3
+
+;// Declare Neon registers
+dCoeff5 DN 30.S16
+dCoeff20 DN 31.S16
+qCoeff5 QN 14.S32
+qCoeff20 QN 15.S32
+
+qSrc01 QN 0.U8
+dSrc0 DN 0.U8
+dSrc1 DN 1.U8
+
+dSrcb DN 4.U8
+dSrcc DN 2.U8
+dSrcd DN 3.U8
+dSrce DN 5.U8
+dSrcf DN 1.U8
+
+qSrcb QN 2.S16
+qSrcc QN 1.S16
+dSrcB DN 4.S16
+dSrcC DN 2.S16
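+;// (dSrcB/dSrcC alias the low halves of qSrcb/qSrcc, so the four
+;// lanes of each widening add can be read back as D registers)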
+
+qRes0 QN 5.S16
+qRes1 QN 6.S16
+qRes2 QN 7.S16
+qRes3 QN 8.S16
+qRes4 QN 9.S16
+qRes5 QN 10.S16
+qRes6 QN 11.S16
+qRes7 QN 12.S16
+qRes8 QN 13.S16
+
+dRes0 DN 10.S16
+dRes1 DN 12.S16
+dRes2 DN 14.S16
+dRes3 DN 16.S16
+dRes4 DN 18.S16
+dRes5 DN 20.S16
+dRes6 DN 22.S16
+dRes7 DN 24.S16
+dRes8 DN 26.S16
+
+qAcc01 QN 5.S32
+qAcc23 QN 6.S32
+qAcc45 QN 2.S32
+qAcc67 QN 3.S32
+qSumBE QN 0.S32
+qSumCD QN 1.S32
+
+dTempAcc0 DN 0.U16
+dTempAcc1 DN 2.U16
+dTempAcc2 DN 4.U16
+dTempAcc3 DN 6.U16
+
+qTAcc0 QN 0.U16
+qTAcc1 QN 1.U16
+qTAcc2 QN 2.U16
+qTAcc3 QN 3.U16
+
+dAcc0 DN 0.U8
+dAcc1 DN 2.U8
+dAcc2 DN 4.U8
+dAcc3 DN 6.U8
+
+dTmp0 DN 8.S16
+dTmp1 DN 9.S16
+qTmp0 QN 4.S32
+
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+ VMOV dCoeff20, #20
+ VMOV dCoeff5, #5
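+
+ ;// Horizontal pass: each Row section extracts the filter taps b..f
+ ;// by byte-rotating the loaded row (VEXT), then evaluates
+ ;// a + f - 5*(b+e) + 20*(c+d) into a 16-bit dRes register; the next
+ ;// row's VLD1 is interleaved with the arithmetic.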
+
+ ;// Row0
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes0, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+ VMLA dRes0, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// dTmp0 = 5*(b+e), subtracted later
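+
+ ;// Note: the commented-out VMLS is emulated as VMUL plus a deferred
+ ;// VSUB issued during the next row. This looks like a Cortex-A8 NEON
+ ;// scheduling workaround: splitting the multiply-subtract breaks the
+ ;// back-to-back dependency on the multiply pipeline. The same pattern
+ ;// repeats for every row and column below.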
+
+ ;// Row1
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes1, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes0, dRes0, dTmp0 ;// Acc -= 5*(b+e) (completes Row0)
+
+ VMLA dRes1, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes1, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// dTmp0 = 5*(b+e), subtracted later
+
+ ;// Row2
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes2, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes1, dRes1, dTmp0 ;// Acc -= 5*(b+e) (completes Row1)
+
+ VMLA dRes2, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes2, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// dTmp0 = 5*(b+e), subtracted later
+
+ ;// Row3
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes3, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes2, dRes2, dTmp0 ;// Acc -= 5*(b+e) (completes Row2)
+
+ VMLA dRes3, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes3, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// dTmp0 = 5*(b+e), subtracted later
+
+ ;// Row4
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes4, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes3, dRes3, dTmp0 ;// Acc -= 5*(b+e) (completes Row3)
+
+ VMLA dRes4, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes4, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// dTmp0 = 5*(b+e), subtracted later
+
+ ;// Row5
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes5, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes4, dRes4, dTmp0 ;// Acc -= 5*(b+e) (completes Row4)
+
+ VMLA dRes5, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes5, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// dTmp0 = 5*(b+e), subtracted later
+
+ ;// Row6
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes6, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes5, dRes5, dTmp0 ;// Acc -= 5*(b+e) (completes Row5)
+
+ VMLA dRes6, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes6, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// dTmp0 = 5*(b+e), subtracted later
+
+ ;// Row7
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes7, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes6, dRes6, dTmp0 ;// Acc -= 5*(b+e) (completes Row6)
+
+ VMLA dRes7, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes7, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// dTmp0 = 5*(b+e), subtracted later
+
+ ;// Row8
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes8, dSrc0, dSrcf ;// Acc=a+f
+
+ VSUB dRes7, dRes7, dTmp0 ;// Acc -= 5*(b+e) (completes Row7)
+
+ VMLA dRes8, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes8, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// dTmp0 = 5*(b+e), subtracted later
+
+ VMOV qCoeff20, #20
+ VMOV qCoeff5, #5
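+
+ ;// Vertical pass: the same 6-tap filter is applied down the nine
+ ;// intermediate rows dRes0-dRes8, widening to 32 bits; each "Col"
+ ;// section below produces one row of the 4x4 output.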
+
+ ;// Col0
+ VADDL qAcc01, dRes0, dRes5 ;// Acc = a+f
+ VADDL qSumCD, dRes2, dRes3 ;// c+d
+ VADDL qSumBE, dRes1, dRes4 ;// b+e
+
+ VSUB dRes8, dRes8, dTmp0 ;// Acc -= 5*(b+e) (completes Row8)
+
+ VMLA qAcc01, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
+; VMLS qAcc01, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
+ VMUL qTmp0, qSumBE, qCoeff5 ;// qTmp0 = 5*(b+e), subtracted later
+
+ ;// Col1
+ VADDL qAcc23, dRes1, dRes6 ;// Acc = a+f
+ VADDL qSumCD, dRes3, dRes4 ;// c+d
+ VADDL qSumBE, dRes2, dRes5 ;// b+e
+ VMLA qAcc23, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
+
+ VSUB qAcc01, qAcc01, qTmp0 ;// Acc -= 5*(b+e) (completes Col0)
+
+; VMLS qAcc23, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
+ VMUL qTmp0, qSumBE, qCoeff5 ;// qTmp0 = 5*(b+e), subtracted later
+
+ ;// Col2
+ VADDL qAcc45, dRes2, dRes7 ;// Acc = a+f
+ VADDL qSumCD, dRes4, dRes5 ;// c+d
+ VADDL qSumBE, dRes3, dRes6 ;// b+e
+ VMLA qAcc45, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
+
+ VSUB qAcc23, qAcc23, qTmp0 ;// Acc -= 5*(b+e) (completes Col1)
+
+; VMLS qAcc45, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
+ VMUL qTmp0, qSumBE, qCoeff5 ;// qTmp0 = 5*(b+e), subtracted later
+
+ ;// Col3
+ VADDL qAcc67, dRes3, dRes8 ;// Acc = a+f
+ VADDL qSumCD, dRes5, dRes6 ;// c+d
+ VADDL qSumBE, dRes4, dRes7 ;// b+e
+ VMLA qAcc67, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
+
+ VSUB qAcc45, qAcc45, qTmp0 ;// Acc -= 5*(b+e) (completes Col2)
+
+ VMLS qAcc67, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
+
+ VQRSHRUN dTempAcc0, qAcc01, #10
+ VQRSHRUN dTempAcc1, qAcc23, #10
+ VQRSHRUN dTempAcc2, qAcc45, #10
+ VQRSHRUN dTempAcc3, qAcc67, #10
+
+ VQMOVN dAcc0, qTAcc0
+ VQMOVN dAcc1, qTAcc1
+ VQMOVN dAcc2, qTAcc2
+ VQMOVN dAcc3, qTAcc3
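+
+ ;// VQRSHRUN rounds, shifts right by #10 and saturates the 32-bit
+ ;// sums to U16; VQMOVN then narrows to U8. Note there is no store:
+ ;// this "unsafe" helper appears to leave the four result rows in
+ ;// dAcc0-dAcc3 for the caller to write out.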
+
+ M_END
+
+ ENDIF
+
+
+
+ END
+