Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s')
-rwxr-xr-x | media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s | 313
1 file changed, 313 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
new file mode 100755
index 0000000..d1684cb
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
@@ -0,0 +1,313 @@
+;//
+;//
+;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+
+        M_VARIANTS CortexA8
+
+        IF CortexA8
+
+        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11
+
+;// Declare input registers
+pSrc            RN 0
+srcStep         RN 1
+pDst            RN 2
+dstStep         RN 3
+
+;// Declare Neon registers
+dCoeff5         DN 30.S16
+dCoeff20        DN 31.S16
+qCoeff5         QN 14.S32
+qCoeff20        QN 15.S32
+
+qSrc01          QN 0.U8
+dSrc0           DN 0.U8
+dSrc1           DN 1.U8
+
+dSrcb           DN 4.U8
+dSrcc           DN 2.U8
+dSrcd           DN 3.U8
+dSrce           DN 5.U8
+dSrcf           DN 1.U8
+
+qSrcb           QN 2.S16
+qSrcc           QN 1.S16
+dSrcB           DN 4.S16
+dSrcC           DN 2.S16
+
+qRes0           QN 5.S16
+qRes1           QN 6.S16
+qRes2           QN 7.S16
+qRes3           QN 8.S16
+qRes4           QN 9.S16
+qRes5           QN 10.S16
+qRes6           QN 11.S16
+qRes7           QN 12.S16
+qRes8           QN 13.S16
+
+dRes0           DN 10.S16
+dRes1           DN 12.S16
+dRes2           DN 14.S16
+dRes3           DN 16.S16
+dRes4           DN 18.S16
+dRes5           DN 20.S16
+dRes6           DN 22.S16
+dRes7           DN 24.S16
+dRes8           DN 26.S16
+
+qAcc01          QN 5.S32
+qAcc23          QN 6.S32
+qAcc45          QN 2.S32
+qAcc67          QN 3.S32
+qSumBE          QN 0.S32
+qSumCD          QN 1.S32
+
+dTempAcc0       DN 0.U16
+dTempAcc1       DN 2.U16
+dTempAcc2       DN 4.U16
+dTempAcc3       DN 6.U16
+
+qTAcc0          QN 0.U16
+qTAcc1          QN 1.U16
+qTAcc2          QN 2.U16
+qTAcc3          QN 3.U16
+
+dAcc0           DN 0.U8
+dAcc1           DN 2.U8
+dAcc2           DN 4.U8
+dAcc3           DN 6.U8
+
+dTmp0           DN 8.S16
+dTmp1           DN 9.S16
+qTmp0           QN 4.S32
+
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+        VMOV        dCoeff20, #20
+        VMOV        dCoeff5, #5
+
+        ;// Row0
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
+        VADDL       qRes0, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+        VMLA        dRes0, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row1
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
+        VADDL       qRes1, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes0, dRes0, dTmp0         ;// TeRi
+
+        VMLA        dRes1, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes1, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row2
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
+        VADDL       qRes2, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes1, dRes1, dTmp0
+
+        VMLA        dRes2, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes2, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row3
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
+        VADDL       qRes3, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes2, dRes2, dTmp0
+
+        VMLA        dRes3, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes3, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row4
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
+        VADDL       qRes4, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes3, dRes3, dTmp0
+
+        VMLA        dRes4, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes4, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row5
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
+        VADDL       qRes5, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes4, dRes4, dTmp0
+
+        VMLA        dRes5, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes5, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row6
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
+        VADDL       qRes6, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes5, dRes5, dTmp0
+
+        VMLA        dRes6, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes6, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row7
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
+        VADDL       qRes7, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes6, dRes6, dTmp0
+
+        VMLA        dRes7, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes7, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row8
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
+        VADDL       qRes8, dSrc0, dSrcf         ;// Acc=a+f
+
+        VSUB        dRes7, dRes7, dTmp0
+
+        VMLA        dRes8, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes8, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        VMOV        qCoeff20, #20
+        VMOV        qCoeff5, #5
+
+        ;// Col0
+        VADDL       qAcc01, dRes0, dRes5        ;// Acc = a+f
+        VADDL       qSumCD, dRes2, dRes3        ;// c+d
+        VADDL       qSumBE, dRes1, dRes4        ;// b+e
+
+        VSUB        dRes8, dRes8, dTmp0
+
+        VMLA        qAcc01, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
+;        VMLS        qAcc01, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
+        VMUL        qTmp0, qSumBE, qCoeff5      ;// Acc -= 5*(b+e)
+
+        ;// Col1
+        VADDL       qAcc23, dRes1, dRes6        ;// Acc = a+f
+        VADDL       qSumCD, dRes3, dRes4        ;// c+d
+        VADDL       qSumBE, dRes2, dRes5        ;// b+e
+        VMLA        qAcc23, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
+
+        VSUB        qAcc01, qAcc01, qTmp0
+
+;        VMLS        qAcc23, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
+        VMUL        qTmp0, qSumBE, qCoeff5      ;// Acc -= 5*(b+e)
+
+        ;// Col2
+        VADDL       qAcc45, dRes2, dRes7        ;// Acc = a+f
+        VADDL       qSumCD, dRes4, dRes5        ;// c+d
+        VADDL       qSumBE, dRes3, dRes6        ;// b+e
+        VMLA        qAcc45, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
+
+        VSUB        qAcc23, qAcc23, qTmp0
+
+;        VMLS        qAcc45, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
+        VMUL        qTmp0, qSumBE, qCoeff5      ;// Acc -= 5*(b+e)
+
+        ;// Col3
+        VADDL       qAcc67, dRes3, dRes8        ;// Acc = a+f
+        VADDL       qSumCD, dRes5, dRes6        ;// c+d
+        VADDL       qSumBE, dRes4, dRes7        ;// b+e
+        VMLA        qAcc67, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
+
+        VSUB        qAcc45, qAcc45, qTmp0
+
+        VMLS        qAcc67, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
+
+        VQRSHRUN    dTempAcc0, qAcc01, #10
+        VQRSHRUN    dTempAcc1, qAcc23, #10
+        VQRSHRUN    dTempAcc2, qAcc45, #10
+        VQRSHRUN    dTempAcc3, qAcc67, #10
+
+        VQMOVN      dAcc0, qTAcc0
+        VQMOVN      dAcc1, qTAcc1
+        VQMOVN      dAcc2, qTAcc2
+        VQMOVN      dAcc3, qTAcc3
+
+        M_END
+
+        ENDIF
+
+
+
+        END
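For readers tracing the assembly above: the routine computes the H.264 half-pel "diagonal" (2,2) luma samples for one 4x4 block. It runs the standard 6-tap filter (1, -5, 20, 20, -5, 1) horizontally over nine source rows (Row0-Row8), keeping the 16-bit intermediate sums, then applies the same filter vertically across those sums (Col0-Col3), and finally rounds and saturates with VQRSHRUN/VQMOVN, i.e. clip((acc + 512) >> 10) to [0,255]. The VMUL/VSUB pairs perform the same subtraction as the commented-out VMLS instructions, split into two steps. Below is a minimal scalar C model of that computation, for reference only: the function name and layout are illustrative, not the OpenMAX DL API, and pSrc is assumed to already point at the first filter tap (two pixels above and to the left of the block), as the "unsafe" calling convention implies.

    #include <stdint.h>

    /* Clamp a 32-bit intermediate to the 8-bit pixel range. */
    static uint8_t clip255(int32_t v)
    {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Illustrative scalar model (hypothetical name) of the NEON routine:
       the 6-tap (1,-5,20,20,-5,1) filter applied horizontally over 9 rows,
       then vertically, producing a 4x4 output block. */
    void half_diag_4x4_ref(const uint8_t *pSrc, int srcStep,
                           uint8_t *pDst, int dstStep)
    {
        int16_t mid[9][4];  /* horizontal results, rows 0..8; fits in S16 */

        for (int y = 0; y < 9; y++) {
            const uint8_t *r = pSrc + y * srcStep;
            for (int x = 0; x < 4; x++) {
                /* a+f + 20*(c+d) - 5*(b+e), as in each Row block above */
                mid[y][x] = (int16_t)(r[x] + r[x + 5]
                                      + 20 * (r[x + 2] + r[x + 3])
                                      -  5 * (r[x + 1] + r[x + 4]));
            }
        }

        for (int y = 0; y < 4; y++) {
            for (int x = 0; x < 4; x++) {
                /* Same filter down the intermediate columns (Col0-Col3). */
                int32_t acc = mid[y][x] + mid[y + 5][x]
                              + 20 * (mid[y + 2][x] + mid[y + 3][x])
                              -  5 * (mid[y + 1][x] + mid[y + 4][x]);
                /* VQRSHRUN #10 + VQMOVN: round, shift, saturate to u8. */
                pDst[y * dstStep + x] = clip255((acc + 512) >> 10);
            }
        }
    }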