diff options
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s')
-rw-r--r-- | media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s | 366 |
1 files changed, 366 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
new file mode 100644
index 0000000..14b37fe
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
@@ -0,0 +1,366 @@
+;//
+;//
+;// File Name: armVCM4P10_DeblockingLuma_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 9641
+;// Date: Thursday, February 7, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;// Description: two luma deblocking kernels written with the ARMv6 byte-SIMD
+;// instructions (USUB8, SEL, ...); every register operand carries four 8-bit lanes.
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h                     ;// supplies the M_* macros used below (not shown here)
+
+        M_VARIANTS ARM1136JS
+
+
+
+        IF ARM1136JS
+
+MASK_1  EQU 0x01010101                          ;// 0x01 in each of the four byte lanes
+
+;// Declare input registers (pQ0, StepArg, tC0Arg, beta, bS, ptC0 are not referenced below)
+
+pQ0     RN 0
+StepArg RN 1
+tC0Arg  RN 2
+alpha   RN 6
+
+beta    RN 14                                   ;// beta, bS and tC0 all name r14 (lr)
+bS      RN 14
+tC0     RN 14
+ptC0    RN 1
+
+;// Declare Local/Temporary variables
+
+;// Pixels (p_3 and q_3 are declared but unused; the code uses p_3b/q_3b instead)
+p_0     RN 3
+p_1     RN 5
+p_2     RN 4
+p_3     RN 2
+q_0     RN 8
+q_1     RN 9
+q_2     RN 10
+q_3     RN 12
+
+
+;// Filtering
+
+ap0q0   RN 1
+filt    RN 2
+
+m00     RN 7                                    ;// zero mask (set with MOV #0 in bSLT4, an input in bSGE4)
+m01     RN 11                                   ;// MASK_1 (reloaded from the literal in bSLT4)
+
+apflg   RN 0
+aqflg   RN 6
+
+tC      RN 1
+
+
+;//Declarations for bSLT4 kernel
+
+pos     RN 7                                    ;// shares r7 with m00 / Q0a / t2
+neg     RN 12                                   ;// shares r12 with t1 / max
+
+P0a     RN 1
+P1a     RN 8
+Q0a     RN 7
+Q1a     RN 4
+
+u1      RN 3                                    ;// shares r3 with p_0 / t0
+max     RN 12
+min     RN 2                                    ;// shares r2 with filt
+
+
+
+;//Declarations for bSGE4 kernel
+
+q_3b    RN 9
+p_3b    RN 0
+apqflg  RN 12
+
+P0b     RN 6
+P1b     RN 7
+P2b     RN 1
+
+Q0b     RN 9
+Q1b     RN 0
+Q2b     RN 2
+
+;// Miscellaneous (scratch aliases; every one overlaps a named register above)
+
+a       RN 0
+t0      RN 3
+t1      RN 12
+t2      RN 7
+t3      RN 11
+t4      RN 4
+t5      RN 1
+t8      RN 6
+t9      RN 14
+t10     RN 5
+t11     RN 9
+
+;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
+;// Produces clipped P0a,P1a,Q0a,Q1a; a lane keeps its original pixel where the gating byte (filt/apflg/aqflg) < m01
+;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
+;// - 2 - filt, 0 - apflg, 6 - aqflg
+;// - 11 - m01, 7 - tC0
+;//
+;// Outputs - 1,8,7,11 - Output Pixels(P0a,P1a,Q0a,Q1a)
+;//
+;// Registers Corrupted - 0-3,5-12,14
+;// NOTE(review): r4 is also written below (t4 and Q1a alias r4) although it is not listed - confirm callers
+
+        M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr      ;// macro from armCOMM_s.h; presumably preserves lr (r14 is reused as tC0) - confirm
+
+        ;// Since beta <= 18 and alpha <= 255 we know
+        ;// -254 <= p0-q0 <= 254
+        ;// -17 <= q1-q0 <= 17
+        ;// -17 <= p1-p0 <= 17
+
+        ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
+        ;//
+        ;// Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
+        ;// = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
+        ;// = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3
+
+        USUB8   t1, p_1, p_0                    ;// t1 = p1-p0 per byte
+        MUL     tC0, t2, m01                    ;// tC0 = r7 * m01 (t2 aliases r7, which carries tC0 on entry)
+
+        USUB8   t2, q_1, q_0                    ;// t2 = q1-q0
+        SSUB8   t1, t1, t2                      ;// t1 = (p1-p0) - (q1-q0)
+
+        USUB8   t2, p_0, q_0                    ;// t2 = p0-q0 (mod 256)
+        AND     t2, t2, m01                     ;// keep only the bits of (p0-q0) selected by m01
+        SHSUB8  t1, t1, t2                      ;// t1 = (t1 - t2)>>1, signed halving
+        UHSUB8  t5, p_0, q_0                    ;// t5 = (p0-q0)>>1, unsigned halving
+        SSUB8   t1, t1, t2                      ;// t1 = t1 - t2
+        SHSUB8  t1, t1, t5                      ;// t1 = (t1 - t5)>>1
+        MOV     m00, #0                         ;// t2 is dead: r7 becomes the zero mask m00
+        SADD8   t1, t1, m01                     ;// t1 = t1 + m01
+        SHSUB8  t1, t1, t5                      ;// t1 = (t1 - t5)>>1, last step of the A computation
+
+        ;// tC = tC0
+        ;// if (ap < beta) tC++;
+        ;// if (aq < beta) tC++;
+        USUB8   t5, filt, m01                   ;// GE[i] = (filt >= m01); t5 is only scratch here
+        SEL     tC0, tC0, m00                   ;// tC0 = GE ? tC0 : 0
+        UQADD8  tC, tC0, apflg                  ;// tC = sat(tC0 + apflg)
+        SSUB8   t1, t1, m00                     ;// subtracts 0: t1 unchanged, GE[i] = (t1 >= 0) signed
+        UQADD8  tC, tC, aqflg                   ;// tC = sat(tC + aqflg); saturating ops leave GE alone
+
+        ;// Split into positive and negative part and clip
+        SEL     pos, t1, m00                    ;// pos = (t1 >= 0) ? t1 : 0 (pos and m00 are both r7)
+        USUB8   neg, pos, t1                    ;// neg = pos - t1 = (t1 < 0) ? -t1 : 0 (overwrites t1)
+        USUB8   t3, pos, tC                     ;// GE[i] = (pos >= tC); result lands in r11 and destroys m01
+        SEL     pos, tC, pos                    ;// pos = min(pos, tC)
+        USUB8   t3, neg, tC                     ;// GE[i] = (neg >= tC)
+        SEL     neg, tC, neg                    ;// neg = min(neg, tC)
+
+        ;//Reload m01 (r11 was used as scratch t3 just above)
+        LDR     m01,=MASK_1
+
+        UQADD8  P0a, p_0, pos                   ;// P0a = sat(p0 + pos); P0a reuses r1 (tC is dead)
+        UQSUB8  Q0a, q_0, pos                   ;// Q0a = sat(q0 - pos); Q0a reuses r7 (pos consumed here)
+        UQSUB8  P0a, P0a, neg                   ;// P0a = sat(P0a - neg)
+        UQADD8  Q0a, Q0a, neg                   ;// Q0a = sat(Q0a + neg)
+
+        ;// Choose to store the filtered
+        ;// value or the original pixel
+        USUB8   t1, filt, m01                   ;// GE[i] = (filt >= m01)
+        SEL     P0a, P0a, p_0                   ;// P0a = GE ? P0a : p0
+        SEL     Q0a, Q0a, q_0                   ;// Q0a = GE ? Q0a : q0
+
+        ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
+        ;// u1 = (p0 + q0 + 1)>>1
+        ;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
+        MVN     p_0, p_0                        ;// p_0 = ~p0 (p_0' in the comment above)
+        UHSUB8  u1, q_0, p_0                    ;// u1 = (q0 - ~p0)>>1; u1 aliases r3, replacing p_0
+        UQADD8  max, p_1, tC0                   ;// max = sat(p1 + tC0), upper bound for P1
+        EOR     u1, u1, m01 ,LSL #7             ;// u1 ^= MASK_1<<7 (0x80 in every byte)
+
+        ;// Calculate A = (p2+u1)>>1
+        ;// Then delta = Clip3( -tC0, tC0, A - p1)
+
+        ;// Clip P1
+        UHADD8  P1a, p_2, u1                    ;// P1a = (p2 + u1)>>1; P1a aliases r8, so q_0 is overwritten
+        UQSUB8  min, p_1, tC0                   ;// min = sat(p1 - tC0); min aliases r2, so filt is overwritten
+        USUB8   t4, P1a, max                    ;// GE[i] = (P1a >= max); t4 aliases r4, so p_2 is overwritten
+        SEL     P1a, max, P1a                   ;// P1a = min(P1a, max)
+        USUB8   t4, P1a, min                    ;// GE[i] = (P1a >= min)
+        SEL     P1a, P1a, min                   ;// P1a = max(P1a, min)
+
+        ;// Clip Q1
+        UHADD8  Q1a, q_2, u1                    ;// Q1a = (q2 + u1)>>1 (r4)
+        UQADD8  max, q_1, tC0                   ;// max = sat(q1 + tC0)
+        UQSUB8  min, q_1, tC0                   ;// min = sat(q1 - tC0)
+        USUB8   t0, Q1a, max                    ;// GE[i] = (Q1a >= max); t0 aliases r3 (u1 is dead)
+        SEL     Q1a, max, Q1a                   ;// Q1a = min(Q1a, max)
+        USUB8   t0, Q1a, min                    ;// GE[i] = (Q1a >= min)
+        SEL     Q1a, Q1a, min                   ;// Q1a = max(Q1a, min)
+
+        ;// Choose to store the filtered
+        ;// value or the original pixel
+        USUB8   t0, apflg, m01                  ;// GE[i] = (apflg >= m01)
+        SEL     P1a, P1a, p_1                   ;// P1a = GE ? P1a : p1
+        USUB8   t0, aqflg, m01                  ;// GE[i] = (aqflg >= m01)
+        SEL     t3, Q1a, q_1                    ;// r11 = GE ? Q1a : q1 (r11 is the Q1a output listed in the header)
+
+        M_END
+
+;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
+;// Produces P0b-P2b and Q0b-Q2b following the formulas quoted in the body
+;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
+;// - 2 - filt, 0 - apflg,aqflg
+;// - 1 - ap0q0, 6 - alpha
+;// - 7 - m00, 11 - m01
+;//
+;// Outputs - 6,7,1,9,0,2 - Output Pixels(P0b,P1b,P2b, Q0b,Q1b,Q2b)
+;//
+;// Registers Corrupted - 0-3,5-12,14
+;// NOTE(review): r4 is also written below (t4 aliases r4) although it is not listed - confirm callers
+        M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr      ;// macro from armCOMM_s.h; presumably preserves lr (r14 is reused as t9) - confirm
+
+        ;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
+        ;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2)
+
+        M_ARG   pDummy,4                        ;// M_ARG is a macro from armCOMM_s.h; presumably declares 4-byte stack arguments - confirm
+        M_ARG   pQ_3,4                          ;// read below with M_LDR into q_3b
+        M_ARG   pP_3,4                          ;// read below with M_LDR into p_3b
+
+        UHADD8  alpha, alpha, m00               ;// alpha = (alpha + m00)>>1, i.e. alpha>>1 assuming m00 = 0 on entry
+        USUB8   t9, p_2, p_0                    ;// t9 = dp2p0 = p2-p0
+        UHADD8  alpha, alpha, m00               ;// halve again: alpha>>2 under the same assumption
+        ADD     alpha, alpha, m01, LSL #1       ;// alpha += m01<<1 (adds 2 per byte when m01 = MASK_1)
+        USUB8   ap0q0, ap0q0, alpha             ;// GE[i] = (ap0q0 >= alpha)
+        SEL     apqflg, m00, apflg              ;// apqflg = GE ? m00 : r0 (r0 carries apflg,aqflg per the header)
+
+        ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
+        ;// = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
+        ;// = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)
+
+        ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
+        ;// = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)
+
+        ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
+        ;// = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
+        ;// = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)
+
+        ;// Compute P0b
+        USUB8   t2, p_0, q_0                    ;// t2 = p0-q0; t2 aliases r7, so m00 is gone from here on
+        SSUB8   t5, t9, t2                      ;// t5 = (p2-p0) - (p0-q0)
+
+        USUB8   t8, q_1, q_0                    ;// t8 = q1-q0; t8 aliases r6 (alpha is dead)
+        SHADD8  t8, t5, t8                      ;// t8 = (t5 + t8)>>1, signed halving
+
+        USUB8   t9, p_1, p_0                    ;// t9 = p1-p0
+        SADD8   t8, t8, t9                      ;// t8 = t8 + (p1-p0)
+        SHSUB8  t8, t8, t2                      ;// t8 = (t8 - (p0-q0))>>1
+        SHADD8  t5, t5, t9                      ;// t5 = (t5 + (p1-p0))>>1, shared by P1b and P2b
+        SHADD8  t8, t8, m01                     ;// t8 = (t8 + m01)>>1
+        SHADD8  t9, t5, m01                     ;// t9 = (t5 + m01)>>1, the offset added for P1b
+        SADD8   P0b, p_0, t8                    ;// P0b = p0 + t8 (P0b aliases r6 = t8)
+        ;// P0b ready
+
+        ;// Compute P1b
+        M_LDR   p_3b, pP_3                      ;// p_3b (r0) = argument pP_3; the flags in r0 were already folded into apqflg
+        SADD8   P1b, p_0, t9                    ;// P1b = p0 + t9 (r7)
+        ;// P1b ready
+
+        ;// Compute P2b
+        USUB8   t9, p_2, p_0                    ;// t9 = p2-p0
+        SADD8   t5, t5, t9                      ;// t5 = t5 + (p2-p0)
+        UHSUB8  t9, p_3b, p_0                   ;// t9 = (p3-p0)>>1, unsigned halving
+        EOR     a, p_3b, p_0                    ;// a = p3 ^ p0 (a aliases r0, replacing p_3b)
+        AND     a, a, m01                       ;// a = a & m01
+        SHADD8  t5, t5, a                       ;// t5 = (t5 + a)>>1
+        UHADD8  a, p_0, q_1                     ;// a = (p0 + q1)>>1, start of the alternative P0 value
+        SADD8   t5, t5, m01                     ;// t5 = t5 + m01
+        SHADD8  t5, t5, t9                      ;// t5 = (t5 + t9)>>1
+        MVN     t9, p_1                         ;// t9 = ~p1
+        SADD8   P2b, p_0, t5                    ;// P2b = p0 + t5 (P2b aliases r1 = t5)
+        ;// P2b ready
+
+        UHSUB8  a, a, t9                        ;// a = (a - ~p1)>>1
+        ORR     t9, apqflg, m01                 ;// t9 = apqflg | m01
+        USUB8   t9, apqflg, t9                  ;// GE[i] = (apqflg >= apqflg|m01): true only where the m01 bit is already set
+
+        EOR     a, a, m01, LSL #7               ;// a ^= m01<<7 (EOR does not touch GE)
+        SEL     P0b, P0b, a                     ;// P0b = GE ? P0b : a
+        SEL     P1b, P1b, p_1                   ;// P1b = GE ? P1b : p1
+        SEL     P2b, P2b, p_2                   ;// P2b = GE ? P2b : p2
+
+        USUB8   t4, filt, m01                   ;// GE[i] = (filt >= m01); t4 aliases r4, so p_2 is overwritten
+        SEL     P0b, P0b, p_0                   ;// P0b = GE ? P0b : p0
+
+
+        ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
+        ;// = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
+        ;// = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)
+
+        ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
+        ;// = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)
+
+        ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
+        ;// = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
+        ;// = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)
+
+
+        ;// Compute Q0b Q1b
+        USUB8   t4, q_2, q_0                    ;// t4 = q2-q0
+        USUB8   a, p_0, q_0                     ;// a = p0-q0
+        USUB8   t9, p_1, p_0                    ;// t9 = p1-p0
+        SADD8   t0, t4, a                       ;// t0 = (q2-q0) + (p0-q0); t0 aliases r3, so p_0 is overwritten
+        SHADD8  t9, t0, t9                      ;// t9 = (t0 + (p1-p0))>>1
+        UHADD8  t10, q_0, p_1                   ;// t10 = (q0 + p1)>>1; t10 aliases r5, so p_1 is overwritten
+        SADD8   t9, t9, a                       ;// t9 = t9 + (p0-q0)
+        USUB8   a, q_1, q_0                     ;// a = q1-q0
+        SHADD8  t9, t9, a                       ;// t9 = (t9 + (q1-q0))>>1
+        SHADD8  t0, t0, a                       ;// t0 = (t0 + (q1-q0))>>1, shared by Q1b and Q2b
+        SHADD8  t9, t9, m01                     ;// t9 = (t9 + m01)>>1
+        SHADD8  a, t0, m01                      ;// a = (t0 + m01)>>1, the offset added for Q1b
+        SADD8   t9, q_0, t9                     ;// t9 = q0 + t9
+        ;// Q0b ready - t9
+
+        MOV     t4, #0                          ;// zero operand for the halving add below
+        UHADD8  apqflg, apqflg, t4              ;// apqflg = apqflg>>1 per byte
+
+        SADD8   Q1b, q_0, a                     ;// Q1b = q0 + a (Q1b aliases r0 = a)
+        ;// Q1b ready
+
+        USUB8   t4, apqflg, m01                 ;// GE[i] = (shifted apqflg >= m01)
+        SEL     Q1b, Q1b, q_1                   ;// Q1b = GE ? Q1b : q1
+        MVN     t11, q_1                        ;// t11 = ~q1; t11 aliases r9, so q_1 is overwritten
+        UHSUB8  t10, t10, t11                   ;// t10 = (t10 - ~q1)>>1
+        M_LDR   q_3b, pQ_3                      ;// q_3b (r9) = argument pQ_3
+        EOR     t10, t10, m01, LSL #7           ;// t10 ^= m01<<7, giving the alternative Q0 value
+        SEL     t9, t9, t10                     ;// t9 = GE ? t9 : t10; relies on GE from the USUB8 six lines up (assumes M_LDR leaves GE intact)
+
+        ;// Compute Q2b
+        USUB8   t4, q_2, q_0                    ;// t4 = q2-q0
+        SADD8   t4, t0, t4                      ;// t4 = t0 + (q2-q0)
+        EOR     t0, q_3b, q_0                   ;// t0 = q3 ^ q0
+        AND     t0, t0, m01                     ;// t0 = t0 & m01
+        SHADD8  t4, t4, t0                      ;// t4 = (t4 + t0)>>1
+        UHSUB8  t10, q_3b, q_0                  ;// t10 = (q3-q0)>>1, unsigned halving
+        SADD8   t4, t4, m01                     ;// t4 = t4 + m01
+        SHADD8  t4, t4, t10                     ;// t4 = (t4 + t10)>>1
+
+        USUB8   t10, filt, m01                  ;// GE[i] = (filt >= m01)
+        SEL     Q0b, t9, q_0                    ;// Q0b (r9) = GE ? t9 : q0; q_3b is dead by now
+
+        SADD8   t4, q_0, t4                     ;// t4 = q0 + t4; SADD8 also rewrites GE, hence the new test below
+        ;// Q2b ready - t4
+
+        USUB8   t10, apqflg, m01                ;// GE[i] = (shifted apqflg >= m01)
+        SEL     Q2b, t4, q_2                    ;// Q2b (r2) = GE ? t4 : q2
+
+        M_END
+
+        ENDIF
+
+        END
\ No newline at end of file |