summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
diff options
context:
space:
mode:
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s')
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s366
1 files changed, 366 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
new file mode 100644
index 0000000..14b37fe
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
@@ -0,0 +1,366 @@
+;//
+;//
+;// File Name: armVCM4P10_DeblockingLuma_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 9641
+;// Date: Thursday, February 7, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS ARM1136JS
+
+
+
+ IF ARM1136JS
+
+MASK_1 EQU 0x01010101
+
+;// Declare input registers
+
+pQ0 RN 0
+StepArg RN 1
+tC0Arg RN 2
+alpha RN 6
+
+beta RN 14
+bS RN 14
+tC0 RN 14
+ptC0 RN 1
+
+;// Declare Local/Temporary variables
+
+;// Pixels
+p_0 RN 3
+p_1 RN 5
+p_2 RN 4
+p_3 RN 2
+q_0 RN 8
+q_1 RN 9
+q_2 RN 10
+q_3 RN 12
+
+
+;// Filtering
+
+ap0q0 RN 1
+filt RN 2
+
+m00 RN 7
+m01 RN 11
+
+apflg RN 0
+aqflg RN 6
+
+tC RN 1
+
+
+;//Declarations for bSLT4 kernel
+
+pos RN 7
+neg RN 12
+
+P0a RN 1
+P1a RN 8
+Q0a RN 7
+Q1a RN 4
+
+u1 RN 3
+max RN 12
+min RN 2
+
+
+
+;//Declarations for bSGE4 kernel
+
+q_3b RN 9
+p_3b RN 0
+apqflg RN 12
+
+P0b RN 6
+P1b RN 7
+P2b RN 1
+
+Q0b RN 9
+Q1b RN 0
+Q2b RN 2
+
+;// Miscellanous
+
+a RN 0
+t0 RN 3
+t1 RN 12
+t2 RN 7
+t3 RN 11
+t4 RN 4
+t5 RN 1
+t8 RN 6
+t9 RN 14
+t10 RN 5
+t11 RN 9
+
+;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
+;//
+;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
+;// - 2 - filt, 0 - apflg, 6 - aqflg
+;// - 11 - m01, 7 - tC0
+;//
+;// Outputs - 1,8,7,11 - Output Pixels(P0a,P1a,Q0a,Q1a)
+;//
+;// Registers Corrupted - 0-3,5-12,14
+
+
+ M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr
+
+ ;// Since beta <= 18 and alpha <= 255 we know
+ ;// -254 <= p0-q0 <= 254
+ ;// -17 <= q1-q0 <= 17
+ ;// -17 <= p1-p0 <= 17
+
+ ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
+ ;//
+ ;// Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
+ ;// = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
+ ;// = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3
+
+ USUB8 t1, p_1, p_0
+ MUL tC0, t2, m01
+
+ USUB8 t2, q_1, q_0
+ SSUB8 t1, t1, t2
+
+ USUB8 t2, p_0, q_0
+ AND t2, t2, m01
+ SHSUB8 t1, t1, t2
+ UHSUB8 t5, p_0, q_0
+ SSUB8 t1, t1, t2
+ SHSUB8 t1, t1, t5
+ MOV m00, #0
+ SADD8 t1, t1, m01
+ SHSUB8 t1, t1, t5
+
+ ;// tC = tC0
+ ;// if (ap < beta) tC++;
+ ;// if (aq < beta) tC++;
+ USUB8 t5, filt, m01
+ SEL tC0, tC0, m00
+ UQADD8 tC, tC0, apflg
+ SSUB8 t1, t1, m00
+ UQADD8 tC, tC, aqflg
+
+ ;// Split into positive and negative part and clip
+ SEL pos, t1, m00
+ USUB8 neg, pos, t1
+ USUB8 t3, pos, tC
+ SEL pos, tC, pos
+ USUB8 t3, neg, tC
+ SEL neg, tC, neg
+
+ ;//Reload m01
+ LDR m01,=MASK_1
+
+ UQADD8 P0a, p_0, pos
+ UQSUB8 Q0a, q_0, pos
+ UQSUB8 P0a, P0a, neg
+ UQADD8 Q0a, Q0a, neg
+
+ ;// Choose to store the filtered
+ ;// value or the original pixel
+ USUB8 t1, filt, m01
+ SEL P0a, P0a, p_0
+ SEL Q0a, Q0a, q_0
+
+ ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
+ ;// u1 = (p0 + q0 + 1)>>1
+ ;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
+ MVN p_0, p_0
+ UHSUB8 u1, q_0, p_0
+ UQADD8 max, p_1, tC0
+ EOR u1, u1, m01 ,LSL #7
+
+ ;// Calculate A = (p2+u1)>>1
+ ;// Then delta = Clip3( -tC0, tC0, A - p1)
+
+ ;// Clip P1
+ UHADD8 P1a, p_2, u1
+ UQSUB8 min, p_1, tC0
+ USUB8 t4, P1a, max
+ SEL P1a, max, P1a
+ USUB8 t4, P1a, min
+ SEL P1a, P1a, min
+
+ ;// Clip Q1
+ UHADD8 Q1a, q_2, u1
+ UQADD8 max, q_1, tC0
+ UQSUB8 min, q_1, tC0
+ USUB8 t0, Q1a, max
+ SEL Q1a, max, Q1a
+ USUB8 t0, Q1a, min
+ SEL Q1a, Q1a, min
+
+ ;// Choose to store the filtered
+ ;// value or the original pixel
+ USUB8 t0, apflg, m01
+ SEL P1a, P1a, p_1
+ USUB8 t0, aqflg, m01
+ SEL t3, Q1a, q_1
+
+ M_END
+
+;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
+;//
+;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
+;// - 2 - filt, 0 - apflg,aqflg
+;// - 1 - ap0q0, 6 - alpha
+;// - 7 - m00, 11 - m01
+;//
+;// Outputs - 6,7,1,9,0,2 - Output Pixels(P0b,P1b,P2b, Q0b,Q1b,Q2b)
+;//
+;// Registers Corrupted - 0-3,5-12,14
+
+ M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr
+
+ ;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
+ ;// apflg = aqflg && |p0-q0|<((alpha>>2)+2)
+
+ M_ARG pDummy,4
+ M_ARG pQ_3,4
+ M_ARG pP_3,4
+
+ UHADD8 alpha, alpha, m00
+ USUB8 t9, p_2, p_0 ;//t9 = dp2p0
+ UHADD8 alpha, alpha, m00
+ ADD alpha, alpha, m01, LSL #1
+ USUB8 ap0q0, ap0q0, alpha
+ SEL apqflg, m00, apflg
+
+ ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
+ ;// = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
+ ;// = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)
+
+ ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
+ ;// = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)
+
+ ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
+ ;// = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
+ ;// = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)
+
+ ;// Compute P0b
+ USUB8 t2, p_0, q_0
+ SSUB8 t5, t9, t2
+
+ USUB8 t8, q_1, q_0
+ SHADD8 t8, t5, t8
+
+ USUB8 t9, p_1, p_0
+ SADD8 t8, t8, t9
+ SHSUB8 t8, t8, t2
+ SHADD8 t5, t5, t9
+ SHADD8 t8, t8, m01
+ SHADD8 t9, t5, m01
+ SADD8 P0b, p_0, t8
+ ;// P0b ready
+
+ ;// Compute P1b
+ M_LDR p_3b, pP_3
+ SADD8 P1b, p_0, t9
+ ;// P1b ready
+
+ ;// Compute P2b
+ USUB8 t9, p_2, p_0
+ SADD8 t5, t5, t9
+ UHSUB8 t9, p_3b, p_0
+ EOR a, p_3b, p_0
+ AND a, a, m01
+ SHADD8 t5, t5, a
+ UHADD8 a, p_0, q_1
+ SADD8 t5, t5, m01
+ SHADD8 t5, t5, t9
+ MVN t9, p_1
+ SADD8 P2b, p_0, t5
+ ;// P2b ready
+
+ UHSUB8 a, a, t9
+ ORR t9, apqflg, m01
+ USUB8 t9, apqflg, t9
+
+ EOR a, a, m01, LSL #7
+ SEL P0b, P0b, a
+ SEL P1b, P1b, p_1
+ SEL P2b, P2b, p_2
+
+ USUB8 t4, filt, m01
+ SEL P0b, P0b, p_0
+
+
+ ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
+ ;// = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
+ ;// = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)
+
+ ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
+ ;// = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)
+
+ ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
+ ;// = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
+ ;// = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)
+
+
+ ;// Compute Q0b Q1b
+ USUB8 t4, q_2, q_0
+ USUB8 a, p_0, q_0
+ USUB8 t9, p_1, p_0
+ SADD8 t0, t4, a
+ SHADD8 t9, t0, t9
+ UHADD8 t10, q_0, p_1
+ SADD8 t9, t9, a
+ USUB8 a, q_1, q_0
+ SHADD8 t9, t9, a
+ SHADD8 t0, t0, a
+ SHADD8 t9, t9, m01
+ SHADD8 a, t0, m01
+ SADD8 t9, q_0, t9
+ ;// Q0b ready - t9
+
+ MOV t4, #0
+ UHADD8 apqflg, apqflg, t4
+
+ SADD8 Q1b, q_0, a
+ ;// Q1b ready
+
+ USUB8 t4, apqflg, m01
+ SEL Q1b, Q1b, q_1
+ MVN t11, q_1
+ UHSUB8 t10, t10, t11
+ M_LDR q_3b, pQ_3
+ EOR t10, t10, m01, LSL #7
+ SEL t9, t9, t10
+
+ ;// Compute Q2b
+ USUB8 t4, q_2, q_0
+ SADD8 t4, t0, t4
+ EOR t0, q_3b, q_0
+ AND t0, t0, m01
+ SHADD8 t4, t4, t0
+ UHSUB8 t10, q_3b, q_0
+ SADD8 t4, t4, m01
+ SHADD8 t4, t4, t10
+
+ USUB8 t10, filt, m01
+ SEL Q0b, t9, q_0
+
+ SADD8 t4, q_0, t4
+ ;// Q2b ready - t4
+
+ USUB8 t10, apqflg, m01
+ SEL Q2b, t4, q_2
+
+ M_END
+
+ ENDIF
+
+ END \ No newline at end of file