Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s')
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s  536
1 file changed, 536 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s
new file mode 100755
index 0000000..1c79b39
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s
@@ -0,0 +1,536 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;      http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorVerQuarter
+;--            function
+;--
+;-------------------------------------------------------------------------------
+
+
+    IF :DEF: H264DEC_WINASM
+        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+    ELSE
+        REQUIRE8
+        PRESERVE8
+    ENDIF
+
+    AREA |.text|, CODE
+
+;// h264bsdInterpolateHorVerQuarter register allocation
+
+ref         RN 0
+
+mb          RN 1
+buff        RN 1
+
+count       RN 2
+x0          RN 2
+
+y0          RN 3
+x_2_0       RN 3
+res         RN 3
+
+x_3_1       RN 4
+tmp1        RN 4
+
+height      RN 5
+x_6_4       RN 5
+tmp2        RN 5
+
+partW       RN 6
+x_7_5       RN 6
+tmp3        RN 6
+
+partH       RN 7
+tmp4        RN 7
+
+tmp5        RN 8
+
+tmp6        RN 9
+
+tmpa        RN 10
+
+mult_20_01  RN 11
+tmpb        RN 11
+
+mult_20_m5  RN 12
+width       RN 12
+
+plus16      RN 14
+
+
+;// function exports and imports
+
+    IMPORT h264bsdFillBlock
+
+    EXPORT h264bsdInterpolateHorVerQuarter
+
+;// Horizontal filter approach
+;//
+;// Basic idea in horizontal filtering is to adjust coefficients
+;// like below. Calculation is done with 16-bit maths.
+;//
+;// Reg      x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
+;//        [ 2   0 ] [ 3   1 ] [ 6   4 ] [ 7   5 ] [ 10  8 ] ...
+;// y_0 =   20   1    20  -5        -5         1
+;// y_1 =   -5         20   1     1  20        -5
+;// y_2 =    1         -5        -5  20     1  20
+;// y_3 =               1        20  -5    -5  20        1
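+;//
+;// In scalar form each y above is the standard H.264 six-tap filter
+;//
+;//   y[i] = clip(( x[i] - 5*x[i+1] + 20*x[i+2] + 20*x[i+3]
+;//                 - 5*x[i+4] + x[i+5] + 16 ) >> 5)
+;//
+;// The packed multipliers 0x00140001 = (20, 1) and 0x0014FFFB = (20, -5)
+;// used below apply two taps per multiply-accumulate.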
+
+
+h264bsdInterpolateHorVerQuarter
+    STMFD   sp!, {r0-r11, lr}
+    SUB     sp, sp, #0x1e4
+
+    CMP     x0, #0
+    BLT     do_fill                 ;// (x0 < 0)
+    LDR     partW, [sp,#0x220]      ;// partWidth
+    LDR     width, [sp,#0x218]      ;// width
+    ADD     tmpa, x0, partW         ;// (x0+partWidth)
+    ADD     tmpa, tmpa, #5          ;// (x0+partW+5)
+    CMP     tmpa, width
+    BHI     do_fill                 ;// (x0+partW+5) > width
+
+    CMP     y0, #0
+    BLT     do_fill                 ;// (y0 < 0)
+    LDR     partH, [sp,#0x224]      ;// partHeight
+    LDR     height, [sp,#0x21c]     ;// height
+    ADD     tmp5, y0, partH         ;// (y0+partHeight)
+    ADD     tmp5, tmp5, #5          ;// (y0+partH+5)
+    CMP     tmp5, height
+    BLS     skip_fill               ;// no overfill needed
+
+
+do_fill
+    LDR     partH, [sp,#0x224]      ;// partHeight
+    LDR     partW, [sp,#0x220]      ;// partWidth
+    LDR     height, [sp,#0x21c]     ;// height
+    ADD     tmp5, partH, #5         ;// tmp5 = partH + 5
+    ADD     tmpa, partW, #5         ;// tmpa = partW + 5
+    STMIB   sp, {height, tmpa}      ;// sp+4 = height, sp+8 = partWidth+5
+    LDR     width, [sp,#0x218]      ;// width
+    STR     tmp5, [sp,#0xc]         ;// sp+c = partHeight+5
+    STR     tmpa, [sp,#0x10]        ;// sp+10 = partWidth+5
+    STR     width, [sp,#0]          ;// sp+0 = width
+    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
+    BL      h264bsdFillBlock
+
+    MOV     x0, #0
+    STR     x0, [sp,#0x1ec]         ;// x0 = 0
+    STR     x0, [sp,#0x1f0]         ;// y0 = 0
+    ADD     ref, sp, #0x28          ;// ref = p1
+    STR     tmpa, [sp,#0x218]       ;// width = partWidth+5
+
+
+skip_fill
+    LDR     x0, [sp,#0x1ec]         ;// x0
+    LDR     y0, [sp,#0x1f0]         ;// y0
+    LDR     width, [sp,#0x218]      ;// width
+    LDR     tmp6, [sp,#0x228]       ;// horVerOffset
+    LDR     mb, [sp,#0x1e8]         ;// mb
+    MLA     tmp5, width, y0, x0     ;// y0*width+x0
+    ADD     ref, ref, tmp5          ;// ref += y0*width+x0
+    STR     ref, [sp,#0x1e4]        ;// store "ref" for vertical filtering
+    AND     tmp6, tmp6, #2          ;// calculate ref for horizontal filter
+    MOV     tmpa, #2
+    ADD     tmp6, tmpa, tmp6, LSR #1
+    MLA     ref, tmp6, width, ref
+    ADD     ref, ref, #8            ;// ref = ref+8
+
+    ;// pack values to count register
+    ;// [31:28] loop_x (partWidth-1)
+    ;// [27:24] loop_y (partHeight-1)
+    ;// [23:20] partWidth-1
+    ;// [19:16] partHeight-1
+    ;// [15:00] width
+    MOV     count, width
+    SUB     partW, partW, #1;
+    SUB     partH, partH, #1;
+    ADD     tmp5, partH, partW, LSL #4
+    ADD     count, count, tmp5, LSL #16
+
+
+    LDR     mult_20_01, = 0x00140001    ;// constant multipliers
+    LDR     mult_20_m5, = 0x0014FFFB    ;// constant multipliers
+    MOV     plus16, #16                 ;// constant for add
+    AND     tmp4, count, #0x000F0000    ;// partHeight-1
+    AND     tmp6, count, #0x00F00000    ;// partWidth-1
+    ADD     count, count, tmp4, LSL #8  ;// partH-1 to lower part of top byte
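+
+    ;// Note: the two top nibbles are the live x/y loop counters, while
+    ;// bits [23:16] keep the original partW-1/partH-1 so the counters
+    ;// can be reloaded each row; SUBS #4<<28 below borrows out of bit 31
+    ;// (Carry clear) exactly when a row of partWidth pixels is done.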
+
+;// HORIZONTAL PART
+
+loop_y_hor
+    LDR     x_3_1, [ref, #-8]
+    ADD     count, count, tmp6, LSL #8  ;// partW-1 to upper part of top byte
+    LDR     x_7_5, [ref, #-4]
+    UXTB16  x_2_0, x_3_1
+    UXTB16  x_3_1, x_3_1, ROR #8
+    UXTB16  x_6_4, x_7_5
+
+loop_x_hor
+    UXTB16  x_7_5, x_7_5, ROR #8
+
+    SMLAD   tmp4, x_2_0, mult_20_01, plus16
+    SMLATB  tmp6, x_2_0, mult_20_01, plus16
+    SMLATB  tmp5, x_2_0, mult_20_m5, plus16
+    SMLATB  tmpa, x_3_1, mult_20_01, plus16
+
+    SMLAD   tmp4, x_3_1, mult_20_m5, tmp4
+    SMLATB  tmp6, x_3_1, mult_20_m5, tmp6
+    SMLAD   tmp5, x_3_1, mult_20_01, tmp5
+    LDR     x_3_1, [ref], #4
+    SMLAD   tmpa, x_6_4, mult_20_m5, tmpa
+
+    SMLABB  tmp4, x_6_4, mult_20_m5, tmp4
+    SMLADX  tmp6, x_6_4, mult_20_m5, tmp6
+    SMLADX  tmp5, x_6_4, mult_20_01, tmp5
+    SMLADX  tmpa, x_7_5, mult_20_m5, tmpa
+
+    SMLABB  tmp4, x_7_5, mult_20_01, tmp4
+    UXTB16  x_2_0, x_3_1
+    SMLABB  tmp5, x_7_5, mult_20_m5, tmp5
+    SMLADX  tmp6, x_7_5, mult_20_01, tmp6
+    SMLABB  tmpa, x_2_0, mult_20_01, tmpa
+
+    MOV     tmp5, tmp5, ASR #5
+    MOV     tmp4, tmp4, ASR #5
+    PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
+    PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
+    USAT16  tmp5, #8, tmp5
+    USAT16  tmp4, #8, tmp4
+
+    SUBS    count, count, #4<<28
+    ORR     tmp4, tmp4, tmp5, LSL #8
+    STR     tmp4, [mb], #4
+    BCC     next_y_hor
+
+    UXTB16  x_3_1, x_3_1, ROR #8
+
+    SMLAD   tmp4, x_6_4, mult_20_01, plus16
+    SMLATB  tmp6, x_6_4, mult_20_01, plus16
+    SMLATB  tmp5, x_6_4, mult_20_m5, plus16
+    SMLATB  tmpa, x_7_5, mult_20_01, plus16
+
+    SMLAD   tmp4, x_7_5, mult_20_m5, tmp4
+    SMLATB  tmp6, x_7_5, mult_20_m5, tmp6
+    SMLAD   tmp5, x_7_5, mult_20_01, tmp5
+    LDR     x_7_5, [ref], #4
+    SMLAD   tmpa, x_2_0, mult_20_m5, tmpa
+
+    SMLABB  tmp4, x_2_0, mult_20_m5, tmp4
+    SMLADX  tmp6, x_2_0, mult_20_m5, tmp6
+    SMLADX  tmp5, x_2_0, mult_20_01, tmp5
+    SMLADX  tmpa, x_3_1, mult_20_m5, tmpa
+
+    SMLABB  tmp4, x_3_1, mult_20_01, tmp4
+    UXTB16  x_6_4, x_7_5
+    SMLABB  tmp5, x_3_1, mult_20_m5, tmp5
+    SMLADX  tmp6, x_3_1, mult_20_01, tmp6
+    SMLABB  tmpa, x_6_4, mult_20_01, tmpa
+
+    MOV     tmp5, tmp5, ASR #5
+    MOV     tmp4, tmp4, ASR #5
+    PKHBT   tmp5, tmp5, tmpa, LSL #(16-5)
+    PKHBT   tmp4, tmp4, tmp6, LSL #(16-5)
+    USAT16  tmp5, #8, tmp5
+    USAT16  tmp4, #8, tmp4
+
+    SUBS    count, count, #4<<28
+    ORR     tmp4, tmp4, tmp5, LSL #8
+    STR     tmp4, [mb], #4
+    BCS     loop_x_hor
+
+next_y_hor
+    AND     tmp6, count, #0x00F00000        ;// partWidth-1
+    SMLABB  ref, count, mult_20_01, ref     ;// +width
+    ADDS    mb, mb, #16                     ;// +16, Carry=0
+    SBC     mb, mb, tmp6, LSR #20           ;// -(partWidth-1)-1
+    SBC     ref, ref, tmp6, LSR #20         ;// -(partWidth-1)-1
+    ADDS    count, count, #(1<<28)-(1<<24)  ;// decrement loop_y counter
+    BGE     loop_y_hor
+
+
+
+;// VERTICAL PART
+;//
+;// Approach to vertical interpolation
+;//
+;// Interpolation is done by using 32-bit loads and stores
+;// and by using 16-bit arithmetic. A 4x4 block is processed
+;// in each round.
+;//
+;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
+;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
+;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
+;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
+;//  ..
+;//  ..
+;// |a_m1|a_m1|a_m1|a_m1|...
+;// |b_m1|b_m1|b_m1|b_m1|...
+;// |c_m1|c_m1|c_m1|c_m1|...
+;// |d_m1|d_m1|d_m1|d_m1|...
+
+;// Approach to bilinear interpolation to quarter-pel position:
+;// 4 bytes are processed in parallel.
+;//
+;// The algorithm is (a+b+1)/2. Rounding upwards (+1) can be achieved by
+;// negating the second operand to get one's complement (instead of 2's)
+;// and using subtraction; EOR is used to correct the sign.
+;//
+;// MVN     b, b
+;// UHSUB8  a, a, b
+;// EOR     a, a, 0x80808080
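+;//
+;// Worked out per byte: with b' = ~b = 255-b, UHSUB8 yields
+;// (a - b') >> 1 = (a + b - 255) >> 1, and the EOR with 0x80 adds 128
+;// (mod 256), giving (a + b + 1) >> 1, the upward-rounded average.
+;// In the vertical filter below, USAT16 #13 followed by the >>5 masking
+;// clamps each 16+20(G+M)+(A+T)-5(C+R) sum straight to the 0..255 range.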
+
+
+    LDR     ref, [sp, #0x1e4]           ;// ref
+    LDR     tmpa, [sp, #0x228]          ;// horVerOffset
+    LDR     mb, [sp, #0x1e8]            ;// mb
+    LDR     width, [sp, #0x218]         ;// width
+    ADD     ref, ref, #2                ;// calculate correct position
+    AND     tmpa, tmpa, #1
+    ADD     ref, ref, tmpa
+    LDR     plus16, = 0x00100010        ;// +16 to lower and upper halfwords
+    AND     count, count, #0x00FFFFFF   ;// clear the loop counters [31:24]
+
+    AND     tmpa, count, #0x000F0000    ;// partHeight-1
+    ADD     count, count, tmpa, LSL #8
+
+loop_y
+    ADD     count, count, tmp6, LSL #8  ;// partWidth-1
+
+loop_x
+    LDR     tmp1, [ref], width          ;// |a4|a3|a2|a1|
+    LDR     tmp2, [ref], width          ;// |c4|c3|c2|c1|
+    LDR     tmp3, [ref], width          ;// |g4|g3|g2|g1|
+    LDR     tmp4, [ref], width          ;// |m4|m3|m2|m1|
+    LDR     tmp5, [ref], width          ;// |r4|r3|r2|r1|
+    LDR     tmp6, [ref], width          ;// |t4|t3|t2|t1|
+
+    ;// first four pixels
+    UXTB16  tmpa, tmp3                  ;// |g3|g1|
+    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
+    UXTB16  tmpb, tmp2                  ;// |c3|c1|
+    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
+
+    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
+    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
+    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
+    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T
+
+    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
+    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
+
+    USAT16  tmpb, #13, tmpa             ;// saturate
+    LDR     res, = 0x00FF00FF
+    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
+    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
+    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
+
+    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
+    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
+    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
+    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
+    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
+    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T
+
+    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
+    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
+
+    USAT16  tmpb, #13, tmpa             ;// saturate
+    LDR     tmp1, [mb]
+    LDR     tmpa, = 0xFF00FF00
+    MVN     tmp1, tmp1
+    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
+    ORR     res, res, tmpa
+
+    LDR     tmpa, = 0x80808080
+    UHSUB8  res, res, tmp1              ;// bilinear interpolation
+    LDR     tmp1, [ref], width          ;// load next row
+    EOR     res, res, tmpa              ;// correct sign
+
+    STR     res, [mb], #16              ;// next row (mb)
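+
+    ;// The six row registers rotate one step per 4-pixel group below, so
+    ;// each group needs only the single new row load issued between the
+    ;// arithmetic instructions of the previous group.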
+
+    ;// tmp2 = |a4|a3|a2|a1|
+    ;// tmp3 = |c4|c3|c2|c1|
+    ;// tmp4 = |g4|g3|g2|g1|
+    ;// tmp5 = |m4|m3|m2|m1|
+    ;// tmp6 = |r4|r3|r2|r1|
+    ;// tmp1 = |t4|t3|t2|t1|
+
+    ;// second four pixels
+    UXTB16  tmpa, tmp4                  ;// |g3|g1|
+    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
+    UXTB16  tmpb, tmp3                  ;// |c3|c1|
+    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
+    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
+    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
+    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
+    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T
+
+    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
+    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
+
+    USAT16  tmpb, #13, tmpa             ;// saturate
+    LDR     res, = 0x00FF00FF
+    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
+    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
+    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
+
+    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
+    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
+    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
+    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
+    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
+    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T
+
+    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
+    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
+
+    USAT16  tmpb, #13, tmpa             ;// saturate
+    LDR     tmp2, [mb]
+    LDR     tmpa, = 0xFF00FF00
+    MVN     tmp2, tmp2
+
+    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
+    ORR     res, res, tmpa
+    LDR     tmpa, = 0x80808080
+    UHSUB8  res, res, tmp2              ;// bilinear interpolation
+    LDR     tmp2, [ref], width          ;// load next row
+    EOR     res, res, tmpa              ;// correct sign
+    STR     res, [mb], #16              ;// next row
+
+    ;// tmp3 = |a4|a3|a2|a1|
+    ;// tmp4 = |c4|c3|c2|c1|
+    ;// tmp5 = |g4|g3|g2|g1|
+    ;// tmp6 = |m4|m3|m2|m1|
+    ;// tmp1 = |r4|r3|r2|r1|
+    ;// tmp2 = |t4|t3|t2|t1|
+
+    ;// third four pixels
+    UXTB16  tmpa, tmp5                  ;// |g3|g1|
+    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
+    UXTB16  tmpb, tmp4                  ;// |c3|c1|
+    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
+    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
+    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
+    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
+    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T
+
+    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
+    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
+
+    USAT16  tmpb, #13, tmpa             ;// saturate
+    LDR     res, = 0x00FF00FF
+    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
+    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
+    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
+
+    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
+    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
+    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
+    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
+    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
+    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T
+
+    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
+    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
+
+    USAT16  tmpb, #13, tmpa             ;// saturate
+    LDR     tmp3, [mb]
+    LDR     tmpa, = 0xFF00FF00
+    MVN     tmp3, tmp3
+
+    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
+    ORR     res, res, tmpa
+    LDR     tmpa, = 0x80808080
+    UHSUB8  res, res, tmp3              ;// bilinear interpolation
+    LDR     tmp3, [ref]                 ;// load next row
+    EOR     res, res, tmpa              ;// correct sign
+    STR     res, [mb], #16              ;// next row
+
+    ;// tmp4 = |a4|a3|a2|a1|
+    ;// tmp5 = |c4|c3|c2|c1|
+    ;// tmp6 = |g4|g3|g2|g1|
+    ;// tmp1 = |m4|m3|m2|m1|
+    ;// tmp2 = |r4|r3|r2|r1|
+    ;// tmp3 = |t4|t3|t2|t1|
+
+    ;// fourth four pixels
+    UXTB16  tmpa, tmp6                  ;// |g3|g1|
+    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
+    UXTB16  tmpb, tmp5                  ;// |c3|c1|
+    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
+    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
+    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
+    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
+    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T
+
+    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
+    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
+
+    USAT16  tmpb, #13, tmpa             ;// saturate
+    LDR     res, = 0x00FF00FF
+    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
+    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
+    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
+
+    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
+    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
+    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
+    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
+    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
+    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T
+
+    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
+    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
+
+    USAT16  tmpb, #13, tmpa             ;// saturate
+    LDR     tmp5, [mb]
+    LDR     tmp4, = 0xFF00FF00
+    MVN     tmp5, tmp5
+
+    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
+    ORR     res, res, tmpa
+    LDR     tmpa, = 0x80808080
+    UHSUB8  res, res, tmp5              ;// bilinear interpolation
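+
+    ;// The eight post-incremented row loads above advanced ref by
+    ;// 8*width, so it is rewound below and stepped 4 bytes right to the
+    ;// next 4-pixel column; likewise STR #-44 nets mb to +4 after the
+    ;// three earlier +16 row stores (3*16 - 44 = 4).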
+
+    ;// decrement loop_x counter
+    SUBS    count, count, #4<<28        ;// decrement x loop counter
+
+    ;// calculate "ref" address for next round
+    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
+    ADD     ref, ref, #4                ;// next column (4 pixels)
+
+    EOR     res, res, tmpa              ;// correct sign
+    STR     res, [mb], #-44
+
+    BCS     loop_x
+
+    ADDS    mb, mb, #64                 ;// set Carry=0
+    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
+    AND     tmp6, count, #0x00F00000    ;// partWidth-1
+    SBC     ref, ref, tmp6, LSR #20     ;// -(partWidth-1)-1
+    SBC     mb, mb, tmp6, LSR #20       ;// -(partWidth-1)-1
+
+    ADDS    count, count, #0xC << 24    ;// decrement y loop counter
+    BGE     loop_y
+
+    ADD     sp, sp, #0x1f4
+    LDMFD   sp!, {r4-r11, pc}
+
+    END
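For reference, a scalar C model of the arithmetic implemented above (an
editor's sketch, not part of the commit: the names are illustrative, and
the boundary fill done by h264bsdFillBlock is assumed to have already
padded the reference block):

    #include <stdint.h>

    static uint8_t clip8(int x)
    {
        return (uint8_t)(x < 0 ? 0 : x > 255 ? 255 : x);
    }

    /* Six-tap H.264 filter, coefficients (1, -5, 20, 20, -5, 1). */
    static int tap6(int a, int b, int c, int d, int e, int f)
    {
        return a - 5 * b + 20 * c + 20 * d - 5 * e + f;
    }

    /* ref: padded source at its (x0, y0) corner, stride "width";
     * mb: output block with a 16-byte stride. */
    static void interp_hor_ver_quarter(const uint8_t *ref, int width,
                                       uint8_t *mb, int partW, int partH,
                                       int horVerOffset)
    {
        int hrow = 2 + ((horVerOffset & 2) >> 1); /* row of the horizontal half-pel */
        int vcol = 2 + (horVerOffset & 1);        /* column of the vertical half-pel */

        for (int y = 0; y < partH; y++) {
            for (int x = 0; x < partW; x++) {
                /* horizontal half-pel sample */
                const uint8_t *h = ref + (y + hrow) * width + x;
                int hor = clip8((tap6(h[0], h[1], h[2], h[3], h[4], h[5])
                                 + 16) >> 5);

                /* vertical half-pel sample */
                const uint8_t *v = ref + y * width + x + vcol;
                int ver = clip8((tap6(v[0], v[width], v[2 * width],
                                      v[3 * width], v[4 * width],
                                      v[5 * width]) + 16) >> 5);

                /* upward-rounded average of the two half-pel results */
                mb[y * 16 + x] = (uint8_t)((hor + ver + 1) >> 1);
            }
        }
    }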