;//
;// 
;// File Name:  armVCM4P10_Average_4x_Align_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//


;// Functions:
;//     armVCM4P10_Average_4x4_Align<ALIGNMENT>_unsafe  
;//
;// Implements Average of 4x4 with equation c = (a+b+1)>>1.
;// First operand will be at offset ALIGNMENT from aligned address
;// Second operand will be at aligned location and will be used as output.
;// destination pointed by (pDst) for vertical interpolation.
;// This function needs to copy 4 bytes in horizontal direction 
;//
;// Registers used as input for this function
;// r0,r1,r2,r3 where r2 containings aligned memory pointer and r3 step size
;//
;// Registers preserved for top level function
;// r4,r5,r6,r8,r9,r14
;//
;// Registers modified by the function
;// r7,r10,r11,r12
;//
;// Output registers
;// r2 - pointer to the aligned location
;// r3 - step size to this aligned location

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
        M_VARIANTS ARM1136JS

        EXPORT armVCM4P10_Average_4x4_Align0_unsafe
        EXPORT armVCM4P10_Average_4x4_Align2_unsafe
        EXPORT armVCM4P10_Average_4x4_Align3_unsafe

DEBUG_ON    SETL {FALSE}

;// Declare input registers
pPred0          RN 0
iPredStep0      RN 1
pPred1          RN 2
iPredStep1      RN 3
pDstPred        RN 2
iDstStep        RN 3

;// Declare other intermediate registers
iPredA0         RN 10
iPredA1         RN 11
iPredB0         RN 12
iPredB1         RN 14
Temp1           RN 4
Temp2           RN 5
ResultA         RN 5
ResultB         RN 4
r0x80808080     RN 7

    IF ARM1136JS
        
        ;// This function calculates average of 4x4 block 
        ;// pPred0 is at alignment offset 0 and pPred1 is alignment 4

        ;// Function header
        M_START armVCM4P10_Average_4x4_Align0_unsafe, r6

        ;// Code start        
        LDR         r0x80808080, =0x80808080

        ;// 1st load
        M_LDR       iPredB0, [pPred1]
        M_LDR       iPredA0, [pPred0], iPredStep0        
        M_LDR       iPredB1, [pPred1, iPredStep1]
        M_LDR       iPredA1, [pPred0], iPredStep0

        ;// (a+b+1)/2 = (a+256-(255-b))/2 = (a-(255-b))/2 + 128
        MVN         iPredB0, iPredB0
        MVN         iPredB1, iPredB1
        UHSUB8      ResultA, iPredA0, iPredB0
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep        
        
        ;// 2nd load
        M_LDR       iPredA0, [pPred0], iPredStep0        
        M_LDR       iPredB0, [pPred1]
        M_LDR       iPredA1, [pPred0], iPredStep0
        M_LDR       iPredB1, [pPred1, iPredStep1]

        MVN         iPredB0, iPredB0
        UHSUB8      ResultA, iPredA0, iPredB0
        MVN         iPredB1, iPredB1
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080        
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep                
End0
        M_END

        ;// This function calculates average of 4x4 block 
        ;// pPred0 is at alignment offset 2 and pPred1 is alignment 4

        ;// Function header
        M_START armVCM4P10_Average_4x4_Align2_unsafe, r6

        ;// Code start        
        LDR         r0x80808080, =0x80808080

        ;// 1st load
        LDR         Temp1, [pPred0, #4]
        M_LDR       iPredA0, [pPred0], iPredStep0        
        M_LDR       iPredB0, [pPred1]
        M_LDR       iPredB1, [pPred1, iPredStep1]
        M_LDR       Temp2, [pPred0, #4]
        M_LDR       iPredA1, [pPred0], iPredStep0
        MVN         iPredB0, iPredB0
        MVN         iPredB1, iPredB1        
        MOV         iPredA0, iPredA0, LSR #16
        ORR         iPredA0, iPredA0, Temp1, LSL #16        
        MOV         iPredA1, iPredA1, LSR #16
        ORR         iPredA1, iPredA1, Temp2, LSL #16

        ;// (a+b+1)/2 = (a+256-(255-b))/2 = (a-(255-b))/2 + 128
        UHSUB8      ResultA, iPredA0, iPredB0
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep        
        
        ;// 2nd load
        LDR         Temp1, [pPred0, #4]
        M_LDR         iPredA0, [pPred0], iPredStep0        
        LDR         iPredB0, [pPred1]
        LDR         iPredB1, [pPred1, iPredStep1]
        LDR         Temp2, [pPred0, #4]
        M_LDR         iPredA1, [pPred0], iPredStep0
        MVN         iPredB0, iPredB0
        MVN         iPredB1, iPredB1
        MOV         iPredA0, iPredA0, LSR #16
        ORR         iPredA0, iPredA0, Temp1, LSL #16        
        MOV         iPredA1, iPredA1, LSR #16
        ORR         iPredA1, iPredA1, Temp2, LSL #16

        UHSUB8      ResultA, iPredA0, iPredB0
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080        
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep                
End2
        M_END


        ;// This function calculates average of 4x4 block 
        ;// pPred0 is at alignment offset 3 and pPred1 is alignment 4

        ;// Function header
        M_START armVCM4P10_Average_4x4_Align3_unsafe, r6

        ;// Code start        
        LDR         r0x80808080, =0x80808080

        ;// 1st load
        LDR         Temp1, [pPred0, #4]
        M_LDR       iPredA0, [pPred0], iPredStep0        
        LDR         iPredB0, [pPred1]
        LDR         iPredB1, [pPred1, iPredStep1]
        LDR         Temp2, [pPred0, #4]
        M_LDR       iPredA1, [pPred0], iPredStep0

        MVN         iPredB0, iPredB0
        MVN         iPredB1, iPredB1
        MOV         iPredA0, iPredA0, LSR #24
        ORR         iPredA0, iPredA0, Temp1, LSL #8                
        MOV         iPredA1, iPredA1, LSR #24
        ORR         iPredA1, iPredA1, Temp2, LSL #8
        UHSUB8      ResultA, iPredA0, iPredB0
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep        
        
        ;// 2nd load
        LDR         Temp1, [pPred0, #4]
        M_LDR       iPredA0, [pPred0], iPredStep0        
        LDR         iPredB0, [pPred1]
        LDR         iPredB1, [pPred1, iPredStep1]
        LDR         Temp2, [pPred0, #4]
        M_LDR       iPredA1, [pPred0], iPredStep0

        MVN         iPredB0, iPredB0
        MVN         iPredB1, iPredB1
        MOV         iPredA0, iPredA0, LSR #24
        ORR         iPredA0, iPredA0, Temp1, LSL #8        
        MOV         iPredA1, iPredA1, LSR #24
        ORR         iPredA1, iPredA1, Temp2, LSL #8

        UHSUB8      ResultA, iPredA0, iPredB0
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080        
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep                
End3
        M_END

    ENDIF
    
    END