;//
;// 
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;// 
;// 

        
;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
        M_VARIANTS CortexA8
        
;// Import symbols required from other files
;// (For example tables)
    
        
;// Set debugging level        
;//DEBUG_ON    SETL {TRUE}


;// Guarding implementation by the processor name
    
    IF  CortexA8

;// ---------------------------------------------------------------
;// Register aliases
;//
;// Several NEON aliases below intentionally name the same physical
;// D register. An alias is only meaningful while the value it
;// describes is live, so the instruction order in the function body
;// must be kept when editing.
;// ---------------------------------------------------------------

;// ARM core registers used as the two address operands
pDst                RN  0               ;// base address for the final store
pSrc                RN  1               ;// base address for the initial load

;// Coefficients as delivered by the de-interleaving load (D0-D3)
dSrc0               DN  D0.S16
dSrc1               DN  D1.S16
dSrc2               DN  D2.S16
dSrc3               DN  D3.S16

;// First pass temporaries (D4-D8)
dZeroes             DN  D4.S16          ;// all lanes 0, second operand of VHADD
dRowE0              DN  D5.S16
dRowE1              DN  D6.S16
dRowE2              DN  D7.S16
dRowE3              DN  D8.S16
dSrc1Half           DN  D7.S16          ;// same register as dRowE2
dSrc3Half           DN  D8.S16          ;// same register as dRowE3

;// First pass results, written back over the source registers (D0-D3)
dRowF0              DN  D0.S16
dRowF1              DN  D1.S16
dRowF2              DN  D2.S16
dRowF3              DN  D3.S16
qRowF01             QN  Q0.32           ;// D0:D1 viewed as 32-bit lanes
qRowF23             QN  Q1.32           ;// D2:D3 viewed as 32-bit lanes

;// Second pass temporaries (D5-D8 reused)
dColE0              DN  D5.S16
dColE1              DN  D6.S16
dColE2              DN  D7.S16
dColE3              DN  D8.S16
dRowF1Half          DN  D7.S16          ;// same register as dColE2
dRowF3Half          DN  D8.S16          ;// same register as dColE3

;// Final results, written back over D0-D3
dRes0               DN  D0.S16
dRes1               DN  D1.S16
dRes2               DN  D2.S16
dRes3               DN  D3.S16


    ;// No stack memory is allocated by this function body

    ;// Function header. d8 is handed to M_START because D8 is written
    ;// below; presumably the macro pair (armCOMM_s.h) saves and
    ;// restores it -- confirm against the macro definition.
        M_START armVCM4P10_TransformResidual4x4, ,d8

        ;// -------------------------------------------------------------
        ;// Outline
        ;//   1. One de-interleaving load fills four D registers with the
        ;//      4x4 block of 16-bit coefficients in transposed form
        ;//   2. First butterfly pass, four lanes at a time
        ;//   3. Transpose the 4x4 intermediate result
        ;//   4. Second butterfly pass, four lanes at a time
        ;//   5. Rounding shift right by 6, i.e. (x + 32) >> 6
        ;//   6. One store writes the whole 4x4 result
        ;// -------------------------------------------------------------

        ;// Step 1: VLD4 places every fourth halfword into the same
        ;// register, so the block arrives already transposed
        VLD4    {dSrc0,dSrc1,dSrc2,dSrc3},[pSrc]

        ;// A halving add against zero is used below as ">> 1"
        VMOV    dZeroes,#0

        ;// ------------------------------------
        ;// Step 2: first pass (d -> e -> f)
        ;// ------------------------------------
        VADD        dRowE0,dSrc0,dSrc2                  ;// e0 = d0 + d2
        VSUB        dRowE1,dSrc0,dSrc2                  ;// e1 = d0 - d2
        VHADD       dSrc1Half,dSrc1,dZeroes             ;// d1 >> 1
        VHADD       dSrc3Half,dSrc3,dZeroes             ;// d3 >> 1
        VSUB        dRowE2,dSrc1Half,dSrc3              ;// e2 = (d1 >> 1) - d3
        VADD        dRowE3,dSrc1,dSrc3Half              ;// e3 = d1 + (d3 >> 1)
        VADD        dRowF0,dRowE0,dRowE3                ;// f0 = e0 + e3
        VADD        dRowF1,dRowE1,dRowE2                ;// f1 = e1 + e2
        VSUB        dRowF2,dRowE1,dRowE2                ;// f2 = e1 - e2
        VSUB        dRowF3,dRowE0,dRowE3                ;// f3 = e0 - e3

        ;// ------------------------------------
        ;// Step 3: 4x4 transpose of f0..f3
        ;// 16-bit lane swaps inside each register pair, then 32-bit
        ;// lane swaps across the two Q registers
        ;// ------------------------------------
        VTRN    dRowF0,dRowF1
        VTRN    dRowF2,dRowF3
        VTRN    qRowF01,qRowF23

        ;// ------------------------------------
        ;// Step 4: second pass (f -> g -> h), same butterfly as step 2
        ;// ------------------------------------
        VADD        dColE0,dRowF0,dRowF2                ;// g0 = f0 + f2
        VSUB        dColE1,dRowF0,dRowF2                ;// g1 = f0 - f2
        VHADD       dRowF1Half,dRowF1,dZeroes           ;// f1 >> 1
        VHADD       dRowF3Half,dRowF3,dZeroes           ;// f3 >> 1
        VSUB        dColE2,dRowF1Half,dRowF3            ;// g2 = (f1 >> 1) - f3
        VADD        dColE3,dRowF1,dRowF3Half            ;// g3 = f1 + (f3 >> 1)
        VADD        dRes0,dColE0,dColE3                 ;// h0 = g0 + g3
        VADD        dRes1,dColE1,dColE2                 ;// h1 = g1 + g2
        VSUB        dRes2,dColE1,dColE2                 ;// h2 = g1 - g2
        VSUB        dRes3,dColE0,dColE3                 ;// h3 = g0 - g3

        ;// ------------------------------------
        ;// Step 5: final value = (h + 32) >> 6, done in place
        ;// ------------------------------------
        VRSHR       dRes0,dRes0,#6
        VRSHR       dRes1,dRes1,#6
        VRSHR       dRes2,dRes2,#6
        VRSHR       dRes3,dRes3,#6

        ;// ------------------------------------
        ;// Step 6: write the four result registers back to back
        ;// ------------------------------------
        VST1   {dRes0,dRes1,dRes2,dRes3},[pDst]


        ;// No return value is set here

End

        ;// Function tail
        M_END

    ENDIF                                                           ;//CortexA8
            
    END