;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;// 
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;// 
;// 
;// 
;//
;// Description:
;// H.264 inverse quantize and transform module
;// 
;// 

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
;// Import/Export symbols required from/to other files
;// (For example tables)
        
        IMPORT armVCM4P10_UnpackBlock4x4 
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixQPModTable
        
        M_VARIANTS CortexA8

;// Set debugging level        
;//DEBUG_ON    SETL {TRUE}


;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// Guarding implementation by the processor name
    
    IF  CortexA8
    
;//--------------------------------------------------------------------------
;// armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// Works in place on the 16 signed 16-bit values addressed by pData:
;//   1. two passes of add/subtract butterflies with a 4x4 transpose
;//      between them (see the strategy box after M_START),
;//   2. every result is multiplied by
;//      (armVCM4P10_VMatrixQPModTable[QP] << armVCM4P10_QPDivTable[QP]),
;//      2 is added, and the 32-bit sum is shifted right by 2 and narrowed
;//      back to 16 bits.
;//
;// In:    r0 = pData (read with VLD4, written back with VST1)
;//        r1 = QP    (byte index into both look-up tables; it is not
;//                    range checked in this routine)
;// Uses:  r2-r5, D0-D13.
;//        NOTE(review): "r5,d13" on M_START presumably tells the macro to
;//        save/restore the callee-saved registers up to r5 and d13 -
;//        confirm against armCOMM_s.h.
;//--------------------------------------------------------------------------

;//Input Registers
pData               RN  0
QP                  RN  1    


;//Local Scratch Registers

;// ARM Registers

pQPDivTable         RN  2
pQPModTable         RN  3
Shift               RN  4
Scale               RN  5

;// NEON Registers
;// Several alias groups below name the same physical registers (D0-D3 and
;// D4-D7). Each group is only valid during its own stage, so the order of
;// the instructions in the body must not be changed casually.

;// Packed Input pixels
dIn0                DN  D0.S16
dIn1                DN  D1.S16
dIn2                DN  D2.S16
dIn3                DN  D3.S16   

;// Intermediate calculations
dRowSum1            DN  D4.S16
dRowSum2            DN  D5.S16
dRowDiff1           DN  D6.S16
dRowDiff2           DN  D7.S16

;// Row operated pixels (overwrite the inputs in D0-D3)
dRowOp0             DN  D0.S16
dRowOp1                DN  D1.S16
dRowOp2                DN  D2.S16
dRowOp3                DN  D3.S16
;// Q0 = D0:D1 and Q1 = D2:D3, used for the 32-bit step of the transpose
qRowOp01            QN  Q0.32
qRowOp23            QN  Q1.32

;// Intermediate calculations (same registers as dRowSum*/dRowDiff*)
dColSum1            DN  D4.S16
dColSum2            DN  D5.S16
dColDiff1           DN  D6.S16
dColDiff2           DN  D7.S16

;// Column operated pixels (overwrite dRowOp0-3 in D0-D3)
dColOp0             DN  D0.S16
dColOp1                DN  D1.S16
dColOp2                DN  D2.S16
dColOp3                DN  D3.S16

;// Temporary scratch variables
;// dScale reuses D5 (dColSum2) and qRound0 (Q3) covers D6/D7
;// (dColDiff1/dColDiff2); they are first written after dColOp0-3 have
;// been computed, i.e. after the last read of those intermediates.

dScale              DN  D5.S16
qRound0             QN  Q3.S32
qRound1             QN  Q4.S32
qRound2             QN  Q5.S32
qRound3             QN  Q6.S32

;// InvTransformed and Dequantized pixels (final values, again in D0-D3)
dOut0               DN  D0.S16
dOut1                DN  D1.S16
dOut2                DN  D2.S16
dOut3                DN  D3.S16

       
    ;// Allocate stack memory required by the function
    ;// (nothing is allocated: the routine keeps everything in registers)
        

    ;// Write function header
    M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13
    
    ;******************************************************************
    ;// The strategy used in implementing the transform is as follows:*
    ;// Load the 4x4 block into 4 D-registers                         *  
    ;// Transpose the 4x4 matrix                                      *  
    ;// Perform the row operations (on columns) using SIMD            *  
    ;// Transpose the 4x4 result matrix                               *  
    ;// Perform the column operations                                 *
    ;******************************************************************

        ;// Load all the 4x4 pixels in Transposed form:
        ;// VLD4 de-interleaves, so element k of every group of four
        ;// consecutive 16-bit values lands in dIn<k>.
        
        VLD4    {dIn0,dIn1,dIn2,dIn3},[pData]
        LDR     pQPDivTable, =armVCM4P10_QPDivTable        ;// QP Division look-up-table base pointer
        LDR     pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
        
        ;**************************************** 
        ;// Row Operations (Performed on columns)
        ;**************************************** 
        ;// Scale factor calculation is done using ARM instructions
        ;// interleaved with NEON instructions in order to dual issue
        
        VADD    dRowSum1,dIn0,dIn1
        VADD    dRowSum2,dIn2,dIn3
        VSUB    dRowDiff1,dIn0,dIn1
        LDRSB   Shift, [pQPDivTable, QP]               ;// ARM CODE: Shift = pQPDivTable[QP]
        VSUB    dRowDiff2,dIn2,dIn3
        LDRSB   Scale, [pQPModTable, QP]               ;// ARM CODE: Scale = pQPModTable[QP] 
        VADD    dRowOp0,dRowSum1,dRowSum2
        VSUB    dRowOp1,dRowSum1,dRowSum2
        VSUB    dRowOp2,dRowDiff1,dRowDiff2
        LSL     Scale, Scale, Shift                    ;// ARM CODE: Scale = Scale << Shift
        VADD    dRowOp3,dRowDiff1,dRowDiff2
        
        ;****************************************
        ;// Transpose the resultant matrix
        ;// (two 16-bit VTRNs on the D pairs, then one 32-bit VTRN on
        ;//  the Q pair, together give the 4x4 transpose)
        ;****************************************
        
        VTRN    dRowOp0,dRowOp1
        VTRN    dRowOp2,dRowOp3
        VTRN    qRowOp01,qRowOp23 
        
        ;**************************************** 
        ;// Column Operations 
        ;// (same butterfly pattern as the row stage)
        ;**************************************** 
        
        VADD    dColSum1,dRowOp0,dRowOp1
        VADD    dColSum2,dRowOp2,dRowOp3
        VSUB    dColDiff1,dRowOp0,dRowOp1
        VSUB    dColDiff2,dRowOp2,dRowOp3
        VADD    dColOp0,dColSum1,dColSum2
        VSUB    dColOp1,dColSum1,dColSum2
        VSUB    dColOp2,dColDiff1,dColDiff2
        VADD    dColOp3,dColDiff1,dColDiff2
        
        ;//----------------------------------------------------------------------
        ;//
        ;// <Dequantize> improves on the c-reference code
        ;// Both cases, i.e. Shift>=0 and Shift<0, are covered together.
        ;// We do not subtract 2 from Shift as in the C reference; instead we
        ;// perform Scale << Shift once at the beginning and do a right shift
        ;// by a constant 2 after the multiplication. The value of Round is 2.
        ;// 
        ;// By doing this we avoid the branches required and also 
        ;// reduce the code size substantially
        ;// 
        ;//----------------------------------------------------------------------
        
        
        VDUP    dScale, Scale                            ;// ARM -> NEON  copy 'scale' to all four 16-bit lanes
               
                
        VMOV    qRound0,#2                               ;// Set the Round Value (accumulator start for VMLAL)
        VMOV    qRound1,#2
        VMOV    qRound2,#2
        VMOV    qRound3,#2
        
        VMLAL   qRound0,dColOp0,dScale                   ;// 32-bit: pDst[i] * Scale + Round 
        VMLAL   qRound1,dColOp1,dScale
        VMLAL   qRound2,dColOp2,dScale
        VMLAL   qRound3,dColOp3,dScale
        
        VSHRN   dOut0,qRound0,#2                          ;// Right shift by 2 & (OMX_S16)Value (narrowing, not saturating)
        VSHRN   dOut1,qRound1,#2
        VSHRN   dOut2,qRound2,#2
        VSHRN   dOut3,qRound3,#2
        
        ;***************************
        ;// Store all the 4x4 pixels
        ;// (VST1 writes D0..D3 back to pData one after another,
        ;//  without interleaving)
        ;***************************
        
        VST1  {dOut0,dOut1,dOut2,dOut3}, [pData]

        
        ;// Set return value
        ;// (no instruction here: r0 is not written by this routine)
        
        ;// Write function tail
        M_END        
        
    ENDIF                                                           ;//CORTEXA8   
        

;//--------------------------------------------------------------------------
;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Thin wrapper around two helpers:
;//   1. armVCM4P10_UnpackBlock4x4 is called with r0 (ppSrc) and r1 (pDst)
;//      exactly as they arrived,
;//   2. armVCM4P10_InvTransformDequantLumaDC4x4 is then called with
;//      r0 = pDst and r1 = QP.
;// The function always returns OMX_Sts_NoErr in r0.
;//--------------------------------------------------------------------------

;// Arguments on entry
ppSrc               RN  0
pDst                RN  1
QPR2                RN  2

;// Return value
result              RN  0

;// Copies of pDst and QP that are held in r4/r5 while the first helper runs
pDstKeep            RN  4
QPKeep              RN  5

;// Argument registers for the second helper
pBlock              RN  0
QPArg               RN  1

;// Only built for the CortexA8 variant

    IF CortexA8

        ;// Function prologue (register list ends at r5)
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        ;// r1 and r2 are needed again after the first call, so park them
        MOV     QPKeep,QPR2                         ;// r5 = QP
        MOV     pDstKeep,pDst                       ;// r4 = pDst
        BL      armVCM4P10_UnpackBlock4x4

        ;// Hand the parked values to the transform/dequant helper
        MOV     QPArg,QPKeep                        ;// r1 = QP
        MOV     pBlock,pDstKeep                     ;// r0 = pDst
        BL      armVCM4P10_InvTransformDequantLumaDC4x4

        ;// Report success
        MOV     result,#OMX_Sts_NoErr

        ;// Function epilogue
        M_END

    ENDIF                                                           ;//CortexA8
    

    END