;//
;// 
;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//

  
        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
        EXPORT armVCM4P10_pIndexTable8x8
        
;// Define the processor variants supported by this file
         
         M_VARIANTS CortexA8
     
     AREA table, DATA    
;//-------------------------------------------------------
;// Jump table that implements a C-style switch in assembly
;// by the method of two levels of indexing: predMode
;// selects a table entry, and that entry holds the address
;// of the code label to branch to.
;//-------------------------------------------------------

    ;// Entry i is the label loaded into pc by omxVCM4P10_PredictIntraChroma_8x8
    ;// when predMode == i (the table is read at [table + predMode*4]).
    M_TABLE armVCM4P10_pIndexTable8x8
    DCD  OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR 
    DCD  OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE  
    
    ;// 16-bit weights used by the OMX_VC_CHROMA_PLANE path.
    ;//   Row 0 (4 halfwords) : weights for the H/V gradient sums, in the lane
    ;//                         order the code arranges the edge differences.
    ;//   Rows 1-2 (8 halfwords): -3..4, i.e. (n - 3) for n = 0..7, used as the
    ;//                         per-column and per-row offsets.
    ;// NOTE(review): the second M_TABLE argument (1) is interpreted by the
    ;// macro in armCOMM_s.h - presumably an alignment; confirm there.
    M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
    DCW   3, 2, 1,4 
    DCW  -3,-2,-1,0
    DCW   1, 2, 3,4
    
        
    IF CortexA8

;//--------------------------------------------
;// Scratch variable
;//--------------------------------------------
 
pc              RN 15   
return          RN 0    ;// status code; shares r0 with pSrcLeft, so it is only
                        ;// written once pSrcLeft is no longer needed
pTable          RN 8    ;// address of armVCM4P10_pIndexTable8x8
  
;//--------------------------------------------
;// Input Arguments
;//--------------------------------------------
;// r0-r3 are used directly on entry; r4-r7 are filled by the function
;// from its stack arguments before use.
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable
pMultiplierTable    RN  2     ;// shares r2 with pSrcAboveLeft; loaded only after
                              ;// the above-left sample has been read

pTmp            RN 9    ;// second row pointer (base + stride), walks the odd rows
step            RN 10   ;// 2 * stride, post-increment for both row pointers

;//---------------------
;// Neon Registers
;//---------------------
;// The aliases below overlap on purpose: each prediction mode reuses the same
;// D/Q registers under mode-specific names, and several names are the same
;// register viewed with a different element type (U8/U16/U32/U64/S16/S32).
;// Any change to a register number must be checked against every other alias
;// that is live at the same time in the function body.

;// OMX_VC_CHROMA_HOR
;// One register per output row. D0-D7 are the same registers as dSum0-dSum7,
;// which the shared plane/horizontal store sequence writes to pDst.

dLeftVal0       DN  D0.8
dLeftVal1       DN  D1.8
dLeftVal2       DN  D2.8
dLeftVal3       DN  D3.8
dLeftVal4       DN  D4.8
dLeftVal5       DN  D5.8
dLeftVal6       DN  D6.8
dLeftVal7       DN  D7.8

;// OMX_VC_CHROMA_VERT

dAboveVal       DN  D0.U8   ;// same register as dDstRow0, so the loaded row is stored as is

;// OMX_VC_CHROMA_DC
;// D1, D2 and D3 each carry several names: the sums are narrowed in place
;// (U8 -> U16 -> U32) and later read back as bytes by VTBL/VDUP.

dLeftVal        DN  D1.U8
dSumAboveValU16 DN  D2.U16
dSumAboveValU32 DN  D3.U32
dSumAboveValU8  DN  D3.U8
dSumLeftValU16  DN  D2.U16
dSumLeftValU32  DN  D1.U32
dSumLeftValU8   DN  D1.U8
dSumAboveLeft   DN  D2.U32
dSumAboveLeftU8 DN  D2.U8
dIndexRow0U8    DN  D5.U8
dIndexRow0      DN  D5.U64
dIndexRow4U8    DN  D6.U8
dIndexRow4      DN  D6.U64
dDstRow0        DN  D0.U8   ;// output rows 0-3 (all 8 rows on the upper-only/none/vertical paths)
dDstRow4        DN  D4.U8   ;// output rows 4-7 when the left edge is available
dConst128U8     DN  D0.U8   ;// same register as dDstRow0

;// OMX_VC_CHROMA_PLANE
;// dAboveVal (D0) and dLeftVal (D1) from the sections above are reused here.

dRevAboveVal    DN  D3.U8  
dRevAboveValU64 DN  D3.U64  
dAboveLeftVal   DN  D2.U8
qAbove7minus0   QN  Q3.S16 
qAboveDiff      QN  Q2.S16 
dIndex          DN  D8.U8   ;// NOTE(review): not referenced by the code in this file
dDiffAboveU8    DN  D9.U8  
dDiffAboveS16   DN  D9.S16 
dAboveDiff0U8   DN  D4.U8   ;// low half of qAboveDiff (Q2)
dAboveDiff0U64  DN  D4.U64
dAbove7minus0U8 DN  D6.U8   ;// low half of qAbove7minus0 (Q3)
dMultiplier     DN  D10.S16 
dHorPred        DN  D11.S16 
dRevLeftVal     DN  D3.U8
dRevLeftValU64  DN  D3.U64
qLeft7minus0    QN  Q7.S16
qLeftDiff       QN  Q6.S16
dDiffLeftU8     DN  D16.U8
dDiffLeftS16    DN  D16.S16
dLeftDiff0U8    DN  D12.U8  ;// low half of qLeftDiff (Q6)
dLeftDiff0U64   DN  D12.U64
dLeft7minus0U8  DN  D14.U8  ;// low half of qLeft7minus0 (Q7)
dVerPred        DN  D3.S16 
dHVValS16       DN  D3.S16
dHVValS32       DN  D3.S32
dHVTempS32      DN  D2.S32
qA              QN  Q0.S16
qB              QN  Q2.S16
qC              QN  Q3.S16
qMultiplier     QN  Q5.S16  ;// Q5 = {dMultiplier0, dMultiplier1}
dMultiplier0    DN  D10.S16
dMultiplier1    DN  D11.S16
;// qCn holds the row-n term broadcast to all lanes; qSumn is the same register
;// after the column term has been added in.
qC0             QN  Q0.S16
qC1             QN  Q1.S16
qC2             QN  Q4.S16
qC3             QN  Q5.S16
qC4             QN  Q6.S16
qC5             QN  Q7.S16
qC6             QN  Q8.S16
qC7             QN  Q9.S16
qSum0           QN  Q0.S16
qSum1           QN  Q1.S16
qSum2           QN  Q4.S16
qSum3           QN  Q5.S16
qSum4           QN  Q6.S16
qSum5           QN  Q7.S16
qSum6           QN  Q8.S16
qSum7           QN  Q9.S16
;// Narrowed 8-bit output rows 0-7, written in ascending order from qSum0-qSum7.
dSum0           DN  D0.U8
dSum1           DN  D1.U8
dSum2           DN  D2.U8
dSum3           DN  D3.U8
dSum4           DN  D4.U8
dSum5           DN  D5.U8
dSum6           DN  D6.U8
dSum7           DN  D7.U8

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 starts
;//
;// Purpose : writes an 8x8 block of predicted chroma samples to pDst, built from
;//           the neighbouring samples (left column, row above, above-left sample)
;//           according to predMode.
;// In      : r0 = pSrcLeft       left-edge samples, one every leftStep bytes
;//           r1 = pSrcAbove      8 contiguous samples of the row above
;//           r2 = pSrcAboveLeft  above-left sample (read by the PLANE mode only)
;//           r3 = pDst           output block, rows dstStep bytes apart
;//           stack: leftStep, dstStep, predMode, availability (4 bytes each)
;// Out     : r0 = OMX_Sts_NoErr on every path in this file
;// Modes   : predMode indexes armVCM4P10_pIndexTable8x8 to select one of the
;//           OMX_VC_CHROMA_DC / _HOR / _VERT / _PLANE labels below.
;//           availability (OMX_VC_LEFT / OMX_VC_UPPER bits) is only examined by
;//           the DC mode.
;// NOTE(review): no argument is validated here; in particular predMode is used
;//           as a table index without a range check, so an out-of-range value
;//           loads pc from whatever follows the 4-entry table.
;//-----------------------------------------------------------------------------------------------
        
        ;// Write function header
        ;// NOTE(review): r10 / d15 presumably tell M_START the highest core and
        ;// NEON registers to preserve - confirm in armCOMM_s.h.
        M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15
        
        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4
        
        LDR      pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case
        
        ;// Load argument from the stack
        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg 
        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg 
        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg         
        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg 
        
        
        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode

;// ---------------------------------------------------------------------------
;// DC mode: each 4x4 quadrant of the block is filled with a rounded mean of
;// the neighbouring edge samples that are available.
;// ---------------------------------------------------------------------------
OMX_VC_CHROMA_DC
        
        TST     availability, #OMX_VC_LEFT
        BEQ     DCChroma8x8LeftNotAvailable

        ;// Two interleaved pointers (even rows / odd rows), each advancing by 2*leftStep
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Load Left Edge
        VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep]
        
        TST     availability, #OMX_VC_UPPER
        BEQ     DCChroma8x8LeftOnlyAvailable

        ;// Load Upper Edge also
        VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[0 to 7]  
        
        ;// r0 (pSrcLeft) is no longer needed, so it can take the status code
        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
        
        VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]             
        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ] 
                
        VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]             
        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]             
        
        VADD     dSumAboveLeft,dSumAboveValU32,dSumLeftValU32 ;// [ above[4..7]+left[4..7] | above[0..3]+left[0..3] ]
        VRSHR    dSumAboveLeft,dSumAboveLeft,#3             ;// Sum = (Sum + 4) >> 3
        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
        
        ;// Each mean now sits in byte 0 (low half) or byte 4 (high half) of its
        ;// register. Build byte indices for VTBL that pick, per output column,
        ;// the right mean out of a two-register (16-byte) table.
        VMOV     dIndexRow0U8,#0x0c                         
        VMOV     dIndexRow4U8,#0x04
        VSHL     dIndexRow0,dIndexRow0,#32                  ;// index0 = 0x0c0c0c0c00000000 
        VSHR     dIndexRow4,dIndexRow4,#32                  ;// index4 = 0x0000000004040404
        VADD     dIndexRow4U8,dIndexRow4U8,dIndexRow0U8     ;// index4 = 0x0c0c0c0c04040404
        ;// Rows 0-3: x=0..3 <- table byte 0  = (above[0..3]+left[0..3]+4)>>3
        ;//           x=4..7 <- table byte 12 = (above[4..7]+2)>>2
        VTBL     dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8
        ;// Rows 4-7: x=0..3 <- table byte 4  = (left[4..7]+2)>>2
        ;//           x=4..7 <- table byte 12 = (above[4..7]+left[4..7]+4)>>3
        VTBL     dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8
 
;// Store for the DC paths that have a left edge: rows 0-3 and rows 4-7 differ.
DCChroma8x8LeftStore       
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep
        
        VST1     dDstRow0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1     dDstRow4,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1     dDstRow4,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1     dDstRow4,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1     dDstRow4,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7

        ;// NOTE(review): M_EXIT presumably returns through the epilogue that
        ;// M_END emits - confirm in armCOMM_s.h.
        M_EXIT
        

;// DC, left edge only: rows 0-3 use the mean of left[0..3], rows 4-7 the mean of left[4..7].
DCChroma8x8LeftOnlyAvailable

        MOV      return, #OMX_Sts_NoErr
        
        VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]             
        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]   
        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
        
        VDUP     dDstRow0,dSumLeftValU8[0]                  ;// low byte of the left[0..3] mean
        VDUP     dDstRow4,dSumLeftValU8[4]                  ;// low byte of the left[4..7] mean
        
        B        DCChroma8x8LeftStore  
        

;// DC, no left edge: use the upper edge if present, otherwise the constant 0x80.
DCChroma8x8LeftNotAvailable
                 
        TST     availability, #OMX_VC_UPPER
        BEQ     DCChroma8x8NoneAvailable

        ;// Load Upper Edge 
        VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[0 to 7]  
        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
        
        VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]             
        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ] 
        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
        VMOV     dIndexRow0U8,#0x04
        VSHL     dIndexRow0,dIndexRow0,#32                  ;// index = 0x0404040400000000
        ;// x=0..3 <- byte 0 (mean of above[0..3]), x=4..7 <- byte 4 (mean of above[4..7])
        VTBL     dDstRow0,{dSumAboveValU8},dIndexRow0U8 
        
        B        DCChroma8x8UpperStore
        

DCChroma8x8NoneAvailable        
        
        VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080 if(count == 0)
        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError

;// Store used whenever all 8 output rows are identical (DC upper-only, DC
;// none-available and the vertical mode): D0 is written to every row.
DCChroma8x8UpperStore        
        
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep
        
        VST1     dDstRow0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7
        
        M_EXIT


;// ---------------------------------------------------------------------------
;// Vertical mode: every output row is a copy of the row above.
;// ---------------------------------------------------------------------------
OMX_VC_CHROMA_VERT
        
        VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[x]      :0<= x <= 7   
        MOV      return, #OMX_Sts_NoErr
        
        B        DCChroma8x8UpperStore
        

;// ---------------------------------------------------------------------------
;// Horizontal mode: output row y is pSrcLeft[y*leftStep] repeated 8 times.
;// ---------------------------------------------------------------------------
OMX_VC_CHROMA_HOR
        
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep
        
        ;// The [] form loads one byte and replicates it to all lanes of the register
        VLD1    {dLeftVal0[]},[pSrcLeft],step           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal1[]},[pTmp],step               ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal2[]},[pSrcLeft],step           ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal3[]},[pTmp],step               ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal4[]},[pSrcLeft],step           ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal5[]},[pTmp],step               ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal6[]},[pSrcLeft],step           ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal7[]},[pTmp]                    ;// pSrcLeft[7*leftStep]
        
        ;// dLeftVal0-7 are D0-D7, the registers the plane store writes out
        B        DCChroma8x8PlaneStore
        
        
;// ---------------------------------------------------------------------------
;// Plane mode: pred[y][x] = Clip(0,255,(a + b*(x-3) + c*(y-3) + 16) >> 5), with
;//   a = 16*(pSrcAbove[7] + pSrcLeft[7*leftStep])
;//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
;//   H / V = weighted sums of differences along the upper / left edge.
;// In the lane comments below "AL" is the above-left sample pSrcAboveLeft[0].
;// ---------------------------------------------------------------------------
OMX_VC_CHROMA_PLANE
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep
        
        VLD1    dAboveVal,[pSrcAbove]                       ;// pSrcAbove[x]      :0<= x <= 7   
        VLD1    dAboveLeftVal[0],[pSrcAboveLeft]            ;// AL into lane 0 (other lanes are not used)
        
        VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep] 
        
        
        ;// Lane lists are written highest lane first, so lane 0 is the rightmost entry
        VREV64  dRevAboveVal,dAboveVal                      ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7]
        VSUBL   qAbove7minus0,dRevAboveVal,dAboveLeftVal    ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0]
        VSHR    dRevAboveValU64,dRevAboveValU64,#8          ;// pSrcAbove[X:0:1:2:3:4:5:6]
        VSUBL   qAboveDiff,dRevAboveVal,dAboveVal           ;// pSrcAbove[6] - pSrcAbove[0]
                                                            ;// pSrcAbove[5] - pSrcAbove[1]
                                                            ;// pSrcAbove[4] - pSrcAbove[2]
        
        VREV64  dRevLeftVal,dLeftVal                        ;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7]
        VSUBL   qLeft7minus0,dRevLeftVal,dAboveLeftVal      ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0]
        VSHR    dRevLeftValU64,dRevLeftValU64,#8            ;// pSrcLeft[X:0:1:2:3:4:5:6]
        VSUBL   qLeftDiff,dRevLeftVal,dLeftVal              ;// pSrcLeft[6] - pSrcLeft[0]
                                                            ;// pSrcLeft[5] - pSrcLeft[1]
                                                            ;// pSrcLeft[4] - pSrcLeft[2]
        
        ;// r2 (pSrcAboveLeft) has been consumed above and is reused as the table pointer
        LDR     pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8   ;// Used to calculate Hval & Vval  
        ;// Shift the three differences up one 16-bit lane, then VEXT by 2 bytes
        ;// drops the emptied lane and appends the (7 - AL) term as lane 3.
        VSHL    dAboveDiff0U64,dAboveDiff0U64,#16  
        VEXT    dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2           ;// pSrcAbove[ 7-AL | 4-2 | 5-1 | 6-0 ]
        VLD1    dMultiplier,[pMultiplierTable]!                         ;// weights [ 4 | 1 | 2 | 3 ]; pointer advances to the next table row
        VSHL    dLeftDiff0U64,dLeftDiff0U64,#16  
        VEXT    dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2              ;// pSrcLeft[ 7-AL | 4-2 | 5-1 | 6-0 ]
                                                                    
        
        VMUL    dHorPred,dDiffAboveS16,dMultiplier                      ;// pSrcAbove[ 4*(7-AL) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
        VMUL    dVerPred,dDiffLeftS16,dMultiplier                       ;// pSrcLeft [ 4*(7-AL) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
        VPADD   dHVValS16,dHorPred,dVerPred                             ;// pairwise partial sums: [ V | V | H | H ]
        
        
        VPADDL  dHVValS32,dHVValS16                                     ;// [V|H] in 32 bits each
        VSHL    dHVTempS32,dHVValS32,#4                                 ;// 17*H = 16*H + H = (H<<4)+H
        VADD    dHVValS32,dHVValS32,dHVTempS32                          ;// [ 17*V  | 17*H ]in 32 bits each
        VLD1    {dMultiplier0,dMultiplier1},[pMultiplierTable]          ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]  
        VRSHR   dHVValS32,dHVValS32,#5                                  ;// [c|b] = (17*[V|H] + 16) >> 5, still in 32-bit lanes
        VADDL   qA,dAboveVal,dLeftVal                                   ;// lane 7 = pSrcAbove[7] + pSrcLeft[7]
        VDUP    qA,qA[7]
        VSHL    qA,qA,#4                                                ;// [a|a|a|a|a|a|a|a]
        ;// b and c are read as the low 16 bits of their 32-bit lanes (S16 lanes 0 and 2)
        VDUP    qB,dHVValS16[0]                                         ;// [b|b|b|b|b|b|b|b]
        VDUP    qC,dHVValS16[2]                                         ;// [c|c|c|c|c|c|c|c]
        
        
        VMUL    qB,qB,qMultiplier                                       ;// lane x = b*(x-3)
        VMUL    qC,qC,qMultiplier                                       ;// lane y = c*(y-3)
        VADD    qB,qB,qA                                                ;// lane x = a + b*(x-3), shared by all rows
        
        ;// Broadcast the row term c*(y-3) for each of the 8 rows.
        ;// qC0 overwrites qA and qC3 overwrites qMultiplier; both are dead by now.
        VDUP    qC0,qC[0]
        VDUP    qC1,qC[1]
        VDUP    qC2,qC[2]
        VDUP    qC3,qC[3]
        VDUP    qC4,qC[4]
        VDUP    qC5,qC[5]
        VDUP    qC6,qC[6]
        VDUP    qC7,qC[7]
        
        ;// Row y: lane x = a + b*(x-3) + c*(y-3)
        VADD    qSum0,qB,qC0
        VADD    qSum1,qB,qC1
        VADD    qSum2,qB,qC2
        VADD    qSum3,qB,qC3
        VADD    qSum4,qB,qC4
        VADD    qSum5,qB,qC5
        VADD    qSum6,qB,qC6
        VADD    qSum7,qB,qC7
        
        ;// Narrow in ascending order: dSumN overlaps Q registers that were
        ;// already narrowed by the preceding instructions.
        VQRSHRUN dSum0,qSum0,#5                         ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
        VQRSHRUN dSum1,qSum1,#5
        VQRSHRUN dSum2,qSum2,#5
        VQRSHRUN dSum3,qSum3,#5
        VQRSHRUN dSum4,qSum4,#5
        VQRSHRUN dSum5,qSum5,#5
        VQRSHRUN dSum6,qSum6,#5
        VQRSHRUN dSum7,qSum7,#5      

;// Store shared by the plane and horizontal modes: D0-D7 hold rows 0-7.
DCChroma8x8PlaneStore        
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep
        
        VST1    dSum0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    dSum1,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    dSum2,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    dSum3,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    dSum4,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    dSum5,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    dSum6,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    dSum7,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7       
        
        MOV     return, #OMX_Sts_NoErr               ;// status for both the plane and horizontal paths
        M_END
        
        ENDIF ;// CortexA8
        
        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 ends
;//-----------------------------------------------------------------------------------------------