From 0c1bc742181ded4930842b46e9507372f0b1b963 Mon Sep 17 00:00:00 2001
From: James Dong <jdong@google.com>
Date: Tue, 31 May 2011 18:53:46 -0700
Subject: Initial-checkin for ON2 Software AVC/H264 decoder

o when neon is present, the performance gain of On2 AVC software decoder
  over PV software decoder is more than 30%.

o In addition, it fixes some known PV software decoder issues like missing
  output frames

o allow both pv and on2 software avc to be available for easy comparison

o change output frames from 8 to 16

Change-Id: I567ad1842025ead7092f0c47e3513d6d9ca232dd
---
 .../on2/h264dec/omxdl/arm11/api/armCOMM_IDCT_s.h   | 1445 ++++++++++++++++++++
 1 file changed, 1445 insertions(+)
 create mode 100644 media/libstagefright/codecs/on2/h264dec/omxdl/arm11/api/armCOMM_IDCT_s.h

(limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm11/api/armCOMM_IDCT_s.h')

diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/api/armCOMM_IDCT_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/api/armCOMM_IDCT_s.h
new file mode 100644
index 0000000..03f7137
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/api/armCOMM_IDCT_s.h
@@ -0,0 +1,1445 @@
+;//
+;// This confidential and proprietary software may be used only as
+;// authorised by a licensing agreement from ARM Limited
+;//   (C) COPYRIGHT 2004 ARM Limited
+;//       ALL RIGHTS RESERVED
+;// The entire notice above must be reproduced on all authorised
+;// copies and copies may only be made to the extent permitted
+;// by a licensing agreement from ARM Limited.
+;//
+;// IDCT_s.s
+;//
+;// Inverse DCT module
+;//
+;// 
+;// ALGORITHM DESCRIPTION
+;//
+;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
+;// column and then a 1D IDCT for each row.
+;//
+;// The 8-point 1D IDCT is defined by
+;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
+;//
+;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
+;//   c(u,x) = cos( (2x+1)*u*pi/16 )
+;//
+;// We compute the 8-point 1D IDCT using the reverse of
+;// the Arai-Agui-Nakajima flow graph which we split into
+;// 5 stages named in reverse order to identify with the
+;// forward DCT. Direct inversion of the forward formulae
+;// in file FDCT_s.s gives:
+;//
+;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
+;//             [ A(0) = 2*sqrt(2)
+;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
+;//
+;// IStage 4:   i0 = j0             i1 = j4
+;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
+;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
+;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
+;//
+;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
+;//             h2 = (i2*sqrt2)-i3  h3 = i3
+;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
+;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
+;//             [ The above two lines rotate by -(pi/8) ]
+;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2 
+;//             
+;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
+;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
+;//             g7 = h7             g6 = h6 - h7
+;//             g5 = h5 - g6        g4 = h4 - g5
+;//
+;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
+;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
+;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
+;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
+;//
+;// Note that most coefficients are halved 3 times during the
+;// above calculation. We can rescale the algorithm dividing
+;// the input by 8 to remove the halvings.
+;//
+;// IStage 5:   j(u) = T(u)*A(u)/8
+;//
+;// IStage 4:   i0 = j0             i1 = j4
+;//             i3 = j2 + j6        i2 = j2 - j6
+;//             i7 = j5 + j3        i4 = j5 - j3
+;//             i5 = j1 + j7        i6 = j1 - j7
+;//
+;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
+;//             h2 = (i2*sqrt2)-i3  h3 = i3
+;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
+;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
+;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7 
+;//             
+;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
+;//             g1 = h1 + h2        g2 = h1 - h2
+;//             g7 = h7             g6 = h6 - h7
+;//             g5 = h5 - g6        g4 = h4 - g5
+;//
+;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
+;//             f1 = g1 + g6        f6 = g1 - g6
+;//             f2 = g2 + g5        f5 = g2 - g5
+;//             f3 = g3 + g4        f4 = g3 - g4
+;//
+;// Note:
+;// 1. The scaling by A(u)/8 can often be combined with inverse
+;//    quantization. The column and row scalings can be combined.
+;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
+;//    to the above code but is otherwise identical.
+;// 3. The rotation by -pi/8 can be performed using three multiplies
+;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
+;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
+;// 4. If |T(u)|<=1 then from the IDCT definition,
+;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
+;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
+;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
+;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
+;//            = (approx)2.64
+;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
+;//    The table below shows input patterns generating the maximum
+;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
+;//    InputPattern      Max |f(x)|
+;//      PPPPPPPP        |f0| =  2.64
+;//      PPPMMMMM        |f1| =  2.64
+;//      PPMMMPPP        |f2| =  2.64
+;//      PPMMPPMM        |f3| =  2.64
+;//      PMMPPMMP        |f4| =  2.64
+;//      PMMPMMPM        |f5| =  2.64
+;//      PMPPMPMP        |f6| =  2.64
+;//      PMPMPMPM        |f7| =  2.64
+;//   Note that this input pattern is the transpose of the
+;//   corresponding max input pattern for the FDCT.
+
+;// Arguments
+
+pSrc    RN 0    ;// r0: pointer to the input (source) data block
+Stride  RN 1    ;// r1: destination stride in bytes; saved and used only when $stride = "s"
+pDest   RN 2    ;// r2: pointer to the output (destination) data buffer
+pScale  RN 3    ;// r3: pointer to the AAN scaling table ($inscale selects its format)
+
+
+        ;// DCT Inverse Macro
+        ;// The DCT code should be parametrized according
+        ;// to the following inputs:
+        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
+        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
+        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
+        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
+        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
+        ;//
+        ;// Inputs:
+        ;// pSrc   = r0 = Pointer to input data
+        ;//               Range is -256 to +255 (9-bit)
+        ;// Stride = r1 = Stride in bytes between output lines (read only when $stride = "s")
+        ;// pDest  = r2 = Pointer to output data
+        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
+        
+        
+        
+        MACRO
+        M_IDCT  $outsize, $inscale, $stride
+        LCLA    SHIFT
+        
+        
+        IF ARM1136JS
+        
+;// REGISTER ALLOCATION
+;// This is hard since we have 8 values, 9 free registers and each
+;// butterfly requires a temporary register. We also want to 
+;// maintain register order so we can use LDM/STM. The table below
+;// summarises the register allocation that meets all these criteria.
+;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
+;//
+;// r1  a01     g0  h0
+;// r4  b01 f0  g1  h1  i0
+;// r5  a23 f1  g2      i1
+;// r6  b23 f2  g3  h2  i2
+;// r7  a45 f3      h3  i3
+;// r8  b45 f4  g4  h4  i4
+;// r9  a67 f5  g5  h5  i5
+;// r10 b67 f6  g6  h6  i6
+;// r11     f7  g7  h7  i7
+;//
+ra01    RN 1
+rb01    RN 4
+ra23    RN 5
+rb23    RN 6
+ra45    RN 7
+rb45    RN 8
+ra67    RN 9
+rb67    RN 10
+rtmp    RN 11
+csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
+LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
+;// Transpose allocation
+xft     RN ra01
+xf0     RN rb01
+xf1     RN ra23
+xf2     RN rb23
+xf3     RN ra45
+xf4     RN rb45
+xf5     RN ra67
+xf6     RN rb67
+xf7     RN rtmp
+;// IStage 1 allocation
+xg0     RN xft
+xg1     RN xf0
+xg2     RN xf1
+xg3     RN xf2
+xgt     RN xf3
+xg4     RN xf4
+xg5     RN xf5
+xg6     RN xf6
+xg7     RN xf7
+;// IStage 2 allocation
+xh0     RN xg0
+xh1     RN xg1
+xht     RN xg2
+xh2     RN xg3
+xh3     RN xgt
+xh4     RN xg4
+xh5     RN xg5
+xh6     RN xg6
+xh7     RN xg7
+;// IStage 3,4 allocation
+xit     RN xh0
+xi0     RN xh1
+xi1     RN xht
+xi2     RN xh2
+xi3     RN xh3
+xi4     RN xh4
+xi5     RN xh5
+xi6     RN xh6
+xi7     RN xh7
+        
+        M_STR   pDest,  ppDest
+        IF "$stride"="s"
+            M_STR   Stride, pStride
+        ENDIF
+        M_ADR   pDest,  pBlk
+        LDR     csPiBy8, =0x30fc7642
+        LDR     LoopRR2, =0x00005a82
+  
+v6_idct_col$_F
+        ;// Load even values
+        LDR     xi4, [pSrc], #4  ;// j0
+        LDR     xi5, [pSrc, #4*16-4]  ;// j4
+        LDR     xi6, [pSrc, #2*16-4]  ;// j2
+        LDR     xi7, [pSrc, #6*16-4]  ;// j6
+        
+        ;// Scale Even Values
+        IF "$inscale"="s16" ;// 16x16 mul
+SHIFT       SETA    12
+            LDR     xi0, [pScale], #4
+            LDR     xi1, [pScale, #4*16-4]        
+            LDR     xi2, [pScale, #2*16-4]
+            MOV     xit, #1<<(SHIFT-1)
+            SMLABB  xi3, xi0, xi4, xit
+            SMLATT  xi4, xi0, xi4, xit
+            SMLABB  xi0, xi1, xi5, xit
+            SMLATT  xi5, xi1, xi5, xit
+            MOV     xi3, xi3, ASR #SHIFT
+            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
+            LDR     xi3, [pScale, #6*16-4]
+            SMLABB  xi1, xi2, xi6, xit
+            SMLATT  xi6, xi2, xi6, xit
+            MOV     xi0, xi0, ASR #SHIFT
+            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
+            SMLABB  xi2, xi3, xi7, xit
+            SMLATT  xi7, xi3, xi7, xit
+            MOV     xi1, xi1, ASR #SHIFT
+            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
+            MOV     xi2, xi2, ASR #SHIFT
+            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
+        ENDIF
+        IF "$inscale"="s32" ;// 32x16 mul
+SHIFT       SETA    (12+8-16)
+            MOV     xit, #1<<(SHIFT-1)
+            LDR     xi0, [pScale], #8
+            LDR     xi1, [pScale, #0*32+4-8]
+            LDR     xi2, [pScale, #4*32-8]
+            LDR     xi3, [pScale, #4*32+4-8]            
+            SMLAWB  xi0, xi0, xi4, xit
+            SMLAWT  xi1, xi1, xi4, xit
+            SMLAWB  xi2, xi2, xi5, xit
+            SMLAWT  xi3, xi3, xi5, xit            
+            MOV     xi0, xi0, ASR #SHIFT
+            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
+            MOV     xi2, xi2, ASR #SHIFT            
+            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
+            LDR     xi0, [pScale, #2*32-8]
+            LDR     xi1, [pScale, #2*32+4-8]
+            LDR     xi2, [pScale, #6*32-8]
+            LDR     xi3, [pScale, #6*32+4-8]            
+            SMLAWB  xi0, xi0, xi6, xit
+            SMLAWT  xi1, xi1, xi6, xit
+            SMLAWB  xi2, xi2, xi7, xit
+            SMLAWT  xi3, xi3, xi7, xit            
+            MOV     xi0, xi0, ASR #SHIFT
+            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
+            MOV     xi2, xi2, ASR #SHIFT            
+            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
+        ENDIF
+                
+        ;// Load odd values
+        LDR     xi0, [pSrc, #1*16-4]      ;// j1
+        LDR     xi1, [pSrc, #7*16-4]      ;// j7
+        LDR     xi2, [pSrc, #5*16-4]      ;// j5
+        LDR     xi3, [pSrc, #3*16-4]      ;// j3
+        
+        IF  {TRUE}
+            ;// shortcut if odd values 0
+            TEQ     xi0, #0
+            TEQEQ   xi1, #0
+            TEQEQ   xi2, #0
+            TEQEQ   xi3, #0
+            BEQ     v6OddZero$_F
+        ENDIF
+        
+        ;// Store scaled even values
+        STMIA   pDest, {xi4, xi5, xi6, xi7}
+        
+        ;// Scale odd values
+        IF "$inscale"="s16"
+            ;// Perform AAN Scale
+            LDR     xi4, [pScale, #1*16-4]
+            LDR     xi5, [pScale, #7*16-4]        
+            LDR     xi6, [pScale, #5*16-4]
+            SMLABB  xi7, xi0, xi4, xit
+            SMLATT  xi0, xi0, xi4, xit
+            SMLABB  xi4, xi1, xi5, xit
+            SMLATT  xi1, xi1, xi5, xit
+            MOV     xi7, xi7, ASR #SHIFT
+            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
+            LDR     xi7, [pScale, #3*16-4]
+            SMLABB  xi5, xi2, xi6, xit
+            SMLATT  xi2, xi2, xi6, xit
+            MOV     xi4, xi4, ASR #SHIFT
+            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
+            SMLABB  xi6, xi3, xi7, xit
+            SMLATT  xi3, xi3, xi7, xit
+            MOV     xi5, xi5, ASR #SHIFT
+            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
+            MOV     xi6, xi6, ASR #SHIFT
+            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
+        ENDIF
+        IF "$inscale"="s32" ;// 32x16 mul
+            LDR     xi4, [pScale, #1*32-8]
+            LDR     xi5, [pScale, #1*32+4-8]
+            LDR     xi6, [pScale, #7*32-8]
+            LDR     xi7, [pScale, #7*32+4-8]            
+            SMLAWB  xi4, xi4, xi0, xit
+            SMLAWT  xi5, xi5, xi0, xit
+            SMLAWB  xi6, xi6, xi1, xit
+            SMLAWT  xi7, xi7, xi1, xit            
+            MOV     xi4, xi4, ASR #SHIFT
+            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
+            MOV     xi6, xi6, ASR #SHIFT            
+            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
+            LDR     xi4, [pScale, #5*32-8]
+            LDR     xi5, [pScale, #5*32+4-8]
+            LDR     xi6, [pScale, #3*32-8]
+            LDR     xi7, [pScale, #3*32+4-8]            
+            SMLAWB  xi4, xi4, xi2, xit
+            SMLAWT  xi5, xi5, xi2, xit
+            SMLAWB  xi6, xi6, xi3, xit
+            SMLAWT  xi7, xi7, xi3, xit            
+            MOV     xi4, xi4, ASR #SHIFT
+            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
+            MOV     xi6, xi6, ASR #SHIFT            
+            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
+        ENDIF
+        
+        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
+        SSUB16  xi6, xi0, xi1           ;// j1-j7
+        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
+        SSUB16  xi4, xi2, xi3           ;// j5-j3
+        
+        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
+        
+        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
+        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
+        
+        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
+        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
+        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]   
+        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
+                
+        SMULBB  xi1, xi3, LoopRR2
+        SMULTB  xi3, xi3, LoopRR2
+                
+        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
+        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
+        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
+                
+        ;// xi0,xi1,xi2,xi3 now free
+        ;// IStage 4,3, rows 2to3 x1/2
+        
+        MOV     xi3, xi3, LSL #1
+        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
+        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
+                
+        ;// IStage 2, rows4to7
+        SSUB16  xg6, xh6, xh7
+        SSUB16  xg5, xh5, xg6        
+        SSUB16  xg4, xh4, xg5
+                
+        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
+        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
+        
+        SMULBB  xi0, xi2, LoopRR2
+        SMULTB  xi2, xi2, LoopRR2
+        
+        MOV     xi2, xi2, LSL #1
+        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
+        
+        ;// xi0, xi1 now free
+        ;// IStage 4,3 rows 0to1 x 1/2
+        LDRD    xi0, [pDest]            ;// j0, j4 scaled
+        SSUB16  xh2, xh2, xi3
+        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
+        
+        SHADD16 xh0, xi0, xi1
+        SHSUB16 xh1, xi0, xi1                
+        
+        ;// IStage 2 rows 0to3 x 1/2
+        SHSUB16 xg2, xh1, xh2
+        SHADD16 xg1, xh1, xh2
+        SHSUB16 xg3, xh0, xh3
+        SHADD16 xg0, xh0, xh3
+        
+        ;// IStage 1 all rows
+        SADD16  xf3, xg3, xg4
+        SSUB16  xf4, xg3, xg4
+        SADD16  xf2, xg2, xg5
+        SSUB16  xf5, xg2, xg5
+        SADD16  xf1, xg1, xg6
+        SSUB16  xf6, xg1, xg6
+        SADD16  xf0, xg0, xg7
+        SSUB16  xf7, xg0, xg7
+        
+        ;// Transpose, store and loop
+        PKHBT   ra01, xf0, xf1, LSL #16
+        PKHTB   rb01, xf1, xf0, ASR #16
+        
+        PKHBT   ra23, xf2, xf3, LSL #16
+        PKHTB   rb23, xf3, xf2, ASR #16
+        
+        PKHBT   ra45, xf4, xf5, LSL #16
+        PKHTB   rb45, xf5, xf4, ASR #16
+        
+        PKHBT   ra67, xf6, xf7, LSL #16
+        STMIA   pDest!, {ra01, ra23, ra45, ra67}      
+        PKHTB   rb67, xf7, xf6, ASR #16
+        STMIA   pDest!, {rb01, rb23, rb45, rb67}                              
+        BCC     v6_idct_col$_F
+        
+        SUB     pSrc, pDest, #(64*2)
+        M_LDR   pDest, ppDest
+        IF "$stride"="s"
+            M_LDR   pScale, pStride 
+        ENDIF
+        B       v6_idct_row$_F
+        
+v6OddZero$_F
+        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
+        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
+        
+        SMULBB  xi0, xi2, LoopRR2
+        SMULTB  xi2, xi2, LoopRR2
+        
+        MOV     xi2, xi2, LSL #1
+        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
+        SSUB16  xh2, xh2, xi3
+        
+        ;// xi0, xi1 now free
+        ;// IStage 4,3 rows 0to1 x 1/2
+        
+        SHADD16 xh0, xi4, xi5
+        SHSUB16 xh1, xi4, xi5                
+        
+        ;// IStage 2 rows 0to3 x 1/2
+        SHSUB16 xg2, xh1, xh2
+        SHADD16 xg1, xh1, xh2
+        SHSUB16 xg3, xh0, xh3
+        SHADD16 xg0, xh0, xh3
+               
+        ;// IStage 1 all rows
+        MOV  xf3, xg3
+        MOV  xf4, xg3
+        MOV  xf2, xg2
+        MOV  xf5, xg2
+        MOV  xf1, xg1
+        MOV  xf6, xg1
+        MOV  xf0, xg0
+        MOV  xf7, xg0
+        
+        ;// Transpose
+        PKHBT   ra01, xf0, xf1, LSL #16
+        PKHTB   rb01, xf1, xf0, ASR #16
+        
+        PKHBT   ra23, xf2, xf3, LSL #16
+        PKHTB   rb23, xf3, xf2, ASR #16
+        
+        PKHBT   ra45, xf4, xf5, LSL #16
+        PKHTB   rb45, xf5, xf4, ASR #16
+        
+        PKHBT   ra67, xf6, xf7, LSL #16
+        PKHTB   rb67, xf7, xf6, ASR #16
+                
+        STMIA   pDest!, {ra01, ra23, ra45, ra67}      
+        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
+        STMIA   pDest!, {rb01, rb23, rb45, rb67}      
+        
+        BCC     v6_idct_col$_F
+        SUB     pSrc, pDest, #(64*2)
+        M_LDR   pDest, ppDest
+        IF "$stride"="s"
+            M_LDR   pScale, pStride 
+        ENDIF
+               
+        
+v6_idct_row$_F
+        ;// IStage 4,3, rows4to7 x1/4
+        LDR     xit, =0x00010001        ;// rounding constant
+        LDR     xi0, [pSrc, #1*16]      ;// j1
+        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
+        LDR     xi2, [pSrc, #5*16]      ;// j5
+        LDR     xi3, [pSrc, #3*16]      ;// j3
+        
+        SHADD16 xi1, xi1, xit           ;// 2*j7
+        SHADD16 xi1, xi1, xit           ;// j7                
+        
+        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
+        SSUB16  xi6, xi0, xi1           ;// j1-j7
+        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
+        SSUB16  xi4, xi2, xi3           ;// j5-j3
+        
+        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
+        
+        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
+        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
+        
+        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
+        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
+        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]   
+        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
+                
+        SMULBB  xi1, xi3, LoopRR2
+        SMULTB  xi3, xi3, LoopRR2
+                
+        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
+        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
+        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
+        
+        MOV     xi3, xi3, LSL #1
+        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
+               
+        ;// xi0,xi1,xi2,xi3 now free
+        ;// IStage 4,3, rows 2to3 x1/2
+        
+        LDR     xi0, [pSrc, #2*16]      ;// j2
+        LDR     xi1, [pSrc, #6*16]      ;// 2*j6
+        
+        ;// IStage 2, rows4to7
+        SSUB16  xg6, xh6, xh7
+        SSUB16  xg5, xh5, xg6
+        SSUB16  xg4, xh4, xg5
+        
+        SHADD16 xi1, xi1, xit           ;// j6
+        SSUB16  xi2, xi0, xi1           ;// (j2-j6)        
+        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
+        
+        SMULBB  xi0, xi2, LoopRR2
+        SMULTB  xi2, xi2, LoopRR2
+        
+        MOV     xi2, xi2, LSL #1
+        
+        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
+        
+        ;// xi0, xi1 now free
+        ;// IStage 4,3 rows 0to1 x 1/2
+        LDR     xi1, [pSrc, #4*16]      ;// j4
+        LDR     xi0, [pSrc], #4         ;// j0
+
+        SSUB16  xh2, xh2, xi3
+        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
+        
+        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
+        SHADD16 xh0, xi0, xi1           ;// of DC result
+        SHSUB16 xh1, xi0, xi1
+                
+        ;// IStage 2 rows 0to3 x 1/2
+        SHSUB16 xg2, xh1, xh2
+        SHADD16 xg1, xh1, xh2
+        SHSUB16 xg3, xh0, xh3
+        SHADD16 xg0, xh0, xh3
+        
+        ;// IStage 1 all rows
+        SHADD16 xf3, xg3, xg4
+        SHSUB16 xf4, xg3, xg4
+        SHADD16 xf2, xg2, xg5
+        SHSUB16 xf5, xg2, xg5
+        SHADD16 xf1, xg1, xg6
+        SHSUB16 xf6, xg1, xg6
+        SHADD16 xf0, xg0, xg7
+        SHSUB16 xf7, xg0, xg7
+        
+        ;// Saturate
+        IF ("$outsize"="u8")
+            USAT16  xf0, #8, xf0
+            USAT16  xf1, #8, xf1
+            USAT16  xf2, #8, xf2
+            USAT16  xf3, #8, xf3
+            USAT16  xf4, #8, xf4
+            USAT16  xf5, #8, xf5
+            USAT16  xf6, #8, xf6
+            USAT16  xf7, #8, xf7        
+        ENDIF
+        IF ("$outsize"="s9")
+            SSAT16  xf0, #9, xf0
+            SSAT16  xf1, #9, xf1
+            SSAT16  xf2, #9, xf2
+            SSAT16  xf3, #9, xf3
+            SSAT16  xf4, #9, xf4
+            SSAT16  xf5, #9, xf5
+            SSAT16  xf6, #9, xf6
+            SSAT16  xf7, #9, xf7        
+        ENDIF
+        
+        ;// Transpose to Row, Pack and store
+        IF ("$outsize"="u8")
+            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
+            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
+            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
+            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
+            PKHBT   ra01, xf0, xf2, LSL #16
+            PKHTB   rb01, xf2, xf0, ASR #16
+            PKHBT   ra23, xf4, xf6, LSL #16
+            PKHTB   rb23, xf6, xf4, ASR #16
+            STMIA   pDest, {ra01, ra23}
+            IF "$stride"="s"
+                ADD     pDest, pDest, pScale
+                STMIA   pDest, {rb01, rb23}
+                ADD     pDest, pDest, pScale
+            ELSE                
+                ADD     pDest, pDest, #($stride)
+                STMIA   pDest, {rb01, rb23}
+                ADD     pDest, pDest, #($stride)
+            ENDIF
+        ENDIF
+        IF ("$outsize"="s9"):LOR:("$outsize"="s16")        
+            PKHBT   ra01, xf0, xf1, LSL #16
+            PKHTB   rb01, xf1, xf0, ASR #16
+        
+            PKHBT   ra23, xf2, xf3, LSL #16
+            PKHTB   rb23, xf3, xf2, ASR #16
+            
+            PKHBT   ra45, xf4, xf5, LSL #16
+            PKHTB   rb45, xf5, xf4, ASR #16
+            
+            PKHBT   ra67, xf6, xf7, LSL #16
+            PKHTB   rb67, xf7, xf6, ASR #16
+            
+            STMIA   pDest, {ra01, ra23, ra45, ra67}      
+            IF "$stride"="s"
+                ADD     pDest, pDest, pScale
+                STMIA   pDest, {rb01, rb23, rb45, rb67}      
+                ADD     pDest, pDest, pScale
+            ELSE                
+                ADD     pDest, pDest, #($stride)
+                STMIA   pDest, {rb01, rb23, rb45, rb67}      
+                ADD     pDest, pDest, #($stride)
+            ENDIF
+        ENDIF
+        
+        BCC     v6_idct_row$_F
+        ENDIF ;// ARM1136JS
+
+
+        IF CortexA8
+        
+Src0            EQU  7              
+Src1            EQU  8              
+Src2            EQU  9              
+Src3            EQU  10              
+Src4            EQU  11              
+Src5            EQU  12              
+Src6            EQU  13
+Src7            EQU  14
+Tmp             EQU  15
+
+qXj0            QN Src0.S16 
+qXj1            QN Src1.S16
+qXj2            QN Src2.S16
+qXj3            QN Src3.S16
+qXj4            QN Src4.S16
+qXj5            QN Src5.S16
+qXj6            QN Src6.S16
+qXj7            QN Src7.S16
+qXjt            QN Tmp.S16
+
+dXj0lo          DN (Src0*2).S16
+dXj0hi          DN (Src0*2+1).S16
+dXj1lo          DN (Src1*2).S16
+dXj1hi          DN (Src1*2+1).S16
+dXj2lo          DN (Src2*2).S16
+dXj2hi          DN (Src2*2+1).S16
+dXj3lo          DN (Src3*2).S16
+dXj3hi          DN (Src3*2+1).S16
+dXj4lo          DN (Src4*2).S16
+dXj4hi          DN (Src4*2+1).S16
+dXj5lo          DN (Src5*2).S16
+dXj5hi          DN (Src5*2+1).S16
+dXj6lo          DN (Src6*2).S16
+dXj6hi          DN (Src6*2+1).S16
+dXj7lo          DN (Src7*2).S16
+dXj7hi          DN (Src7*2+1).S16
+dXjtlo          DN (Tmp*2).S16
+dXjthi          DN (Tmp*2+1).S16
+
+qXi0            QN qXj0
+qXi1            QN qXj4
+qXi2            QN qXj2
+qXi3            QN qXj7
+qXi4            QN qXj5
+qXi5            QN qXjt
+qXi6            QN qXj1
+qXi7            QN qXj6
+qXit            QN qXj3
+
+dXi0lo          DN dXj0lo
+dXi0hi          DN dXj0hi
+dXi1lo          DN dXj4lo
+dXi1hi          DN dXj4hi
+dXi2lo          DN dXj2lo
+dXi2hi          DN dXj2hi
+dXi3lo          DN dXj7lo
+dXi3hi          DN dXj7hi
+dXi4lo          DN dXj5lo
+dXi4hi          DN dXj5hi
+dXi5lo          DN dXjtlo
+dXi5hi          DN dXjthi
+dXi6lo          DN dXj1lo
+dXi6hi          DN dXj1hi
+dXi7lo          DN dXj6lo
+dXi7hi          DN dXj6hi
+dXitlo          DN dXj3lo
+dXithi          DN dXj3hi
+
+qXh0            QN qXit
+qXh1            QN qXi0
+qXh2            QN qXi2
+qXh3            QN qXi3
+qXh4            QN qXi7
+qXh5            QN qXi5
+qXh6            QN qXi4
+qXh7            QN qXi1
+qXht            QN qXi6
+
+dXh0lo          DN dXitlo
+dXh0hi          DN dXithi
+dXh1lo          DN dXi0lo
+dXh1hi          DN dXi0hi
+dXh2lo          DN dXi2lo
+dXh2hi          DN dXi2hi
+dXh3lo          DN dXi3lo
+dXh3hi          DN dXi3hi
+dXh4lo          DN dXi7lo
+dXh4hi          DN dXi7hi
+dXh5lo          DN dXi5lo
+dXh5hi          DN dXi5hi
+dXh6lo          DN dXi4lo
+dXh6hi          DN dXi4hi
+dXh7lo          DN dXi1lo
+dXh7hi          DN dXi1hi
+dXhtlo          DN dXi6lo
+dXhthi          DN dXi6hi
+
+qXg0            QN qXh2
+qXg1            QN qXht
+qXg2            QN qXh1
+qXg3            QN qXh0
+qXg4            QN qXh4
+qXg5            QN qXh5
+qXg6            QN qXh6
+qXg7            QN qXh7
+qXgt            QN qXh3
+
+qXf0            QN qXg6
+qXf1            QN qXg5
+qXf2            QN qXg4
+qXf3            QN qXgt
+qXf4            QN qXg3
+qXf5            QN qXg2
+qXf6            QN qXg1
+qXf7            QN qXg0
+qXft            QN qXg7
+
+
+qXt0            QN 1.S32
+qXt1            QN 2.S32
+qT0lo           QN 1.S32         
+qT0hi           QN 2.S32         
+qT1lo           QN 3.S32         
+qT1hi           QN 4.S32         
+qScalelo        QN 5.S32        ;// used to read post scale values
+qScalehi        QN 6.S32
+qTemp0          QN 5.S32         
+qTemp1          QN 6.S32    
+
+
+Scale1          EQU 6
+Scale2          EQU 15
+qScale1         QN Scale1.S16     
+qScale2         QN Scale2.S16     
+dScale1lo       DN (Scale1*2).S16     
+dScale1hi       DN (Scale1*2+1).S16
+dScale2lo       DN (Scale2*2).S16     
+dScale2hi       DN (Scale2*2+1).S16
+
+dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
+InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
+S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
+C               DN dCoefs[2]    ;// Cos(PI/8) in Q15
+
+pTemp           RN 12
+
+                
+        IMPORT  armCOMM_IDCTCoef
+                    
+        VLD1        {qXj0,qXj1}, [pSrc @64]!
+        VLD1        {qXj2,qXj3}, [pSrc @64]!
+        VLD1        {qXj4,qXj5}, [pSrc @64]!
+        VLD1        {qXj6,qXj7}, [pSrc @64]!
+        
+        ;// Load PreScale and multiply with Src
+        ;// IStage 4
+        
+        IF "$inscale"="s16"                         ;// 16X16 Mul
+            M_IDCT_PRESCALE16
+        ENDIF
+        
+        IF "$inscale"="s32"                         ;// 32X32 Mul
+            M_IDCT_PRESCALE32
+        ENDIF
+
+        ;// IStage 3
+        VQRDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
+        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
+        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
+        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
+        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
+        VQRDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
+        VSUB        qXh2, qXi2, qXi3                ;// h2, h3
+
+        VMULL       qXt0, dXi4lo, C                 ;// c*i4
+        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
+        VMULL       qXt1, dXi4hi, C
+        VMLAL       qXt1, dXi6hi, S
+        VSHRN       dXh4lo, qXt0, #16               ;// h4
+        VSHRN       dXh4hi, qXt1, #16
+        
+        VMULL       qXt0, dXi6lo, C                 ;// c*i6
+        VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*i6
+        VMULL       qXt1, dXi6hi, C
+        VMLSL       qXt1, dXi4hi, S
+        VSHRN       dXh6lo, qXt0, #16               ;// h6
+        VSHRN       dXh6hi, qXt1, #16
+        
+        ;// IStage 2
+        VSUB        qXg6, qXh6, qXh7
+        VSUB        qXg5, qXh5, qXg6
+        VSUB        qXg4, qXh4, qXg5
+        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
+        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
+        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
+        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
+
+        ;// IStage 1 all rows
+        VADD        qXf3, qXg3, qXg4        
+        VSUB        qXf4, qXg3, qXg4        
+        VADD        qXf2, qXg2, qXg5        
+        VSUB        qXf5, qXg2, qXg5        
+        VADD        qXf1, qXg1, qXg6
+        VSUB        qXf6, qXg1, qXg6        
+        VADD        qXf0, qXg0, qXg7
+        VSUB        qXf7, qXg0, qXg7      
+
+        ;// Transpose, store and loop
+XTR0            EQU Src5
+XTR1            EQU Tmp
+XTR2            EQU Src6
+XTR3            EQU Src7
+XTR4            EQU Src3
+XTR5            EQU Src0
+XTR6            EQU Src1
+XTR7            EQU Src2
+XTRt            EQU Src4
+                
+qA0             QN  XTR0.S32  ;// for XTRpose
+qA1             QN  XTR1.S32
+qA2             QN  XTR2.S32
+qA3             QN  XTR3.S32
+qA4             QN  XTR4.S32
+qA5             QN  XTR5.S32
+qA6             QN  XTR6.S32
+qA7             QN  XTR7.S32
+
+dB0             DN  XTR0*2+1      ;// for using VSWP
+dB1             DN  XTR1*2+1
+dB2             DN  XTR2*2+1
+dB3             DN  XTR3*2+1
+dB4             DN  XTR4*2
+dB5             DN  XTR5*2
+dB6             DN  XTR6*2
+dB7             DN  XTR7*2
+
+          
+        VTRN        qXf0, qXf1
+        VTRN        qXf2, qXf3
+        VTRN        qXf4, qXf5
+        VTRN        qXf6, qXf7
+        VTRN        qA0, qA2
+        VTRN        qA1, qA3
+        VTRN        qA4, qA6
+        VTRN        qA5, qA7        
+        VSWP        dB0, dB4
+        VSWP        dB1, dB5
+        VSWP        dB2, dB6
+        VSWP        dB3, dB7
+        
+
+qYj0            QN qXf0
+qYj1            QN qXf1
+qYj2            QN qXf2
+qYj3            QN qXf3
+qYj4            QN qXf4
+qYj5            QN qXf5
+qYj6            QN qXf6
+qYj7            QN qXf7
+qYjt            QN qXft
+
+dYj0lo          DN (XTR0*2).S16
+dYj0hi          DN (XTR0*2+1).S16
+dYj1lo          DN (XTR1*2).S16
+dYj1hi          DN (XTR1*2+1).S16
+dYj2lo          DN (XTR2*2).S16
+dYj2hi          DN (XTR2*2+1).S16
+dYj3lo          DN (XTR3*2).S16
+dYj3hi          DN (XTR3*2+1).S16
+dYj4lo          DN (XTR4*2).S16
+dYj4hi          DN (XTR4*2+1).S16
+dYj5lo          DN (XTR5*2).S16
+dYj5hi          DN (XTR5*2+1).S16
+dYj6lo          DN (XTR6*2).S16
+dYj6hi          DN (XTR6*2+1).S16
+dYj7lo          DN (XTR7*2).S16
+dYj7hi          DN (XTR7*2+1).S16
+dYjtlo          DN (XTRt*2).S16
+dYjthi          DN (XTRt*2+1).S16
+
+qYi0            QN qYj0
+qYi1            QN qYj4
+qYi2            QN qYj2
+qYi3            QN qYj7
+qYi4            QN qYj5
+qYi5            QN qYjt
+qYi6            QN qYj1
+qYi7            QN qYj6
+qYit            QN qYj3
+
+dYi0lo          DN dYj0lo
+dYi0hi          DN dYj0hi
+dYi1lo          DN dYj4lo
+dYi1hi          DN dYj4hi
+dYi2lo          DN dYj2lo
+dYi2hi          DN dYj2hi
+dYi3lo          DN dYj7lo
+dYi3hi          DN dYj7hi
+dYi4lo          DN dYj5lo
+dYi4hi          DN dYj5hi
+dYi5lo          DN dYjtlo
+dYi5hi          DN dYjthi
+dYi6lo          DN dYj1lo
+dYi6hi          DN dYj1hi
+dYi7lo          DN dYj6lo
+dYi7hi          DN dYj6hi
+dYitlo          DN dYj3lo
+dYithi          DN dYj3hi
+
+qYh0            QN qYit
+qYh1            QN qYi0
+qYh2            QN qYi2
+qYh3            QN qYi3
+qYh4            QN qYi7
+qYh5            QN qYi5
+qYh6            QN qYi4
+qYh7            QN qYi1
+qYht            QN qYi6
+
+dYh0lo          DN dYitlo
+dYh0hi          DN dYithi
+dYh1lo          DN dYi0lo
+dYh1hi          DN dYi0hi
+dYh2lo          DN dYi2lo
+dYh2hi          DN dYi2hi
+dYh3lo          DN dYi3lo
+dYh3hi          DN dYi3hi
+dYh4lo          DN dYi7lo
+dYh4hi          DN dYi7hi
+dYh5lo          DN dYi5lo
+dYh5hi          DN dYi5hi
+dYh6lo          DN dYi4lo
+dYh6hi          DN dYi4hi
+dYh7lo          DN dYi1lo
+dYh7hi          DN dYi1hi
+dYhtlo          DN dYi6lo
+dYhthi          DN dYi6hi
+
+qYg0            QN qYh2
+qYg1            QN qYht
+qYg2            QN qYh1
+qYg3            QN qYh0
+qYg4            QN qYh4
+qYg5            QN qYh5
+qYg6            QN qYh6
+qYg7            QN qYh7
+qYgt            QN qYh3
+
+qYf0            QN qYg6
+qYf1            QN qYg5
+qYf2            QN qYg4
+qYf3            QN qYgt
+qYf4            QN qYg3
+qYf5            QN qYg2
+qYf6            QN qYg1
+qYf7            QN qYg0
+qYft            QN qYg7
+
+        VRSHR       qYj7, qYj7, #2
+        VRSHR       qYj6, qYj6, #1
+        
+        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
+        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
+        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
+        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
+        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
+        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
+
+        VQRDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
+        ;// IStage 4,3 rows 0to1 x 1/2
+        
+        MOV         pTemp, #0x4             ;// ensure correct round
+        VDUP        qScale1, pTemp           ;// of DC result
+        VADD        qYi0, qYi0, qScale1
+        
+        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
+        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
+
+        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
+        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
+        VSUB        qYh2, qYi2, qYi3        ;// h2, h3
+        VQRDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
+
+        VMULL       qXt0, dYi4lo, C         ;// c*i4
+        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
+        VMULL       qXt1, dYi4hi, C
+        VMLAL       qXt1, dYi6hi, S
+        VSHRN       dYh4lo, qXt0, #16       ;// h4
+        VSHRN       dYh4hi, qXt1, #16
+        
+        VMULL       qXt0, dYi6lo, C         ;// c*i6
+        VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*h6
+        VMULL       qXt1, dYi6hi, C
+        VMLSL       qXt1, dYi4hi, S
+        VSHRN       dYh6lo, qXt0, #16       ;// h6
+        VSHRN       dYh6hi, qXt1, #16
+        
+        VSUB        qYg6, qYh6, qYh7
+        VSUB        qYg5, qYh5, qYg6
+        VSUB        qYg4, qYh4, qYg5
+        
+        ;// IStage 2 rows 0to3 x 1/2
+        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
+        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
+        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
+        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
+        
+
+        ;// IStage 1 all rows
+        VHADD        qYf3, qYg3, qYg4        
+        VHSUB        qYf4, qYg3, qYg4        
+        VHADD        qYf2, qYg2, qYg5        
+        VHSUB        qYf5, qYg2, qYg5        
+        VHADD        qYf1, qYg1, qYg6
+        VHSUB        qYf6, qYg1, qYg6        
+        VHADD        qYf0, qYg0, qYg7
+        VHSUB        qYf7, qYg0, qYg7      
+
+YTR0            EQU Src0
+YTR1            EQU Src4
+YTR2            EQU Src1
+YTR3            EQU Src2
+YTR4            EQU Src7
+YTR5            EQU Src5
+YTR6            EQU Tmp
+YTR7            EQU Src6
+YTRt            EQU Src3
+
+qC0             QN  YTR0.S32                ;// for YTRpose
+qC1             QN  YTR1.S32
+qC2             QN  YTR2.S32
+qC3             QN  YTR3.S32
+qC4             QN  YTR4.S32
+qC5             QN  YTR5.S32
+qC6             QN  YTR6.S32
+qC7             QN  YTR7.S32
+
+dD0             DN  YTR0*2+1                ;// for using VSWP
+dD1             DN  YTR1*2+1
+dD2             DN  YTR2*2+1
+dD3             DN  YTR3*2+1
+dD4             DN  YTR4*2
+dD5             DN  YTR5*2
+dD6             DN  YTR6*2
+dD7             DN  YTR7*2
+          
+        VTRN        qYf0, qYf1
+        VTRN        qYf2, qYf3
+        VTRN        qYf4, qYf5
+        VTRN        qYf6, qYf7
+        VTRN        qC0, qC2
+        VTRN        qC1, qC3
+        VTRN        qC4, qC6
+        VTRN        qC5, qC7        
+        VSWP        dD0, dD4
+        VSWP        dD1, dD5
+        VSWP        dD2, dD6
+        VSWP        dD3, dD7
+
+        
+dYf0U8          DN YTR0*2.U8
+dYf1U8          DN YTR1*2.U8
+dYf2U8          DN YTR2*2.U8
+dYf3U8          DN YTR3*2.U8
+dYf4U8          DN YTR4*2.U8
+dYf5U8          DN YTR5*2.U8
+dYf6U8          DN YTR6*2.U8
+dYf7U8          DN YTR7*2.U8
+        
+        ;//
+        ;// Do saturation if outsize is other than S16
+        ;//
+        
+        IF ("$outsize"="u8")
+            ;// Output range [0-255]
+            VQMOVN            dYf0U8, qYf0
+            VQMOVN            dYf1U8, qYf1
+            VQMOVN            dYf2U8, qYf2
+            VQMOVN            dYf3U8, qYf3
+            VQMOVN            dYf4U8, qYf4
+            VQMOVN            dYf5U8, qYf5
+            VQMOVN            dYf6U8, qYf6
+            VQMOVN            dYf7U8, qYf7
+        ENDIF
+        
+        IF ("$outsize"="s9")
+            ;// Output range [-256 to +255]
+            VQSHL            qYf0, qYf0, #16-9
+            VQSHL            qYf1, qYf1, #16-9
+            VQSHL            qYf2, qYf2, #16-9
+            VQSHL            qYf3, qYf3, #16-9
+            VQSHL            qYf4, qYf4, #16-9
+            VQSHL            qYf5, qYf5, #16-9
+            VQSHL            qYf6, qYf6, #16-9
+            VQSHL            qYf7, qYf7, #16-9
+            
+            VSHR             qYf0, qYf0, #16-9
+            VSHR             qYf1, qYf1, #16-9
+            VSHR             qYf2, qYf2, #16-9
+            VSHR             qYf3, qYf3, #16-9
+            VSHR             qYf4, qYf4, #16-9
+            VSHR             qYf5, qYf5, #16-9
+            VSHR             qYf6, qYf6, #16-9
+            VSHR             qYf7, qYf7, #16-9
+        ENDIF
+
+        ;// Store output depending on the Stride size
+        IF "$stride"="s"
+            VST1        qYf0, [pDest @64], Stride
+            VST1        qYf1, [pDest @64], Stride
+            VST1        qYf2, [pDest @64], Stride
+            VST1        qYf3, [pDest @64], Stride
+            VST1        qYf4, [pDest @64], Stride
+            VST1        qYf5, [pDest @64], Stride
+            VST1        qYf6, [pDest @64], Stride
+            VST1        qYf7, [pDest @64]            
+        ELSE
+            IF ("$outsize"="u8")
+                VST1        dYf0U8, [pDest @64], #8
+                VST1        dYf1U8, [pDest @64], #8
+                VST1        dYf2U8, [pDest @64], #8
+                VST1        dYf3U8, [pDest @64], #8
+                VST1        dYf4U8, [pDest @64], #8
+                VST1        dYf5U8, [pDest @64], #8
+                VST1        dYf6U8, [pDest @64], #8
+                VST1        dYf7U8, [pDest @64]
+            ELSE
+                ;// ("$outsize"="s9") or ("$outsize"="s16")
+                VST1        qYf0, [pDest @64], #16
+                VST1        qYf1, [pDest @64], #16
+                VST1        qYf2, [pDest @64], #16
+                VST1        qYf3, [pDest @64], #16
+                VST1        qYf4, [pDest @64], #16
+                VST1        qYf5, [pDest @64], #16
+                VST1        qYf6, [pDest @64], #16
+                VST1        qYf7, [pDest @64]
+            ENDIF
+        
+        ENDIF
+
+
+
+        ENDIF ;// CortexA8
+
+
+
+        MEND        
+
+        ;// Scale TWO input rows with TWO rows of 16 bit scale values
+        ;//
+        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
+        ;// input (eight S16 values each) with two rows of scale values. It also
+        ;// loads the next two rows of scale values from pScale if $LastRow is "0".
+        ;// Each result is the widened product in*scale, rounded, shifted right by 12 and saturated on narrowing.
+        ;// Input Registers:
+        ;//
+        ;// $dAlo           - Input D register with first four S16 values of row n
+        ;// $dAhi           - Input D register with next four S16 values of row n
+        ;// $dBlo           - Input D register with first four S16 values of row n+1
+        ;// $dBhi           - Input D register with next four S16 values of row n+1
+        ;// pScale          - Pointer to the next unread row of scale values
+        ;// qT0lo           - Temporary scratch register (overwritten)
+        ;// qT0hi           - Temporary scratch register (overwritten)
+        ;// qT1lo           - Temporary scratch register (overwritten)
+        ;// qT1hi           - Temporary scratch register (overwritten)
+        ;// dScale1lo       - Scale values for the first four values of row n
+        ;// dScale1hi       - Scale values for the next four values of row n
+        ;// dScale2lo       - Scale values for the first four values of row n+1
+        ;// dScale2hi       - Scale values for the next four values of row n+1
+        ;// NOTE(review): dScale1*/dScale2* are presumably the D halves of qScale1/qScale2 (defined outside this view) - confirm
+        ;// Input Flag
+        ;//
+        ;// $LastRow        - "0" loads two more rows of scale values; any other value skips the load
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $dAlo           - Scaled output values (first four S16 of row n)
+        ;// $dAhi           - Scaled output values (next four S16 of row n)
+        ;// $dBlo           - Scaled output values (first four S16 of row n+1)
+        ;// $dBhi           - Scaled output values (next four S16 of row n+1)
+        ;// qScale1         - Next row of scale values (reloaded only when $LastRow is "0")
+        ;// qScale2         - Row of scale values after that (reloaded only when $LastRow is "0")
+        ;// pScale          - Advanced by 32 bytes when $LastRow is "0", otherwise unchanged
+        ;//
+        MACRO
+        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
+        VMULL       qT0lo, $dAlo, dScale1lo     ;// Widening multiply: row n, first four values
+        VMULL       qT0hi, $dAhi, dScale1hi     ;// Widening multiply: row n, next four values
+        VMULL       qT1lo, $dBlo, dScale2lo     ;// Widening multiply: row n+1, first four values
+        VMULL       qT1hi, $dBhi, dScale2hi     ;// Widening multiply: row n+1, next four values
+        IF "$LastRow"="0"
+            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2 (issued only after the multiplies above)
+            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
+        ENDIF
+        VQRSHRN       $dAlo, qT0lo, #12         ;// Round, shift right by 12, saturate and narrow: row n, first four
+        VQRSHRN       $dAhi, qT0hi, #12         ;// Same for row n, next four
+        VQRSHRN       $dBlo, qT1lo, #12         ;// Same for row n+1, first four
+        VQRSHRN       $dBhi, qT1hi, #12         ;// Same for row n+1, next four
+        MEND
+
+        ;// Scale 8x8 block input values with 16 bit scale values
+        ;//
+        ;// This macro is used to pre-scale a block of 8x8 input values.
+        ;// It also performs the first stage transformations of the IDCT.
+        ;// It reads eight rows (128 bytes) of scale values, post-incrementing pScale past them.
+        ;// Input Registers:
+        ;//
+        ;// dXjnlo          - n th input D register with first four S16 values
+        ;// dXjnhi          - n th input D register with next four S16 values
+        ;// qXjn            - n th input Q register with eight S16 values
+        ;// pScale          - Pointer to scale values
+        ;//
+        ;// Output Registers:
+        ;// (i2..i7 are written below; i0/i1 are presumably aliases of the scaled j0/j4 - confirm in the QN definitions)
+        ;// qXin            - n th output Q register with eight S16 output values of 1st stage
+        ;// pSrc, dCoefs    - Address of armCOMM_IDCTCoef and the constants loaded from that address
+        MACRO
+        M_IDCT_PRESCALE16
+        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
+        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
+        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1
+        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3
+        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5
+        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7 (last pair: no further scale load)
+        VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
+        VSUB        qXi6, qXj1, qXj7            ;// j1-j7
+        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants (overwrites pSrc)
+        VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
+        VSUB        qXi2, qXj2, qXj6            ;// j2-j6
+        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
+        VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
+        VSUB        qXi4, qXj5, qXj3            ;// j5-j3
+        MEND    
+        
+        
+        ;// Scale 8x8 block input values with 32 bit scale values
+        ;//
+        ;// This macro is used to pre-scale a block of 8x8 input values.
+        ;// It also performs the first stage transformations of the IDCT.
+        ;// Per value: VSHLL widens to 32 bit and shifts left by 11, VQRDMULH multiplies by the scale, VMOVN keeps the low 16 bits.
+        ;// Input Registers:
+        ;//
+        ;// dXjnlo          - n th input D register with first four S16 values
+        ;// dXjnhi          - n th input D register with next four S16 values
+        ;// qXjn            - n th input Q register with eight S16 values
+        ;// pScale          - Pointer to 32bit scale values in Q23 format
+        ;// pSrc            - Row 4 is reloaded from [pSrc - 64]; NOTE(review): confirm the value of pSrc on entry with the caller
+        ;// Output Registers:
+        ;//
+        ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
+        ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
+        ;// pSrc, dCoefs    - Address of armCOMM_IDCTCoef and the constants loaded from it; pScale and pTemp are modified
+        MACRO
+        M_IDCT_PRESCALE32
+qScale0lo       QN 0.S32                ;// Scale rows share Q0-Q3; each alias is live only while its own row is scaled
+qScale0hi       QN 1.S32
+qScale1lo       QN 2.S32
+qScale1hi       QN 3.S32
+qScale2lo       QN qScale1lo            ;// Scale rows 2, 3 and 4 reuse Q2/Q3
+qScale2hi       QN qScale1hi
+qScale3lo       QN qScale1lo
+qScale3hi       QN qScale1hi
+qScale4lo       QN qScale1lo
+qScale4hi       QN qScale1hi
+qScale5lo       QN qScale0lo            ;// Scale rows 5, 6 and 7 reuse Q0/Q1
+qScale5hi       QN qScale0hi
+qScale6lo       QN qScale0lo
+qScale6hi       QN qScale0hi
+qScale7lo       QN qScale0lo
+qScale7hi       QN qScale0hi
+
+qSrc0lo         QN 4.S32                ;// Widened (32 bit) input rows share Q4, Q5, Q6 and Src4
+qSrc0hi         QN 5.S32
+qSrc1lo         QN 6.S32
+qSrc1hi         QN Src4.S32             ;// Src4 is an EQU defined outside this macro
+qSrc2lo         QN qSrc0lo              ;// Rows 2, 3, 4 and 7 reuse the row 0 pair
+qSrc2hi         QN qSrc0hi
+qSrc3lo         QN qSrc0lo
+qSrc3hi         QN qSrc0hi
+qSrc4lo         QN qSrc0lo
+qSrc4hi         QN qSrc0hi
+qSrc5lo         QN qSrc1lo              ;// Rows 5 and 6 reuse the row 1 pair
+qSrc5hi         QN qSrc1hi
+qSrc6lo         QN qSrc1lo
+qSrc6hi         QN qSrc1hi
+qSrc7lo         QN qSrc0lo
+qSrc7hi         QN qSrc0hi
+
+qRes17lo        QN qScale0lo            ;// Butterfly results reuse Q0/Q1 once the scale held there has been consumed
+qRes17hi        QN qScale0hi
+qRes26lo        QN qScale0lo
+qRes26hi        QN qScale0hi
+qRes53lo        QN qScale0lo
+qRes53hi        QN qScale0hi
+
+            ADD         pTemp, pScale, #4*8*7           ;// Address of scale row 7 (7 rows * 8 values * 4 bytes)
+            
+            ;// Row 0 (the scaling of rows 1 and 7 is started here as well)
+            VLD1        {qScale0lo, qScale0hi}, [pScale]!   ;// Scale row 0
+            VSHLL       qSrc0lo, dXj0lo, #(12-1)            ;// Widen j0 to 32 bit and shift left by 11
+            VSHLL       qSrc0hi, dXj0hi, #(12-1)            
+            VLD1        {qScale1lo, qScale1hi}, [pScale]!   ;// Scale row 1
+            VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo         ;// j0 * scale0 / 2^20, rounded
+            VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
+            VLD1        {qScale7lo, qScale7hi}, [pTemp]!    ;// Scale row 7 (replaces scale row 0, already used)
+            VSHLL       qSrc1lo, dXj1lo, #(12-1)
+            VSHLL       qSrc1hi, dXj1hi, #(12-1)            
+            VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
+            VMOVN       dXi0hi, qSrc0hi
+            VSHLL       qSrc7lo, dXj7lo, #(12-1)        ;// Reuses the row 0 pair now that i0 has been narrowed
+            VSHLL       qSrc7hi, dXj7hi, #(12-1)
+            SUB         pTemp, pTemp, #((16*2)+(4*8*1)) ;// Undo the 32 byte post-increment and step back one row: scale row 6
+            VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
+            VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
+            VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
+            VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
+            VLD1        {qScale2lo, qScale2hi}, [pScale]!   ;// Scale row 2 (replaces scale row 1, already used)
+
+            ;// Row 1 & 7 (the scaling of rows 2 and 6 is started here as well)
+            VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
+            VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
+            VMOVN       dXi5lo, qRes17lo                ;// Output i5
+            VMOVN       dXi5hi, qRes17hi              
+            VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
+            VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
+            VMOVN       dXi6lo, qRes17lo                ;// Output i6
+            VMOVN       dXi6hi, qRes17hi      
+            VSHLL       qSrc2lo, dXj2lo, #(12-1)
+            VSHLL       qSrc2hi, dXj2hi, #(12-1)
+            VLD1        {qScale6lo, qScale6hi}, [pTemp]!    ;// Scale row 6 (replaces the i6 result, already narrowed)
+            VSHLL       qSrc6lo, dXj6lo, #(12-1)
+            VSHLL       qSrc6hi, dXj6hi, #(12-1)
+            SUB         pTemp, pTemp, #((16*2)+(4*8*1)) ;// Undo the 32 byte post-increment and step back one row: scale row 5
+            VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
+            VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
+            VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
+            VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
+            VLD1        {qScale3lo, qScale3hi}, [pScale]!   ;// Scale row 3 (replaces scale row 2, already used)
+
+            ;// Row 2 & 6 (the scaling of rows 3 and 5 is started here as well)
+            VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
+            VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
+            VMOVN       dXi3lo, qRes26lo                ;// Output i3
+            VMOVN       dXi3hi, qRes26hi              
+            VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
+            VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
+            VMOVN       dXi2lo, qRes26lo                ;// Output i2
+            VMOVN       dXi2hi, qRes26hi      
+            VSHLL       qSrc3lo, dXj3lo, #(12-1)
+            VSHLL       qSrc3hi, dXj3hi, #(12-1)
+            VLD1        {qScale5lo, qScale5hi}, [pTemp]!    ;// Scale row 5 (replaces the i2 result, already narrowed)
+            VSHLL       qSrc5lo, dXj5lo, #(12-1)
+            VSHLL       qSrc5hi, dXj5hi, #(12-1)
+            VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
+            VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
+            VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
+            VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
+            
+            ;// Row 3 & 5 (row 4 is reloaded and its scaling started here as well)
+            VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
+            VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
+            SUB         pSrc, pSrc, #16*2*2             ;// Step pSrc back 64 bytes to the address row 4 is reloaded from
+            VMOVN       dXi7lo, qRes53lo                ;// Output i7
+            VMOVN       dXi7hi, qRes53hi              
+            VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
+            VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
+            VLD1        qXj4, [pSrc @64]                ;// Reload j4 (presumably its register served as scratch above - confirm)
+            VMOVN       dXi4lo, qRes53lo                ;// Output i4
+            VMOVN       dXi4hi, qRes53hi                              
+            VSHLL       qSrc4lo, dXj4lo, #(12-1)
+            VSHLL       qSrc4hi, dXj4hi, #(12-1)
+            VLD1        {qScale4lo, qScale4hi}, [pScale]    ;// Scale row 4 (no writeback: pScale stays at row 4)
+            LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
+            VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
+            VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
+            VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
+            ;// Row 4
+            VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
+            VMOVN       dXi1hi, qSrc4hi              
+        
+        MEND
+                                                
+        END
-- 
cgit v1.1