1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
|
;//
;// Copyright (C) 2004 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// IDCT_s.s
;//
;// Inverse DCT module
;//
;//
;// ALGORITHM DESCRIPTION
;//
;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
;// column and then a 1D IDCT for each row.
;//
;// The 8-point 1D IDCT is defined by
;// f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
;//
;// C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
;// c(u,x) = cos( (2x+1)*u*pi/16 )
;//
;// We compute the 8-point 1D IDCT using the reverse of
;// the Arai-Agui-Nakajima flow graph which we split into
;// 5 stages named in reverse order to identify with the
;// forward DCT. Direct inversion of the forward formulae
;// in file FDCT_s.s gives:
;//
;// IStage 5: j(u) = T(u)*A(u) [ A(u)=4*C(u)*c(u,0) ]
;// [ A(0) = 2*sqrt(2)
;// A(u) = 4*cos(u*pi/16) for (u!=0) ]
;//
;// IStage 4: i0 = j0 i1 = j4
;// i3 = (j2+j6)/2 i2 = (j2-j6)/2
;// i7 = (j5+j3)/2 i4 = (j5-j3)/2
;// i5 = (j1+j7)/2 i6 = (j1-j7)/2
;//
;// IStage 3: h0 = (i0+i1)/2 h1 = (i0-i1)/2
;// h2 = (i2*sqrt2)-i3 h3 = i3
;// h4 = cos(pi/8)*i4 + sin(pi/8)*i6
;// h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
;// [ The above two lines rotate by -(pi/8) ]
;// h5 = (i5-i7)/sqrt2 h7 = (i5+i7)/2
;//
;// IStage 2: g0 = (h0+h3)/2 g3 = (h0-h3)/2
;// g1 = (h1+h2)/2 g2 = (h1-h2)/2
;// g7 = h7 g6 = h6 - h7
;// g5 = h5 - g6 g4 = h4 - g5
;//
;// IStage 1: f0 = (g0+g7)/2 f7 = (g0-g7)/2
;// f1 = (g1+g6)/2 f6 = (g1-g6)/2
;// f2 = (g2+g5)/2 f5 = (g2-g5)/2
;// f3 = (g3+g4)/2 f4 = (g3-g4)/2
;//
;// Note that most coefficients are halved 3 times during the
;// above calculation. We can rescale the algorithm dividing
;// the input by 8 to remove the halvings.
;//
;// IStage 5: j(u) = T(u)*A(u)/8
;//
;// IStage 4: i0 = j0 i1 = j4
;// i3 = j2 + j6 i2 = j2 - j6
;// i7 = j5 + j3 i4 = j5 - j3
;// i5 = j1 + j7 i6 = j1 - j7
;//
;// IStage 3: h0 = i0 + i1 h1 = i0 - i1
;// h2 = (i2*sqrt2)-i3 h3 = i3
;// h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
;// h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
;// h5 = (i5-i7)*sqrt2 h7 = i5 + i7
;//
;// IStage 2: g0 = h0 + h3 g3 = h0 - h3
;// g1 = h1 + h2 g2 = h1 - h2
;// g7 = h7 g6 = h6 - h7
;// g5 = h5 - g6 g4 = h4 - g5
;//
;// IStage 1: f0 = g0 + g7 f7 = g0 - g7
;// f1 = g1 + g6 f6 = g1 - g6
;// f2 = g2 + g5 f5 = g2 - g5
;// f3 = g3 + g4 f4 = g3 - g4
;//
;// Note:
;// 1. The scaling by A(u)/8 can often be combined with inverse
;// quantization. The column and row scalings can be combined.
;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
;// to the above code but is otherwise identical.
;// 3. The rotation by -pi/8 can be performed using three multiplies
;// Eg c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
;// -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
;// 4. If |T(u)|<=1 then from the IDCT definition,
;// |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
;// = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
;// = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
;// = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
;// = (approx)2.64
;// So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
;// The table below shows input patterns generating the maximum
;// value of |f(x)| for input in the range |T(x)|<=1. M=-1, P=+1
;// InputPattern Max |f(x)|
;// PPPPPPPP |f0| = 2.64
;// PPPMMMMM |f1| = 2.64
;// PPMMMPPP |f2| = 2.64
;// PPMMPPMM |f3| = 2.64
;// PMMPPMMP |f4| = 2.64
;// PMMPMMPM |f5| = 2.64
;// PMPPMPMP |f6| = 2.64
;// PMPMPMPM |f7| = 2.64
;// Note that this input pattern is the transpose of the
;// corresponding max input pattern for the FDCT.
;// Arguments
;// Argument registers (AAPCS core-register argument order r0-r3)
pSrc RN 0 ;// source data buffer (8x8 block of 16-bit coefficients)
Stride RN 1 ;// destination stride in bytes (used when $stride="s")
pDest RN 2 ;// destination data buffer
pScale RN 3 ;// pointer to aan-scale (dequant) table
;// DCT Inverse Macro
;// The DCT code should be parametrized according
;// to the following inputs:
;// $outsize = "u8" : 8-bit unsigned data saturated (0 to +255)
;// "s9" : 16-bit signed data saturated to 9-bit (-256 to +255)
;// "s16" : 16-bit signed data not saturated (max size ~+/-14273)
;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
;// "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
;//
;// Inputs:
;// pSrc = r0 = Pointer to input data
;// Range is -256 to +255 (9-bit)
;// Stride = r1 = Stride between input lines
;// pDest = r2 = Pointer to output data
;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
MACRO
M_IDCT $outsize, $inscale, $stride
;// Parameterised 8x8 inverse DCT (see the parameter description above the
;// macro): $outsize selects output saturation ("u8"/"s9"/"s16"), $inscale
;// selects the scale-table format ("s16" Q15 / "s32" Q23), and $stride is
;// either a literal destination stride in bytes or "s" to use the run-time
;// Stride register (r1).
LCLA SHIFT
IF ARM1136JS
;// ARMv6 SIMD implementation: processes two columns (one word = two
;// packed 16-bit values) per loop iteration, then two rows per iteration.
;// REGISTER ALLOCATION
;// This is hard since we have 8 values, 9 free registers and each
;// butterfly requires a temporary register. We also want to
;// maintain register order so we can use LDM/STM. The table below
;// summarises the register allocation that meets all these criteria.
;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
;//
;// r1 a01 g0 h0
;// r4 b01 f0 g1 h1 i0
;// r5 a23 f1 g2 i1
;// r6 b23 f2 g3 h2 i2
;// r7 a45 f3 h3 i3
;// r8 b45 f4 g4 h4 i4
;// r9 a67 f5 g5 h5 i5
;// r10 b67 f6 g6 h6 i6
;// r11 f7 g7 h7 i7
;//
ra01 RN 1
rb01 RN 4
ra23 RN 5
rb23 RN 6
ra45 RN 7
rb45 RN 8
ra67 RN 9
rb67 RN 10
rtmp RN 11
csPiBy8 RN 12 ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
LoopRR2 RN 14 ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
;// Transpose allocation
xft RN ra01
xf0 RN rb01
xf1 RN ra23
xf2 RN rb23
xf3 RN ra45
xf4 RN rb45
xf5 RN ra67
xf6 RN rb67
xf7 RN rtmp
;// IStage 1 allocation
xg0 RN xft
xg1 RN xf0
xg2 RN xf1
xg3 RN xf2
xgt RN xf3
xg4 RN xf4
xg5 RN xf5
xg6 RN xf6
xg7 RN xf7
;// IStage 2 allocation
xh0 RN xg0
xh1 RN xg1
xht RN xg2
xh2 RN xg3
xh3 RN xgt
xh4 RN xg4
xh5 RN xg5
xh6 RN xg6
xh7 RN xg7
;// IStage 3,4 allocation
xit RN xh0
xi0 RN xh1
xi1 RN xht
xi2 RN xh2
xi3 RN xh3
xi4 RN xh4
xi5 RN xh5
xi6 RN xh6
xi7 RN xh7
;// NOTE(review): M_STR/M_LDR/M_ADR and the ppDest/pStride/pBlk locals are
;// helper macros/workspace defined elsewhere in this component - confirm
;// against the including file.
M_STR pDest, ppDest
IF "$stride"="s"
M_STR Stride, pStride
ENDIF
M_ADR pDest, pBlk
LDR csPiBy8, =0x30fc7642 ;// [sin(pi/8), cos(pi/8)] in Q15
LDR LoopRR2, =0x00005a82 ;// loop count (top bits) : 1/sqrt(2) in Q15
;// Column pass: each iteration consumes one word (two columns) per row.
v6_idct_col$_F
;// Load even values
LDR xi4, [pSrc], #4 ;// j0
LDR xi5, [pSrc, #4*16-4] ;// j4
LDR xi6, [pSrc, #2*16-4] ;// j2
LDR xi7, [pSrc, #6*16-4] ;// j6
;// Scale Even Values
IF "$inscale"="s16" ;// 16x16 mul
SHIFT SETA 12
LDR xi0, [pScale], #4
LDR xi1, [pScale, #4*16-4]
LDR xi2, [pScale, #2*16-4]
MOV xit, #1<<(SHIFT-1) ;// rounding bias for the >>SHIFT below
SMLABB xi3, xi0, xi4, xit
SMLATT xi4, xi0, xi4, xit
SMLABB xi0, xi1, xi5, xit
SMLATT xi5, xi1, xi5, xit
MOV xi3, xi3, ASR #SHIFT
PKHBT xi4, xi3, xi4, LSL #(16-SHIFT)
LDR xi3, [pScale, #6*16-4]
SMLABB xi1, xi2, xi6, xit
SMLATT xi6, xi2, xi6, xit
MOV xi0, xi0, ASR #SHIFT
PKHBT xi5, xi0, xi5, LSL #(16-SHIFT)
SMLABB xi2, xi3, xi7, xit
SMLATT xi7, xi3, xi7, xit
MOV xi1, xi1, ASR #SHIFT
PKHBT xi6, xi1, xi6, LSL #(16-SHIFT)
MOV xi2, xi2, ASR #SHIFT
PKHBT xi7, xi2, xi7, LSL #(16-SHIFT)
ENDIF
IF "$inscale"="s32" ;// 32x16 mul
SHIFT SETA (12+8-16)
MOV xit, #1<<(SHIFT-1)
LDR xi0, [pScale], #8
LDR xi1, [pScale, #0*32+4-8]
LDR xi2, [pScale, #4*32-8]
LDR xi3, [pScale, #4*32+4-8]
SMLAWB xi0, xi0, xi4, xit
SMLAWT xi1, xi1, xi4, xit
SMLAWB xi2, xi2, xi5, xit
SMLAWT xi3, xi3, xi5, xit
MOV xi0, xi0, ASR #SHIFT
PKHBT xi4, xi0, xi1, LSL #(16-SHIFT)
MOV xi2, xi2, ASR #SHIFT
PKHBT xi5, xi2, xi3, LSL #(16-SHIFT)
LDR xi0, [pScale, #2*32-8]
LDR xi1, [pScale, #2*32+4-8]
LDR xi2, [pScale, #6*32-8]
LDR xi3, [pScale, #6*32+4-8]
SMLAWB xi0, xi0, xi6, xit
SMLAWT xi1, xi1, xi6, xit
SMLAWB xi2, xi2, xi7, xit
SMLAWT xi3, xi3, xi7, xit
MOV xi0, xi0, ASR #SHIFT
PKHBT xi6, xi0, xi1, LSL #(16-SHIFT)
MOV xi2, xi2, ASR #SHIFT
PKHBT xi7, xi2, xi3, LSL #(16-SHIFT)
ENDIF
;// Load odd values
LDR xi0, [pSrc, #1*16-4] ;// j1
LDR xi1, [pSrc, #7*16-4] ;// j7
LDR xi2, [pSrc, #5*16-4] ;// j5
LDR xi3, [pSrc, #3*16-4] ;// j3
IF {TRUE} ;// assembly-time switch: zero-odd shortcut enabled
;// shortcut if odd values 0
TEQ xi0, #0
TEQEQ xi1, #0
TEQEQ xi2, #0
TEQEQ xi3, #0
BEQ v6OddZero$_F
ENDIF
;// Store scaled even values
STMIA pDest, {xi4, xi5, xi6, xi7}
;// Scale odd values
IF "$inscale"="s16"
;// Perform AAN Scale
LDR xi4, [pScale, #1*16-4]
LDR xi5, [pScale, #7*16-4]
LDR xi6, [pScale, #5*16-4]
SMLABB xi7, xi0, xi4, xit
SMLATT xi0, xi0, xi4, xit
SMLABB xi4, xi1, xi5, xit
SMLATT xi1, xi1, xi5, xit
MOV xi7, xi7, ASR #SHIFT
PKHBT xi0, xi7, xi0, LSL #(16-SHIFT)
LDR xi7, [pScale, #3*16-4]
SMLABB xi5, xi2, xi6, xit
SMLATT xi2, xi2, xi6, xit
MOV xi4, xi4, ASR #SHIFT
PKHBT xi1, xi4, xi1, LSL #(16-SHIFT)
SMLABB xi6, xi3, xi7, xit
SMLATT xi3, xi3, xi7, xit
MOV xi5, xi5, ASR #SHIFT
PKHBT xi2, xi5, xi2, LSL #(16-SHIFT)
MOV xi6, xi6, ASR #SHIFT
PKHBT xi3, xi6, xi3, LSL #(16-SHIFT)
ENDIF
IF "$inscale"="s32" ;// 32x16 mul
LDR xi4, [pScale, #1*32-8]
LDR xi5, [pScale, #1*32+4-8]
LDR xi6, [pScale, #7*32-8]
LDR xi7, [pScale, #7*32+4-8]
SMLAWB xi4, xi4, xi0, xit
SMLAWT xi5, xi5, xi0, xit
SMLAWB xi6, xi6, xi1, xit
SMLAWT xi7, xi7, xi1, xit
MOV xi4, xi4, ASR #SHIFT
PKHBT xi0, xi4, xi5, LSL #(16-SHIFT)
MOV xi6, xi6, ASR #SHIFT
PKHBT xi1, xi6, xi7, LSL #(16-SHIFT)
LDR xi4, [pScale, #5*32-8]
LDR xi5, [pScale, #5*32+4-8]
LDR xi6, [pScale, #3*32-8]
LDR xi7, [pScale, #3*32+4-8]
SMLAWB xi4, xi4, xi2, xit
SMLAWT xi5, xi5, xi2, xit
SMLAWB xi6, xi6, xi3, xit
SMLAWT xi7, xi7, xi3, xit
MOV xi4, xi4, ASR #SHIFT
PKHBT xi2, xi4, xi5, LSL #(16-SHIFT)
MOV xi6, xi6, ASR #SHIFT
PKHBT xi3, xi6, xi7, LSL #(16-SHIFT)
ENDIF
;// IStage 4,3 for the odd terms (both columns at once, SIMD halfwords)
LDR xit, =0x00010001 ;// rounding constant
SADD16 xi5, xi0, xi1 ;// (j1+j7)/2
SHADD16 xi5, xi5, xit
SSUB16 xi6, xi0, xi1 ;// j1-j7
SADD16 xi7, xi2, xi3 ;// (j5+j3)/2
SHADD16 xi7, xi7, xit
SSUB16 xi4, xi2, xi3 ;// j5-j3
SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2
;// Rotation by -pi/8 via dual 16x16 dot products (note 3 above)
PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a
PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b
SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s]
SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s]
SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c]
SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c]
SMULBB xi1, xi3, LoopRR2 ;// (i5-i7)/2 * 1/sqrt(2), row a
SMULTB xi3, xi3, LoopRR2 ;// same, row b
PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4
PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4
SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4
;// xi0,xi1,xi2,xi3 now free
;// IStage 4,3, rows 2to3 x1/2
MOV xi3, xi3, LSL #1
PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4
LDRD xi0, [pDest, #8] ;// j2,j6 scaled
;// IStage 2, rows4to7
SSUB16 xg6, xh6, xh7
SSUB16 xg5, xh5, xg6
SSUB16 xg4, xh4, xg5
SSUB16 xi2, xi0, xi1 ;// (j2-j6)
SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2
SMULBB xi0, xi2, LoopRR2 ;// (j2-j6)*(1/sqrt2), row a
SMULTB xi2, xi2, LoopRR2 ;// same, row b
MOV xi2, xi2, LSL #1
PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
;// xi0, xi1 now free
;// IStage 4,3 rows 0to1 x 1/2
LDRD xi0, [pDest] ;// j0, j4 scaled
SSUB16 xh2, xh2, xi3
ADDS LoopRR2, xLoopRR2, #2<<29 ;// done two rows
SHADD16 xh0, xi0, xi1
SHSUB16 xh1, xi0, xi1
;// IStage 2 rows 0to3 x 1/2
SHSUB16 xg2, xh1, xh2
SHADD16 xg1, xh1, xh2
SHSUB16 xg3, xh0, xh3
SHADD16 xg0, xh0, xh3
;// IStage 1 all rows
SADD16 xf3, xg3, xg4
SSUB16 xf4, xg3, xg4
SADD16 xf2, xg2, xg5
SSUB16 xf5, xg2, xg5
SADD16 xf1, xg1, xg6
SSUB16 xf6, xg1, xg6
SADD16 xf0, xg0, xg7
SSUB16 xf7, xg0, xg7
;// Transpose, store and loop
PKHBT ra01, xf0, xf1, LSL #16
PKHTB rb01, xf1, xf0, ASR #16
PKHBT ra23, xf2, xf3, LSL #16
PKHTB rb23, xf3, xf2, ASR #16
PKHBT ra45, xf4, xf5, LSL #16
PKHTB rb45, xf5, xf4, ASR #16
PKHBT ra67, xf6, xf7, LSL #16
STMIA pDest!, {ra01, ra23, ra45, ra67}
PKHTB rb67, xf7, xf6, ASR #16
STMIA pDest!, {rb01, rb23, rb45, rb67}
BCC v6_idct_col$_F ;// carry set after 4 iterations (8 columns done)
SUB pSrc, pDest, #(64*2) ;// intermediate block is the new source
M_LDR pDest, ppDest
IF "$stride"="s"
M_LDR pScale, pStride
ENDIF
B v6_idct_row$_F
;// Shortcut taken when all four odd coefficients of this column pair are
;// zero: only the even terms contribute, so IStages collapse.
v6OddZero$_F
SSUB16 xi2, xi6, xi7 ;// (j2-j6)
SHADD16 xi3, xi6, xi7 ;// (j2+j6)/2
SMULBB xi0, xi2, LoopRR2
SMULTB xi2, xi2, LoopRR2
MOV xi2, xi2, LSL #1
PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
SSUB16 xh2, xh2, xi3
;// xi0, xi1 now free
;// IStage 4,3 rows 0to1 x 1/2
SHADD16 xh0, xi4, xi5
SHSUB16 xh1, xi4, xi5
;// IStage 2 rows 0to3 x 1/2
SHSUB16 xg2, xh1, xh2
SHADD16 xg1, xh1, xh2
SHSUB16 xg3, xh0, xh3
SHADD16 xg0, xh0, xh3
;// IStage 1 all rows (odd terms zero, so f(k) = f(7-k) = g(k))
MOV xf3, xg3
MOV xf4, xg3
MOV xf2, xg2
MOV xf5, xg2
MOV xf1, xg1
MOV xf6, xg1
MOV xf0, xg0
MOV xf7, xg0
;// Transpose
PKHBT ra01, xf0, xf1, LSL #16
PKHTB rb01, xf1, xf0, ASR #16
PKHBT ra23, xf2, xf3, LSL #16
PKHTB rb23, xf3, xf2, ASR #16
PKHBT ra45, xf4, xf5, LSL #16
PKHTB rb45, xf5, xf4, ASR #16
PKHBT ra67, xf6, xf7, LSL #16
PKHTB rb67, xf7, xf6, ASR #16
STMIA pDest!, {ra01, ra23, ra45, ra67}
ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
STMIA pDest!, {rb01, rb23, rb45, rb67}
BCC v6_idct_col$_F
SUB pSrc, pDest, #(64*2)
M_LDR pDest, ppDest
IF "$stride"="s"
M_LDR pScale, pStride
ENDIF
;// Row pass: same flowgraph on the transposed intermediate block, two
;// rows per iteration, with extra descaling/rounding of j6, j7 and DC.
v6_idct_row$_F
;// IStage 4,3, rows4to7 x1/4
LDR xit, =0x00010001 ;// rounding constant
LDR xi0, [pSrc, #1*16] ;// j1
LDR xi1, [pSrc, #7*16] ;// 4*j7
LDR xi2, [pSrc, #5*16] ;// j5
LDR xi3, [pSrc, #3*16] ;// j3
SHADD16 xi1, xi1, xit ;// 2*j7
SHADD16 xi1, xi1, xit ;// j7
SHADD16 xi5, xi0, xi1 ;// (j1+j7)/2
SSUB16 xi6, xi0, xi1 ;// j1-j7
SHADD16 xi7, xi2, xi3 ;// (j5+j3)/2
SSUB16 xi4, xi2, xi3 ;// j5-j3
SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2
PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a
PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b
SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s]
SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s]
SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c]
SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c]
SMULBB xi1, xi3, LoopRR2
SMULTB xi3, xi3, LoopRR2
PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4
PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4
SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4
MOV xi3, xi3, LSL #1
PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4
;// xi0,xi1,xi2,xi3 now free
;// IStage 4,3, rows 2to3 x1/2
LDR xi0, [pSrc, #2*16] ;// j2
LDR xi1, [pSrc, #6*16] ;// 2*j6
;// IStage 2, rows4to7
SSUB16 xg6, xh6, xh7
SSUB16 xg5, xh5, xg6
SSUB16 xg4, xh4, xg5
SHADD16 xi1, xi1, xit ;// j6
SSUB16 xi2, xi0, xi1 ;// (j2-j6)
SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2
SMULBB xi0, xi2, LoopRR2
SMULTB xi2, xi2, LoopRR2
MOV xi2, xi2, LSL #1
PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
;// xi0, xi1 now free
;// IStage 4,3 rows 0to1 x 1/2
LDR xi1, [pSrc, #4*16] ;// j4
LDR xi0, [pSrc], #4 ;// j0
SSUB16 xh2, xh2, xi3
ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
ADD xi0, xi0, xit, LSL #2 ;// ensure correct round
SHADD16 xh0, xi0, xi1 ;// of DC result
SHSUB16 xh1, xi0, xi1
;// IStage 2 rows 0to3 x 1/2
SHSUB16 xg2, xh1, xh2
SHADD16 xg1, xh1, xh2
SHSUB16 xg3, xh0, xh3
SHADD16 xg0, xh0, xh3
;// IStage 1 all rows
SHADD16 xf3, xg3, xg4
SHSUB16 xf4, xg3, xg4
SHADD16 xf2, xg2, xg5
SHSUB16 xf5, xg2, xg5
SHADD16 xf1, xg1, xg6
SHSUB16 xf6, xg1, xg6
SHADD16 xf0, xg0, xg7
SHSUB16 xf7, xg0, xg7
;// Saturate
IF ("$outsize"="u8")
USAT16 xf0, #8, xf0
USAT16 xf1, #8, xf1
USAT16 xf2, #8, xf2
USAT16 xf3, #8, xf3
USAT16 xf4, #8, xf4
USAT16 xf5, #8, xf5
USAT16 xf6, #8, xf6
USAT16 xf7, #8, xf7
ENDIF
IF ("$outsize"="s9")
SSAT16 xf0, #9, xf0
SSAT16 xf1, #9, xf1
SSAT16 xf2, #9, xf2
SSAT16 xf3, #9, xf3
SSAT16 xf4, #9, xf4
SSAT16 xf5, #9, xf5
SSAT16 xf6, #9, xf6
SSAT16 xf7, #9, xf7
ENDIF
;// Transpose to Row, Pack and store
IF ("$outsize"="u8")
ORR xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
ORR xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
ORR xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
ORR xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
PKHBT ra01, xf0, xf2, LSL #16
PKHTB rb01, xf2, xf0, ASR #16
PKHBT ra23, xf4, xf6, LSL #16
PKHTB rb23, xf6, xf4, ASR #16
STMIA pDest, {ra01, ra23}
IF "$stride"="s"
ADD pDest, pDest, pScale ;// pScale now holds the reloaded Stride
STMIA pDest, {rb01, rb23}
ADD pDest, pDest, pScale
ELSE
ADD pDest, pDest, #($stride)
STMIA pDest, {rb01, rb23}
ADD pDest, pDest, #($stride)
ENDIF
ENDIF
IF ("$outsize"="s9"):LOR:("$outsize"="s16")
PKHBT ra01, xf0, xf1, LSL #16
PKHTB rb01, xf1, xf0, ASR #16
PKHBT ra23, xf2, xf3, LSL #16
PKHTB rb23, xf3, xf2, ASR #16
PKHBT ra45, xf4, xf5, LSL #16
PKHTB rb45, xf5, xf4, ASR #16
PKHBT ra67, xf6, xf7, LSL #16
PKHTB rb67, xf7, xf6, ASR #16
STMIA pDest, {ra01, ra23, ra45, ra67}
IF "$stride"="s"
ADD pDest, pDest, pScale
STMIA pDest, {rb01, rb23, rb45, rb67}
ADD pDest, pDest, pScale
ELSE
ADD pDest, pDest, #($stride)
STMIA pDest, {rb01, rb23, rb45, rb67}
ADD pDest, pDest, #($stride)
ENDIF
ENDIF
BCC v6_idct_row$_F
ENDIF ;// ARM1136JS
IF CortexA8
;// Cortex-A8 NEON implementation: the whole 8x8 block is held in the q
;// registers (one q register per row), so no loop is needed.
Src0 EQU 7
Src1 EQU 8
Src2 EQU 9
Src3 EQU 10
Src4 EQU 11
Src5 EQU 12
Src6 EQU 13
Src7 EQU 14
Tmp EQU 15
qXj0 QN Src0.S16
qXj1 QN Src1.S16
qXj2 QN Src2.S16
qXj3 QN Src3.S16
qXj4 QN Src4.S16
qXj5 QN Src5.S16
qXj6 QN Src6.S16
qXj7 QN Src7.S16
qXjt QN Tmp.S16
dXj0lo DN (Src0*2).S16
dXj0hi DN (Src0*2+1).S16
dXj1lo DN (Src1*2).S16
dXj1hi DN (Src1*2+1).S16
dXj2lo DN (Src2*2).S16
dXj2hi DN (Src2*2+1).S16
dXj3lo DN (Src3*2).S16
dXj3hi DN (Src3*2+1).S16
dXj4lo DN (Src4*2).S16
dXj4hi DN (Src4*2+1).S16
dXj5lo DN (Src5*2).S16
dXj5hi DN (Src5*2+1).S16
dXj6lo DN (Src6*2).S16
dXj6hi DN (Src6*2+1).S16
dXj7lo DN (Src7*2).S16
dXj7hi DN (Src7*2+1).S16
dXjtlo DN (Tmp*2).S16
dXjthi DN (Tmp*2+1).S16
;// Column-pass register aliasing: i/h/g/f stages reuse the j registers.
qXi0 QN qXj0
qXi1 QN qXj4
qXi2 QN qXj2
qXi3 QN qXj7
qXi4 QN qXj5
qXi5 QN qXjt
qXi6 QN qXj1
qXi7 QN qXj6
qXit QN qXj3
dXi0lo DN dXj0lo
dXi0hi DN dXj0hi
dXi1lo DN dXj4lo
dXi1hi DN dXj4hi
dXi2lo DN dXj2lo
dXi2hi DN dXj2hi
dXi3lo DN dXj7lo
dXi3hi DN dXj7hi
dXi4lo DN dXj5lo
dXi4hi DN dXj5hi
dXi5lo DN dXjtlo
dXi5hi DN dXjthi
dXi6lo DN dXj1lo
dXi6hi DN dXj1hi
dXi7lo DN dXj6lo
dXi7hi DN dXj6hi
dXitlo DN dXj3lo
dXithi DN dXj3hi
qXh0 QN qXit
qXh1 QN qXi0
qXh2 QN qXi2
qXh3 QN qXi3
qXh4 QN qXi7
qXh5 QN qXi5
qXh6 QN qXi4
qXh7 QN qXi1
qXht QN qXi6
dXh0lo DN dXitlo
dXh0hi DN dXithi
dXh1lo DN dXi0lo
dXh1hi DN dXi0hi
dXh2lo DN dXi2lo
dXh2hi DN dXi2hi
dXh3lo DN dXi3lo
dXh3hi DN dXi3hi
dXh4lo DN dXi7lo
dXh4hi DN dXi7hi
dXh5lo DN dXi5lo
dXh5hi DN dXi5hi
dXh6lo DN dXi4lo
dXh6hi DN dXi4hi
dXh7lo DN dXi1lo
dXh7hi DN dXi1hi
dXhtlo DN dXi6lo
dXhthi DN dXi6hi
qXg0 QN qXh2
qXg1 QN qXht
qXg2 QN qXh1
qXg3 QN qXh0
qXg4 QN qXh4
qXg5 QN qXh5
qXg6 QN qXh6
qXg7 QN qXh7
qXgt QN qXh3
qXf0 QN qXg6
qXf1 QN qXg5
qXf2 QN qXg4
qXf3 QN qXgt
qXf4 QN qXg3
qXf5 QN qXg2
qXf6 QN qXg1
qXf7 QN qXg0
qXft QN qXg7
qXt0 QN 1.S32
qXt1 QN 2.S32
qT0lo QN 1.S32
qT0hi QN 2.S32
qT1lo QN 3.S32
qT1hi QN 4.S32
qScalelo QN 5.S32 ;// used to read post scale values
qScalehi QN 6.S32
qTemp0 QN 5.S32
qTemp1 QN 6.S32
Scale1 EQU 6
Scale2 EQU 15
qScale1 QN Scale1.S16
qScale2 QN Scale2.S16
dScale1lo DN (Scale1*2).S16
dScale1hi DN (Scale1*2+1).S16
dScale2lo DN (Scale2*2).S16
dScale2hi DN (Scale2*2+1).S16
dCoefs DN 0.S16 ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
InvSqrt2 DN dCoefs[0] ;// 1/sqrt(2) in Q15
S DN dCoefs[1] ;// Sin(PI/8) in Q15
C DN dCoefs[2] ;// Cos(PI/8) in Q15
pTemp RN 12
;// NOTE(review): dCoefs appears to be loaded from armCOMM_IDCTCoef inside
;// the M_IDCT_PRESCALE16/32 helper macros (defined elsewhere) - confirm.
IMPORT armCOMM_IDCTCoef
VLD1 {qXj0,qXj1}, [pSrc @64]!
VLD1 {qXj2,qXj3}, [pSrc @64]!
VLD1 {qXj4,qXj5}, [pSrc @64]!
VLD1 {qXj6,qXj7}, [pSrc @64]!
;// Load PreScale and multiply with Src
;// IStage 4
IF "$inscale"="s16" ;// 16X16 Mul
M_IDCT_PRESCALE16
ENDIF
IF "$inscale"="s32" ;// 32X32 mul
M_IDCT_PRESCALE32
ENDIF
;// IStage 3
VQDMULH qXi2, qXi2, InvSqrt2 ;// i2/sqrt(2)
VHADD qXh0, qXi0, qXi1 ;// (i0+i1)/2
VHSUB qXh1, qXi0, qXi1 ;// (i0-i1)/2
VHADD qXh7, qXi5, qXi7 ;// (i5+i7)/4
VSUB qXh5, qXi5, qXi7 ;// (i5-i7)/2
VQDMULH qXh5, qXh5, InvSqrt2 ;// h5/sqrt(2)
VSUB qXh2, qXi2, qXi3 ;// h2, h3
VMULL qXt0, dXi4lo, C ;// c*i4
VMLAL qXt0, dXi6lo, S ;// c*i4+s*i6
VMULL qXt1, dXi4hi, C
VMLAL qXt1, dXi6hi, S
VSHRN dXh4lo, qXt0, #16 ;// h4
VSHRN dXh4hi, qXt1, #16
VMULL qXt0, dXi6lo, C ;// c*i6
VMLSL qXt0, dXi4lo, S ;// -s*i4 + c*h6
VMULL qXt1, dXi6hi, C
VMLSL qXt1, dXi4hi, S
VSHRN dXh6lo, qXt0, #16 ;// h6
VSHRN dXh6hi, qXt1, #16
;// IStage 2
VSUB qXg6, qXh6, qXh7
VSUB qXg5, qXh5, qXg6
VSUB qXg4, qXh4, qXg5
VHADD qXg1, qXh1, qXh2 ;// (h1+h2)/2
VHSUB qXg2, qXh1, qXh2 ;// (h1-h2)/2
VHADD qXg0, qXh0, qXh3 ;// (h0+h3)/2
VHSUB qXg3, qXh0, qXh3 ;// (h0-h3)/2
;// IStage 1 all rows
VADD qXf3, qXg3, qXg4
VSUB qXf4, qXg3, qXg4
VADD qXf2, qXg2, qXg5
VSUB qXf5, qXg2, qXg5
VADD qXf1, qXg1, qXg6
VSUB qXf6, qXg1, qXg6
VADD qXf0, qXg0, qXg7
VSUB qXf7, qXg0, qXg7
;// Transpose, store and loop
XTR0 EQU Src5
XTR1 EQU Tmp
XTR2 EQU Src6
XTR3 EQU Src7
XTR4 EQU Src3
XTR5 EQU Src0
XTR6 EQU Src1
XTR7 EQU Src2
XTRt EQU Src4
qA0 QN XTR0.S32 ;// for XTRpose
qA1 QN XTR1.S32
qA2 QN XTR2.S32
qA3 QN XTR3.S32
qA4 QN XTR4.S32
qA5 QN XTR5.S32
qA6 QN XTR6.S32
qA7 QN XTR7.S32
dB0 DN XTR0*2+1 ;// for using VSWP
dB1 DN XTR1*2+1
dB2 DN XTR2*2+1
dB3 DN XTR3*2+1
dB4 DN XTR4*2
dB5 DN XTR5*2
dB6 DN XTR6*2
dB7 DN XTR7*2
;// 8x8 transpose: VTRN on 16-bit, then 32-bit pairs, then VSWP of D halves
VTRN qXf0, qXf1
VTRN qXf2, qXf3
VTRN qXf4, qXf5
VTRN qXf6, qXf7
VTRN qA0, qA2
VTRN qA1, qA3
VTRN qA4, qA6
VTRN qA5, qA7
VSWP dB0, dB4
VSWP dB1, dB5
VSWP dB2, dB6
VSWP dB3, dB7
;// Row-pass register aliasing (Y names) over the transposed data.
qYj0 QN qXf0
qYj1 QN qXf1
qYj2 QN qXf2
qYj3 QN qXf3
qYj4 QN qXf4
qYj5 QN qXf5
qYj6 QN qXf6
qYj7 QN qXf7
qYjt QN qXft
dYj0lo DN (XTR0*2).S16
dYj0hi DN (XTR0*2+1).S16
dYj1lo DN (XTR1*2).S16
dYj1hi DN (XTR1*2+1).S16
dYj2lo DN (XTR2*2).S16
dYj2hi DN (XTR2*2+1).S16
dYj3lo DN (XTR3*2).S16
dYj3hi DN (XTR3*2+1).S16
dYj4lo DN (XTR4*2).S16
dYj4hi DN (XTR4*2+1).S16
dYj5lo DN (XTR5*2).S16
dYj5hi DN (XTR5*2+1).S16
dYj6lo DN (XTR6*2).S16
dYj6hi DN (XTR6*2+1).S16
dYj7lo DN (XTR7*2).S16
dYj7hi DN (XTR7*2+1).S16
dYjtlo DN (XTRt*2).S16
dYjthi DN (XTRt*2+1).S16
qYi0 QN qYj0
qYi1 QN qYj4
qYi2 QN qYj2
qYi3 QN qYj7
qYi4 QN qYj5
qYi5 QN qYjt
qYi6 QN qYj1
qYi7 QN qYj6
qYit QN qYj3
dYi0lo DN dYj0lo
dYi0hi DN dYj0hi
dYi1lo DN dYj4lo
dYi1hi DN dYj4hi
dYi2lo DN dYj2lo
dYi2hi DN dYj2hi
dYi3lo DN dYj7lo
dYi3hi DN dYj7hi
dYi4lo DN dYj5lo
dYi4hi DN dYj5hi
dYi5lo DN dYjtlo
dYi5hi DN dYjthi
dYi6lo DN dYj1lo
dYi6hi DN dYj1hi
dYi7lo DN dYj6lo
dYi7hi DN dYj6hi
dYitlo DN dYj3lo
dYithi DN dYj3hi
qYh0 QN qYit
qYh1 QN qYi0
qYh2 QN qYi2
qYh3 QN qYi3
qYh4 QN qYi7
qYh5 QN qYi5
qYh6 QN qYi4
qYh7 QN qYi1
qYht QN qYi6
dYh0lo DN dYitlo
dYh0hi DN dYithi
dYh1lo DN dYi0lo
dYh1hi DN dYi0hi
dYh2lo DN dYi2lo
dYh2hi DN dYi2hi
dYh3lo DN dYi3lo
dYh3hi DN dYi3hi
dYh4lo DN dYi7lo
dYh4hi DN dYi7hi
dYh5lo DN dYi5lo
dYh5hi DN dYi5hi
dYh6lo DN dYi4lo
dYh6hi DN dYi4hi
dYh7lo DN dYi1lo
dYh7hi DN dYi1hi
dYhtlo DN dYi6lo
dYhthi DN dYi6hi
qYg0 QN qYh2
qYg1 QN qYht
qYg2 QN qYh1
qYg3 QN qYh0
qYg4 QN qYh4
qYg5 QN qYh5
qYg6 QN qYh6
qYg7 QN qYh7
qYgt QN qYh3
qYf0 QN qYg6
qYf1 QN qYg5
qYf2 QN qYg4
qYf3 QN qYgt
qYf4 QN qYg3
qYf5 QN qYg2
qYf6 QN qYg1
qYf7 QN qYg0
qYft QN qYg7
;// Row pass (IStage 4 with rounding descale of j7, j6)
VRSHR qYj7, qYj7, #2
VRSHR qYj6, qYj6, #1
VHADD qYi5, qYj1, qYj7 ;// i5 = (j1+j7)/2
VSUB qYi6, qYj1, qYj7 ;// i6 = j1-j7
VHADD qYi3, qYj2, qYj6 ;// i3 = (j2+j6)/2
VSUB qYi2, qYj2, qYj6 ;// i2 = j2-j6
VHADD qYi7, qYj5, qYj3 ;// i7 = (j5+j3)/2
VSUB qYi4, qYj5, qYj3 ;// i4 = j5-j3
VQDMULH qYi2, qYi2, InvSqrt2 ;// i2/sqrt(2)
;// IStage 4,3 rows 0to1 x 1/2
MOV pTemp, #0x4 ;// ensure correct round
VDUP qScale1, pTemp ;// of DC result
VADD qYi0, qYi0, qScale1
VHADD qYh0, qYi0, qYi1 ;// (i0+i1)/2
VHSUB qYh1, qYi0, qYi1 ;// (i0-i1)/2
VHADD qYh7, qYi5, qYi7 ;// (i5+i7)/4
VSUB qYh5, qYi5, qYi7 ;// (i5-i7)/2
VSUB qYh2, qYi2, qYi3 ;// h2, h3
VQDMULH qYh5, qYh5, InvSqrt2 ;// h5/sqrt(2)
VMULL qXt0, dYi4lo, C ;// c*i4
VMLAL qXt0, dYi6lo, S ;// c*i4+s*i6
VMULL qXt1, dYi4hi, C
VMLAL qXt1, dYi6hi, S
VSHRN dYh4lo, qXt0, #16 ;// h4
VSHRN dYh4hi, qXt1, #16
VMULL qXt0, dYi6lo, C ;// c*i6
VMLSL qXt0, dYi4lo, S ;// -s*i4 + c*h6
VMULL qXt1, dYi6hi, C
VMLSL qXt1, dYi4hi, S
VSHRN dYh6lo, qXt0, #16 ;// h6
VSHRN dYh6hi, qXt1, #16
VSUB qYg6, qYh6, qYh7
VSUB qYg5, qYh5, qYg6
VSUB qYg4, qYh4, qYg5
;// IStage 2 rows 0to3 x 1/2
VHADD qYg1, qYh1, qYh2 ;// (h1+h2)/2
VHSUB qYg2, qYh1, qYh2 ;// (h1-h2)/2
VHADD qYg0, qYh0, qYh3 ;// (h0+h3)/2
VHSUB qYg3, qYh0, qYh3 ;// (h0-h3)/2
;// IStage 1 all rows
VHADD qYf3, qYg3, qYg4
VHSUB qYf4, qYg3, qYg4
VHADD qYf2, qYg2, qYg5
VHSUB qYf5, qYg2, qYg5
VHADD qYf1, qYg1, qYg6
VHSUB qYf6, qYg1, qYg6
VHADD qYf0, qYg0, qYg7
VHSUB qYf7, qYg0, qYg7
YTR0 EQU Src0
YTR1 EQU Src4
YTR2 EQU Src1
YTR3 EQU Src2
YTR4 EQU Src7
YTR5 EQU Src5
YTR6 EQU Tmp
YTR7 EQU Src6
YTRt EQU Src3
qC0 QN YTR0.S32 ;// for YTRpose
qC1 QN YTR1.S32
qC2 QN YTR2.S32
qC3 QN YTR3.S32
qC4 QN YTR4.S32
qC5 QN YTR5.S32
qC6 QN YTR6.S32
qC7 QN YTR7.S32
dD0 DN YTR0*2+1 ;// for using VSWP
dD1 DN YTR1*2+1
dD2 DN YTR2*2+1
dD3 DN YTR3*2+1
dD4 DN YTR4*2
dD5 DN YTR5*2
dD6 DN YTR6*2
dD7 DN YTR7*2
;// Final transpose back to row order
VTRN qYf0, qYf1
VTRN qYf2, qYf3
VTRN qYf4, qYf5
VTRN qYf6, qYf7
VTRN qC0, qC2
VTRN qC1, qC3
VTRN qC4, qC6
VTRN qC5, qC7
VSWP dD0, dD4
VSWP dD1, dD5
VSWP dD2, dD6
VSWP dD3, dD7
dYf0U8 DN YTR0*2.U8
dYf1U8 DN YTR1*2.U8
dYf2U8 DN YTR2*2.U8
dYf3U8 DN YTR3*2.U8
dYf4U8 DN YTR4*2.U8
dYf5U8 DN YTR5*2.U8
dYf6U8 DN YTR6*2.U8
dYf7U8 DN YTR7*2.U8
;//
;// Do saturation if outsize is other than S16
;//
IF ("$outsize"="u8")
;// Output range [0-255]
VQMOVN dYf0U8, qYf0
VQMOVN dYf1U8, qYf1
VQMOVN dYf2U8, qYf2
VQMOVN dYf3U8, qYf3
VQMOVN dYf4U8, qYf4
VQMOVN dYf5U8, qYf5
VQMOVN dYf6U8, qYf6
VQMOVN dYf7U8, qYf7
ENDIF
IF ("$outsize"="s9")
;// Output range [-256 to +255]: saturating shift up, arithmetic shift back
VQSHL qYf0, qYf0, #16-9
VQSHL qYf1, qYf1, #16-9
VQSHL qYf2, qYf2, #16-9
VQSHL qYf3, qYf3, #16-9
VQSHL qYf4, qYf4, #16-9
VQSHL qYf5, qYf5, #16-9
VQSHL qYf6, qYf6, #16-9
VQSHL qYf7, qYf7, #16-9
VSHR qYf0, qYf0, #16-9
VSHR qYf1, qYf1, #16-9
VSHR qYf2, qYf2, #16-9
VSHR qYf3, qYf3, #16-9
VSHR qYf4, qYf4, #16-9
VSHR qYf5, qYf5, #16-9
VSHR qYf6, qYf6, #16-9
VSHR qYf7, qYf7, #16-9
ENDIF
;// Store output depending on the Stride size
IF "$stride"="s"
VST1 qYf0, [pDest @64], Stride
VST1 qYf1, [pDest @64], Stride
VST1 qYf2, [pDest @64], Stride
VST1 qYf3, [pDest @64], Stride
VST1 qYf4, [pDest @64], Stride
VST1 qYf5, [pDest @64], Stride
VST1 qYf6, [pDest @64], Stride
VST1 qYf7, [pDest @64]
ELSE
IF ("$outsize"="u8")
VST1 dYf0U8, [pDest @64], #8
VST1 dYf1U8, [pDest @64], #8
VST1 dYf2U8, [pDest @64], #8
VST1 dYf3U8, [pDest @64], #8
VST1 dYf4U8, [pDest @64], #8
VST1 dYf5U8, [pDest @64], #8
VST1 dYf6U8, [pDest @64], #8
VST1 dYf7U8, [pDest @64]
ELSE
;// ("$outsize"="s9") or ("$outsize"="s16")
VST1 qYf0, [pDest @64], #16
VST1 qYf1, [pDest @64], #16
VST1 qYf2, [pDest @64], #16
VST1 qYf3, [pDest @64], #16
VST1 qYf4, [pDest @64], #16
VST1 qYf5, [pDest @64], #16
VST1 qYf6, [pDest @64], #16
VST1 qYf7, [pDest @64]
ENDIF
ENDIF
ENDIF ;// CortexA8
MEND
;// Scale TWO input rows with TWO rows of 16 bit scale values
;//
;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
;// input (sixteen input values) with two rows of scale values. It also
;// loads the next scale values from pScale, if the $LastRow flag is not set.
;//
;// Input Registers:
;//
;// $dAlo - Input D register with first four S16 values of row n
;// $dAhi - Input D register with next four S16 values of row n
;// $dBlo - Input D register with first four S16 values of row n+1
;// $dBhi - Input D register with next four S16 values of row n+1
;// pScale - Pointer to next row of scale values
;// qT0lo - Temporary scratch register
;// qT0hi - Temporary scratch register
;// qT1lo - Temporary scratch register
;// qT1hi - Temporary scratch register
;// dScale1lo - Scale value of row n
;// dScale1hi - Scale value of row n
;// dScale2lo - Scale value of row n+1
;// dScale2hi - Scale value of row n+1
;//
;// Input Flag
;//
;// $LastRow - Flag to indicate whether current row is last row
;//
;// Output Registers:
;//
;// $dAlo - Scaled output values (first four S16 of row n)
;// $dAhi - Scaled output values (next four S16 of row n)
;// $dBlo - Scaled output values (first four S16 of row n+1)
;// $dBhi - Scaled output values (next four S16 of row n+1)
;// qScale1 - Scale values for next row
;// qScale2 - Scale values for next row+1
;// pScale - Pointer to next row of scale values
;//
MACRO
M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
;// Widening multiplies: S16 input x S16 scale -> S32 products in qT0/qT1
VMULL qT0lo, $dAlo, dScale1lo ;// row n:   first four values x scale
VMULL qT0hi, $dAhi, dScale1hi ;// row n:   next four values x scale
VMULL qT1lo, $dBlo, dScale2lo ;// row n+1: first four values x scale
VMULL qT1hi, $dBhi, dScale2hi ;// row n+1: next four values x scale
IF "$LastRow"="0"
VLD1 qScale1, [pScale], #16 ;// Load scale values for row n+2
VLD1 qScale2, [pScale], #16 ;// Load scale values for row n+3
ENDIF
;// Saturating rounded narrowing shift: S32 product >> 12 back to S16
;// (assumes the 16-bit scale values are Q12 fixed-point - TODO confirm)
VQRSHRN $dAlo, qT0lo, #12
VQRSHRN $dAhi, qT0hi, #12
VQRSHRN $dBlo, qT1lo, #12
VQRSHRN $dBhi, qT1hi, #12
MEND
;// Scale 8x8 block input values with 16 bit scale values
;//
;// This macro is used to pre-scale a block of 8x8 input values.
;// It also performs the 1st stage transformations of the IDCT.
;//
;// Input Registers:
;//
;// dXjnlo - n th input D register with first four S16 values
;// dXjnhi - n th input D register with next four S16 values
;// qXjn - n th input Q register with eight S16 values
;// pScale - Pointer to scale values
;//
;// Output Registers:
;//
;// qXin - n th output Q register with eight S16 output values of 1st stage
;//
MACRO
M_IDCT_PRESCALE16
;// Pre-scale all eight rows, two at a time. The final call passes
;// $LastRow=1 so no scale values are loaded past the end of the table.
VLD1 qScale1, [pScale], #16 ;// Load Pre scale for row 0
VLD1 qScale2, [pScale], #16 ;// Load Pre scale for row 1
M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0 ;// Pre scale row 0 & 1
M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0 ;// Pre scale row 2 & 3
M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0 ;// Pre scale row 4 & 5
M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1 ;// Pre scale row 6 & 7 (last pair)
;// 1st stage butterflies on the scaled rows, interleaved with the
;// load of the AAN coefficient table to hide latency.
VHADD qXi5, qXj1, qXj7 ;// (j1+j7)/2
VSUB qXi6, qXj1, qXj7 ;// j1-j7
LDR pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
VHADD qXi3, qXj2, qXj6 ;// (j2+j6)/2
VSUB qXi2, qXj2, qXj6 ;// j2-j6
VLDR dCoefs, [pSrc] ;// Load DCT inverse AAN constants
VHADD qXi7, qXj5, qXj3 ;// (j5+j3)/2
VSUB qXi4, qXj5, qXj3 ;// j5-j3
MEND
;// Scale 8x8 block input values with 32 bit scale values
;//
;// This macro is used to pre-scale a block of 8x8 input values.
;// It also performs the 1st stage transformations of the IDCT.
;//
;// Input Registers:
;//
;// dXjnlo - n th input D register with first four S16 values
;// dXjnhi - n th input D register with next four S16 values
;// qXjn - n th input Q register with eight S16 values
;// pScale - Pointer to 32bit scale values in Q23 format
;//
;// Output Registers:
;//
;// dXinlo - n th output D register with first four S16 output values of 1st stage
;// dXinhi - n th output D register with next four S16 output values of 1st stage
;//
MACRO
M_IDCT_PRESCALE32
;// Q register aliases. Only two rows are in flight at a time, so a
;// small set of physical registers is reused under many logical names:
;// Q0-Q3 carry scale values, Q4-Q7 carry widened source values.
qScale0lo QN 0.S32
qScale0hi QN 1.S32
qScale1lo QN 2.S32
qScale1hi QN 3.S32
qScale2lo QN qScale1lo
qScale2hi QN qScale1hi
qScale3lo QN qScale1lo
qScale3hi QN qScale1hi
qScale4lo QN qScale1lo
qScale4hi QN qScale1hi
qScale5lo QN qScale0lo
qScale5hi QN qScale0hi
qScale6lo QN qScale0lo
qScale6hi QN qScale0hi
qScale7lo QN qScale0lo
qScale7hi QN qScale0hi
qSrc0lo QN 4.S32
qSrc0hi QN 5.S32
qSrc1lo QN 6.S32
qSrc1hi QN 7.S32 ;// FIX: was "Src4.S32" (undefined name, cannot assemble); Q7 completes the Q4-Q7 scratch set
qSrc2lo QN qSrc0lo
qSrc2hi QN qSrc0hi
qSrc3lo QN qSrc0lo
qSrc3hi QN qSrc0hi
qSrc4lo QN qSrc0lo
qSrc4hi QN qSrc0hi
qSrc5lo QN qSrc1lo
qSrc5hi QN qSrc1hi
qSrc6lo QN qSrc1lo
qSrc6hi QN qSrc1hi
qSrc7lo QN qSrc0lo
qSrc7hi QN qSrc0hi
qRes17lo QN qScale0lo
qRes17hi QN qScale0hi
qRes26lo QN qScale0lo
qRes26hi QN qScale0hi
qRes53lo QN qScale0lo
qRes53hi QN qScale0hi
;// pScale walks forward over scale rows 0..4 while pTemp walks
;// backward from row 7 (one row = 8 values * 4 bytes = 32 bytes).
ADD pTemp, pScale, #4*8*7 ;// Address of pScale[7]
;// Row 0
VLD1 {qScale0lo, qScale0hi}, [pScale]!
VSHLL qSrc0lo, dXj0lo, #(12-1) ;// Widen S16->S32, pre-shift for the Q23 multiply
VSHLL qSrc0hi, dXj0hi, #(12-1)
VLD1 {qScale1lo, qScale1hi}, [pScale]!
VQRDMULH qSrc0lo, qScale0lo, qSrc0lo ;// Saturating doubling high-half multiply by Q23 scale
VQRDMULH qSrc0hi, qScale0hi, qSrc0hi
VLD1 {qScale7lo, qScale7hi}, [pTemp]!
VSHLL qSrc1lo, dXj1lo, #(12-1)
VSHLL qSrc1hi, dXj1hi, #(12-1)
VMOVN dXi0lo, qSrc0lo ;// Output i0
VMOVN dXi0hi, qSrc0hi
VSHLL qSrc7lo, dXj7lo, #(12-1)
VSHLL qSrc7hi, dXj7hi, #(12-1)
SUB pTemp, pTemp, #((16*2)+(4*8*1)) ;// Undo post-increment and step back one row -> pScale[6]
VQRDMULH qSrc1lo, qScale1lo, qSrc1lo
VQRDMULH qSrc1hi, qScale1hi, qSrc1hi
VQRDMULH qSrc7lo, qScale7lo, qSrc7lo
VQRDMULH qSrc7hi, qScale7hi, qSrc7hi
VLD1 {qScale2lo, qScale2hi}, [pScale]!
;// Row 1 & 7: 1st stage butterfly on the scaled values
VHADD qRes17lo, qSrc1lo, qSrc7lo ;// (j1+j7)/2
VHADD qRes17hi, qSrc1hi, qSrc7hi ;// (j1+j7)/2
VMOVN dXi5lo, qRes17lo ;// Output i5
VMOVN dXi5hi, qRes17hi
VSUB qRes17lo, qSrc1lo, qSrc7lo ;// j1-j7
VSUB qRes17hi, qSrc1hi, qSrc7hi ;// j1-j7
VMOVN dXi6lo, qRes17lo ;// Output i6
VMOVN dXi6hi, qRes17hi
VSHLL qSrc2lo, dXj2lo, #(12-1)
VSHLL qSrc2hi, dXj2hi, #(12-1)
VLD1 {qScale6lo, qScale6hi}, [pTemp]!
VSHLL qSrc6lo, dXj6lo, #(12-1)
VSHLL qSrc6hi, dXj6hi, #(12-1)
SUB pTemp, pTemp, #((16*2)+(4*8*1)) ;// Step pTemp back to pScale[5]
VQRDMULH qSrc2lo, qScale2lo, qSrc2lo
VQRDMULH qSrc2hi, qScale2hi, qSrc2hi
VQRDMULH qSrc6lo, qScale6lo, qSrc6lo
VQRDMULH qSrc6hi, qScale6hi, qSrc6hi
VLD1 {qScale3lo, qScale3hi}, [pScale]!
;// Row 2 & 6: 1st stage butterfly
VHADD qRes26lo, qSrc2lo, qSrc6lo ;// (j2+j6)/2
VHADD qRes26hi, qSrc2hi, qSrc6hi ;// (j2+j6)/2
VMOVN dXi3lo, qRes26lo ;// Output i3
VMOVN dXi3hi, qRes26hi
VSUB qRes26lo, qSrc2lo, qSrc6lo ;// j2-j6
VSUB qRes26hi, qSrc2hi, qSrc6hi ;// j2-j6
VMOVN dXi2lo, qRes26lo ;// Output i2
VMOVN dXi2hi, qRes26hi
VSHLL qSrc3lo, dXj3lo, #(12-1)
VSHLL qSrc3hi, dXj3hi, #(12-1)
VLD1 {qScale5lo, qScale5hi}, [pTemp]!
VSHLL qSrc5lo, dXj5lo, #(12-1)
VSHLL qSrc5hi, dXj5hi, #(12-1)
VQRDMULH qSrc3lo, qScale3lo, qSrc3lo
VQRDMULH qSrc3hi, qScale3hi, qSrc3hi
VQRDMULH qSrc5lo, qScale5lo, qSrc5lo
VQRDMULH qSrc5hi, qScale5hi, qSrc5hi
;// Row 3 & 5: 1st stage butterfly
VHADD qRes53lo, qSrc5lo, qSrc3lo ;// (j5+j3)/2
VHADD qRes53hi, qSrc5hi, qSrc3hi ;// (j5+j3)/2
SUB pSrc, pSrc, #16*2*2 ;// Rewind pSrc two 16-byte rows to source row 4 - NOTE(review): relies on caller's pSrc position, confirm
VMOVN dXi7lo, qRes53lo ;// Output i7
VMOVN dXi7hi, qRes53hi
VSUB qRes53lo, qSrc5lo, qSrc3lo ;// j5-j3
VSUB qRes53hi, qSrc5hi, qSrc3hi ;// j5-j3
VLD1 qXj4, [pSrc @64] ;// Load source row 4 (64-bit aligned)
VMOVN dXi4lo, qRes53lo ;// Output i4
VMOVN dXi4hi, qRes53hi
VSHLL qSrc4lo, dXj4lo, #(12-1)
VSHLL qSrc4hi, dXj4hi, #(12-1)
VLD1 {qScale4lo, qScale4hi}, [pScale]
LDR pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
VQRDMULH qSrc4lo, qScale4lo, qSrc4lo
VQRDMULH qSrc4hi, qScale4hi, qSrc4hi
VLDR dCoefs, [pSrc] ;// Load DCT inverse AAN constants
;// Row 4: passes straight through the 1st stage (no butterfly partner)
VMOVN dXi1lo, qSrc4lo ;// Output i1
VMOVN dXi1hi, qSrc4hi
MEND
END
|