1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
|
%ifidn __OUTPUT_FORMAT__,obj
section code use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
; Yasm automatically includes .00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%else
$@feat.00 equ 1
%endif
section .text code align=64
%else
section .text code
%endif
;extern _OPENSSL_ia32cap_P
global _bn_mul_mont
align 16
_bn_mul_mont:
L$_bn_mul_mont_begin:
push ebp
push ebx
push esi
push edi
xor eax,eax
mov edi,DWORD [40+esp]
cmp edi,4
jl NEAR L$000just_leave
lea esi,[20+esp]
lea edx,[24+esp]
mov ebp,esp
add edi,2
neg edi
lea esp,[edi*4+esp-32]
neg edi
mov eax,esp
sub eax,edx
and eax,2047
sub esp,eax
xor edx,esp
and edx,2048
xor edx,2048
sub esp,edx
and esp,-64
mov eax,DWORD [esi]
mov ebx,DWORD [4+esi]
mov ecx,DWORD [8+esi]
mov edx,DWORD [12+esi]
mov esi,DWORD [16+esi]
mov esi,DWORD [esi]
mov DWORD [4+esp],eax
mov DWORD [8+esp],ebx
mov DWORD [12+esp],ecx
mov DWORD [16+esp],edx
mov DWORD [20+esp],esi
lea ebx,[edi-3]
mov DWORD [24+esp],ebp
lea eax,[_OPENSSL_ia32cap_P]
bt DWORD [eax],26
jnc NEAR L$001non_sse2
mov eax,-1
movd mm7,eax
mov esi,DWORD [8+esp]
mov edi,DWORD [12+esp]
mov ebp,DWORD [16+esp]
xor edx,edx
xor ecx,ecx
movd mm4,DWORD [edi]
movd mm5,DWORD [esi]
movd mm3,DWORD [ebp]
pmuludq mm5,mm4
movq mm2,mm5
movq mm0,mm5
pand mm0,mm7
pmuludq mm5,[20+esp]
pmuludq mm3,mm5
paddq mm3,mm0
movd mm1,DWORD [4+ebp]
movd mm0,DWORD [4+esi]
psrlq mm2,32
psrlq mm3,32
inc ecx
align 16
L$0021st:
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
paddq mm3,mm1
movq mm0,mm2
pand mm0,mm7
movd mm1,DWORD [4+ecx*4+ebp]
paddq mm3,mm0
movd mm0,DWORD [4+ecx*4+esi]
psrlq mm2,32
movd DWORD [28+ecx*4+esp],mm3
psrlq mm3,32
lea ecx,[1+ecx]
cmp ecx,ebx
jl NEAR L$0021st
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
paddq mm3,mm1
movq mm0,mm2
pand mm0,mm7
paddq mm3,mm0
movd DWORD [28+ecx*4+esp],mm3
psrlq mm2,32
psrlq mm3,32
paddq mm3,mm2
movq [32+ebx*4+esp],mm3
inc edx
L$003outer:
xor ecx,ecx
movd mm4,DWORD [edx*4+edi]
movd mm5,DWORD [esi]
movd mm6,DWORD [32+esp]
movd mm3,DWORD [ebp]
pmuludq mm5,mm4
paddq mm5,mm6
movq mm0,mm5
movq mm2,mm5
pand mm0,mm7
pmuludq mm5,[20+esp]
pmuludq mm3,mm5
paddq mm3,mm0
movd mm6,DWORD [36+esp]
movd mm1,DWORD [4+ebp]
movd mm0,DWORD [4+esi]
psrlq mm2,32
psrlq mm3,32
paddq mm2,mm6
inc ecx
dec ebx
L$004inner:
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
paddq mm3,mm1
movq mm0,mm2
movd mm6,DWORD [36+ecx*4+esp]
pand mm0,mm7
movd mm1,DWORD [4+ecx*4+ebp]
paddq mm3,mm0
movd mm0,DWORD [4+ecx*4+esi]
psrlq mm2,32
movd DWORD [28+ecx*4+esp],mm3
psrlq mm3,32
paddq mm2,mm6
dec ebx
lea ecx,[1+ecx]
jnz NEAR L$004inner
mov ebx,ecx
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
paddq mm3,mm1
movq mm0,mm2
pand mm0,mm7
paddq mm3,mm0
movd DWORD [28+ecx*4+esp],mm3
psrlq mm2,32
psrlq mm3,32
movd mm6,DWORD [36+ebx*4+esp]
paddq mm3,mm2
paddq mm3,mm6
movq [32+ebx*4+esp],mm3
lea edx,[1+edx]
cmp edx,ebx
jle NEAR L$003outer
emms
jmp NEAR L$005common_tail
align 16
L$001non_sse2:
mov esi,DWORD [8+esp]
lea ebp,[1+ebx]
mov edi,DWORD [12+esp]
xor ecx,ecx
mov edx,esi
and ebp,1
sub edx,edi
lea eax,[4+ebx*4+edi]
or ebp,edx
mov edi,DWORD [edi]
jz NEAR L$006bn_sqr_mont
mov DWORD [28+esp],eax
mov eax,DWORD [esi]
xor edx,edx
align 16
L$007mull:
mov ebp,edx
mul edi
add ebp,eax
lea ecx,[1+ecx]
adc edx,0
mov eax,DWORD [ecx*4+esi]
cmp ecx,ebx
mov DWORD [28+ecx*4+esp],ebp
jl NEAR L$007mull
mov ebp,edx
mul edi
mov edi,DWORD [20+esp]
add eax,ebp
mov esi,DWORD [16+esp]
adc edx,0
imul edi,DWORD [32+esp]
mov DWORD [32+ebx*4+esp],eax
xor ecx,ecx
mov DWORD [36+ebx*4+esp],edx
mov DWORD [40+ebx*4+esp],ecx
mov eax,DWORD [esi]
mul edi
add eax,DWORD [32+esp]
mov eax,DWORD [4+esi]
adc edx,0
inc ecx
jmp NEAR L$0082ndmadd
align 16
L$0091stmadd:
mov ebp,edx
mul edi
add ebp,DWORD [32+ecx*4+esp]
lea ecx,[1+ecx]
adc edx,0
add ebp,eax
mov eax,DWORD [ecx*4+esi]
adc edx,0
cmp ecx,ebx
mov DWORD [28+ecx*4+esp],ebp
jl NEAR L$0091stmadd
mov ebp,edx
mul edi
add eax,DWORD [32+ebx*4+esp]
mov edi,DWORD [20+esp]
adc edx,0
mov esi,DWORD [16+esp]
add ebp,eax
adc edx,0
imul edi,DWORD [32+esp]
xor ecx,ecx
add edx,DWORD [36+ebx*4+esp]
mov DWORD [32+ebx*4+esp],ebp
adc ecx,0
mov eax,DWORD [esi]
mov DWORD [36+ebx*4+esp],edx
mov DWORD [40+ebx*4+esp],ecx
mul edi
add eax,DWORD [32+esp]
mov eax,DWORD [4+esi]
adc edx,0
mov ecx,1
align 16
L$0082ndmadd:
mov ebp,edx
mul edi
add ebp,DWORD [32+ecx*4+esp]
lea ecx,[1+ecx]
adc edx,0
add ebp,eax
mov eax,DWORD [ecx*4+esi]
adc edx,0
cmp ecx,ebx
mov DWORD [24+ecx*4+esp],ebp
jl NEAR L$0082ndmadd
mov ebp,edx
mul edi
add ebp,DWORD [32+ebx*4+esp]
adc edx,0
add ebp,eax
adc edx,0
mov DWORD [28+ebx*4+esp],ebp
xor eax,eax
mov ecx,DWORD [12+esp]
add edx,DWORD [36+ebx*4+esp]
adc eax,DWORD [40+ebx*4+esp]
lea ecx,[4+ecx]
mov DWORD [32+ebx*4+esp],edx
cmp ecx,DWORD [28+esp]
mov DWORD [36+ebx*4+esp],eax
je NEAR L$005common_tail
mov edi,DWORD [ecx]
mov esi,DWORD [8+esp]
mov DWORD [12+esp],ecx
xor ecx,ecx
xor edx,edx
mov eax,DWORD [esi]
jmp NEAR L$0091stmadd
align 16
L$006bn_sqr_mont:
mov DWORD [esp],ebx
mov DWORD [12+esp],ecx
mov eax,edi
mul edi
mov DWORD [32+esp],eax
mov ebx,edx
shr edx,1
and ebx,1
inc ecx
align 16
L$010sqr:
mov eax,DWORD [ecx*4+esi]
mov ebp,edx
mul edi
add eax,ebp
lea ecx,[1+ecx]
adc edx,0
lea ebp,[eax*2+ebx]
shr eax,31
cmp ecx,DWORD [esp]
mov ebx,eax
mov DWORD [28+ecx*4+esp],ebp
jl NEAR L$010sqr
mov eax,DWORD [ecx*4+esi]
mov ebp,edx
mul edi
add eax,ebp
mov edi,DWORD [20+esp]
adc edx,0
mov esi,DWORD [16+esp]
lea ebp,[eax*2+ebx]
imul edi,DWORD [32+esp]
shr eax,31
mov DWORD [32+ecx*4+esp],ebp
lea ebp,[edx*2+eax]
mov eax,DWORD [esi]
shr edx,31
mov DWORD [36+ecx*4+esp],ebp
mov DWORD [40+ecx*4+esp],edx
mul edi
add eax,DWORD [32+esp]
mov ebx,ecx
adc edx,0
mov eax,DWORD [4+esi]
mov ecx,1
align 16
L$0113rdmadd:
mov ebp,edx
mul edi
add ebp,DWORD [32+ecx*4+esp]
adc edx,0
add ebp,eax
mov eax,DWORD [4+ecx*4+esi]
adc edx,0
mov DWORD [28+ecx*4+esp],ebp
mov ebp,edx
mul edi
add ebp,DWORD [36+ecx*4+esp]
lea ecx,[2+ecx]
adc edx,0
add ebp,eax
mov eax,DWORD [ecx*4+esi]
adc edx,0
cmp ecx,ebx
mov DWORD [24+ecx*4+esp],ebp
jl NEAR L$0113rdmadd
mov ebp,edx
mul edi
add ebp,DWORD [32+ebx*4+esp]
adc edx,0
add ebp,eax
adc edx,0
mov DWORD [28+ebx*4+esp],ebp
mov ecx,DWORD [12+esp]
xor eax,eax
mov esi,DWORD [8+esp]
add edx,DWORD [36+ebx*4+esp]
adc eax,DWORD [40+ebx*4+esp]
mov DWORD [32+ebx*4+esp],edx
cmp ecx,ebx
mov DWORD [36+ebx*4+esp],eax
je NEAR L$005common_tail
mov edi,DWORD [4+ecx*4+esi]
lea ecx,[1+ecx]
mov eax,edi
mov DWORD [12+esp],ecx
mul edi
add eax,DWORD [32+ecx*4+esp]
adc edx,0
mov DWORD [32+ecx*4+esp],eax
xor ebp,ebp
cmp ecx,ebx
lea ecx,[1+ecx]
je NEAR L$012sqrlast
mov ebx,edx
shr edx,1
and ebx,1
align 16
L$013sqradd:
mov eax,DWORD [ecx*4+esi]
mov ebp,edx
mul edi
add eax,ebp
lea ebp,[eax*1+eax]
adc edx,0
shr eax,31
add ebp,DWORD [32+ecx*4+esp]
lea ecx,[1+ecx]
adc eax,0
add ebp,ebx
adc eax,0
cmp ecx,DWORD [esp]
mov DWORD [28+ecx*4+esp],ebp
mov ebx,eax
jle NEAR L$013sqradd
mov ebp,edx
add edx,edx
shr ebp,31
add edx,ebx
adc ebp,0
L$012sqrlast:
mov edi,DWORD [20+esp]
mov esi,DWORD [16+esp]
imul edi,DWORD [32+esp]
add edx,DWORD [32+ecx*4+esp]
mov eax,DWORD [esi]
adc ebp,0
mov DWORD [32+ecx*4+esp],edx
mov DWORD [36+ecx*4+esp],ebp
mul edi
add eax,DWORD [32+esp]
lea ebx,[ecx-1]
adc edx,0
mov ecx,1
mov eax,DWORD [4+esi]
jmp NEAR L$0113rdmadd
align 16
L$005common_tail:
mov ebp,DWORD [16+esp]
mov edi,DWORD [4+esp]
lea esi,[32+esp]
mov eax,DWORD [esi]
mov ecx,ebx
xor edx,edx
align 16
L$014sub:
sbb eax,DWORD [edx*4+ebp]
mov DWORD [edx*4+edi],eax
dec ecx
mov eax,DWORD [4+edx*4+esi]
lea edx,[1+edx]
jge NEAR L$014sub
sbb eax,0
align 16
L$015copy:
mov edx,DWORD [ebx*4+esi]
mov ebp,DWORD [ebx*4+edi]
xor edx,ebp
and edx,eax
xor edx,ebp
mov DWORD [ebx*4+esi],ecx
mov DWORD [ebx*4+edi],edx
dec ebx
jge NEAR L$015copy
mov esp,DWORD [24+esp]
mov eax,1
L$000just_leave:
pop edi
pop esi
pop ebx
pop ebp
ret
db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
db 111,114,103,62,0
segment .bss
common _OPENSSL_ia32cap_P 16
|