1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
|
//====--- SPU64InstrInfo.td - Cell SPU 64-bit operations -*- tablegen -*--====//
//
// Cell SPU 64-bit operations
//
//===----------------------------------------------------------------------===//
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// 64-bit comparisons:
//
// 1. The instruction sequences for vector vice scalar differ by a
// constant. In the scalar case, we're only interested in the
// top two 32-bit slots, whereas we're interested in an exact
// all-four-slot match in the vector case.
//
// 2. There are no "immediate" forms, since loading 64-bit constants
// could be a constant pool load.
//
// 3. i64 setcc results are i32, which are subsequently converted to a FSM
// mask when used in a select pattern.
//
// 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask (TODO)
// [Note: this may be moot, since gb produces v4i32 or r32.]
//
// 5. The code sequences for r64 and v2i64 are probably overly conservative,
// compared to the code that gcc produces.
//
// M00$E B!tes Kan be Pretty N@sTi!!!!! (appologies to Monty!)
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// selb instruction definition for i64. Note that the selection mask is
// a vector, produced by various forms of FSM:
def SELBr64_cond:
SELBInst<(outs R64C:$rT), (ins R64C:$rA, R64C:$rB, VECREG:$rC),
[/* no pattern */]>;
// The generic i64 select pattern, which assumes that the comparison result
// is in a 32-bit register that contains a select mask pattern (i.e., gather
// bits result):
def : Pat<(select R32C:$rCond, R64C:$rFalse, R64C:$rTrue),
(SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 R32C:$rCond))>;
// select the negative condition:
class I64SELECTNegCond<PatFrag cond, CodeFrag compare>:
Pat<(select (i32 (cond R64C:$rA, R64C:$rB)), R64C:$rTrue, R64C:$rFalse),
(SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 compare.Fragment))>;
// setcc the negative condition:
class I64SETCCNegCond<PatFrag cond, CodeFrag compare>:
Pat<(cond R64C:$rA, R64C:$rB),
(XORIr32 compare.Fragment, -1)>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// The i64 seteq fragment that does the scalar->vector conversion and
// comparison:
def CEQr64compare:
CodeFrag<(CGTIv4i32 (GBv4i32 (CEQv4i32 (ORv2i64_i64 R64C:$rA),
(ORv2i64_i64 R64C:$rB))), 0xb)>;
// The i64 seteq fragment that does the vector comparison
def CEQv2i64compare:
CodeFrag<(CEQIv4i32 (GBv4i32 (CEQv4i32 VECREG:$rA, VECREG:$rB)), 0xf)>;
// i64 seteq (equality): the setcc result is i32, which is converted to a
// vector FSM mask when used in a select pattern.
//
// v2i64 seteq (equality): the setcc result is v4i32
multiclass CompareEqual64 {
// Plain old comparison, converts back to i32 scalar
def r64: CodeFrag<(ORi32_v4i32 CEQr64compare.Fragment)>;
def v2i64: CodeFrag<(ORi32_v4i32 CEQv2i64compare.Fragment)>;
// SELB mask from FSM:
def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQr64compare.Fragment))>;
def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQv2i64compare.Fragment))>;
}
defm I64EQ: CompareEqual64;
def : Pat<(seteq R64C:$rA, R64C:$rB), I64EQr64.Fragment>;
def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), I64EQv2i64.Fragment>;
// i64 setne:
def : I64SETCCNegCond<setne, I64EQr64>;
def : I64SELECTNegCond<setne, I64EQr64>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// i64 setugt/setule:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
def CLGTr64ugt:
CodeFrag<(CLGTv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
def CLGTr64eq:
CodeFrag<(CEQv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
def CLGTr64compare:
CodeFrag<(SELBv2i64 CLGTr64ugt.Fragment,
(XSWDv2i64 CLGTr64ugt.Fragment),
CLGTr64eq.Fragment)>;
def CLGTv2i64ugt:
CodeFrag<(CLGTv4i32 VECREG:$rA, VECREG:$rB)>;
def CLGTv2i64eq:
CodeFrag<(CEQv4i32 VECREG:$rA, VECREG:$rB)>;
def CLGTv2i64compare:
CodeFrag<(SELBv2i64 CLGTv2i64ugt.Fragment,
(XSWDv2i64 CLGTr64ugt.Fragment),
CLGTv2i64eq.Fragment)>;
multiclass CompareLogicalGreaterThan64 {
// Plain old comparison, converts back to i32 scalar
def r64: CodeFrag<(ORi32_v4i32 CLGTr64compare.Fragment)>;
def v2i64: CodeFrag<CLGTv2i64compare.Fragment>;
// SELB mask from FSM:
def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGTr64compare.Fragment))>;
def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGTv2i64compare.Fragment))>;
}
defm I64LGT: CompareLogicalGreaterThan64;
def : Pat<(setugt R64C:$rA, R64C:$rB), I64LGTr64.Fragment>;
def : Pat<(setugt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
I64LGTv2i64.Fragment>;
// i64 setult:
def : I64SETCCNegCond<setule, I64LGTr64>;
def : I64SELECTNegCond<setule, I64LGTr64>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// i64 setuge/setult:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
def CLGEr64compare:
CodeFrag<(CGTIv4i32 (GBv4i32 (ORv4i32 CLGTr64ugt.Fragment,
CLGTr64eq.Fragment)), 0xb)>;
def CLGEv2i64compare:
CodeFrag<(CEQIv4i32 (GBv4i32 (ORv4i32 CLGTv2i64ugt.Fragment,
CLGTv2i64eq.Fragment)), 0xf)>;
multiclass CompareLogicalGreaterEqual64 {
// Plain old comparison, converts back to i32 scalar
def r64: CodeFrag<(ORi32_v4i32 CLGEr64compare.Fragment)>;
def v2i64: CodeFrag<CLGEv2i64compare.Fragment>;
// SELB mask from FSM:
def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGEr64compare.Fragment))>;
def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGEv2i64compare.Fragment))>;
}
defm I64LGE: CompareLogicalGreaterEqual64;
def : Pat<(setuge R64C:$rA, R64C:$rB), I64LGEr64.Fragment>;
def : Pat<(setuge (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
I64LGEv2i64.Fragment>;
// i64 setult:
def : I64SETCCNegCond<setult, I64LGEr64>;
def : I64SELECTNegCond<setult, I64LGEr64>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// i64 setgt/setle:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
def CGTr64sgt:
CodeFrag<(CGTv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
def CGTr64eq:
CodeFrag<(CEQv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
def CGTr64compare:
CodeFrag<(SELBv2i64 CGTr64sgt.Fragment,
(XSWDv2i64 CGTr64sgt.Fragment),
CGTr64eq.Fragment)>;
def CGTv2i64sgt:
CodeFrag<(CGTv4i32 VECREG:$rA, VECREG:$rB)>;
def CGTv2i64eq:
CodeFrag<(CEQv4i32 VECREG:$rA, VECREG:$rB)>;
def CGTv2i64compare:
CodeFrag<(SELBv2i64 CGTv2i64sgt.Fragment,
(XSWDv2i64 CGTr64sgt.Fragment),
CGTv2i64eq.Fragment)>;
multiclass CompareGreaterThan64 {
// Plain old comparison, converts back to i32 scalar
def r64: CodeFrag<(ORi32_v4i32 CGTr64compare.Fragment)>;
def v2i64: CodeFrag<CGTv2i64compare.Fragment>;
// SELB mask from FSM:
def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGTr64compare.Fragment))>;
def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGTv2i64compare.Fragment))>;
}
defm I64GT: CompareLogicalGreaterThan64;
def : Pat<(setgt R64C:$rA, R64C:$rB), I64GTr64.Fragment>;
def : Pat<(setgt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
I64GTv2i64.Fragment>;
// i64 setult:
def : I64SETCCNegCond<setle, I64GTr64>;
def : I64SELECTNegCond<setle, I64GTr64>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// i64 setge/setlt:
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
def CGEr64compare:
CodeFrag<(CGTIv4i32 (GBv4i32 (ORv4i32 CGTr64sgt.Fragment,
CGTr64eq.Fragment)), 0xb)>;
def CGEv2i64compare:
CodeFrag<(CEQIv4i32 (GBv4i32 (ORv4i32 CGTv2i64sgt.Fragment,
CGTv2i64eq.Fragment)), 0xf)>;
multiclass CompareGreaterEqual64 {
// Plain old comparison, converts back to i32 scalar
def r64: CodeFrag<(ORi32_v4i32 CGEr64compare.Fragment)>;
def v2i64: CodeFrag<CGEv2i64compare.Fragment>;
// SELB mask from FSM:
def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGEr64compare.Fragment))>;
def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGEv2i64compare.Fragment))>;
}
defm I64GE: CompareGreaterEqual64;
def : Pat<(setge R64C:$rA, R64C:$rB), I64GEr64.Fragment>;
def : Pat<(setge (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
I64GEv2i64.Fragment>;
// i64 setult:
def : I64SETCCNegCond<setlt, I64GEr64>;
def : I64SELECTNegCond<setlt, I64GEr64>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// v2i64, i64 add
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
class v2i64_add_cg<dag lhs, dag rhs>:
CodeFrag<(CGv4i32 lhs, rhs)>;
class v2i64_add_1<dag lhs, dag rhs, dag cg, dag cg_mask>:
CodeFrag<(ADDXv4i32 lhs, rhs, (SHUFBv4i32 cg, cg, cg_mask))>;
class v2i64_add<dag lhs, dag rhs, dag cg_mask>:
v2i64_add_1<lhs, rhs, v2i64_add_cg<lhs, rhs>.Fragment, cg_mask>;
def : Pat<(SPUadd64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
(ORi64_v2i64 v2i64_add<(ORv2i64_i64 R64C:$rA),
(ORv2i64_i64 R64C:$rB),
(v4i32 VECREG:$rCGmask)>.Fragment)>;
def : Pat<(SPUadd64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
(v4i32 VECREG:$rCGmask)),
v2i64_add<(v2i64 VECREG:$rA),
(v2i64 VECREG:$rB),
(v4i32 VECREG:$rCGmask)>.Fragment>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// v2i64, i64 subtraction
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
class v2i64_sub_bg<dag lhs, dag rhs>: CodeFrag<(BGv4i32 lhs, rhs)>;
class v2i64_sub<dag lhs, dag rhs, dag bg, dag bg_mask>:
CodeFrag<(SFXv4i32 lhs, rhs, (SHUFBv4i32 bg, bg, bg_mask))>;
def : Pat<(SPUsub64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
(ORi64_v2i64 v2i64_sub<(ORv2i64_i64 R64C:$rA),
(ORv2i64_i64 R64C:$rB),
v2i64_sub_bg<(ORv2i64_i64 R64C:$rA),
(ORv2i64_i64 R64C:$rB)>.Fragment,
(v4i32 VECREG:$rCGmask)>.Fragment)>;
def : Pat<(SPUsub64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
(v4i32 VECREG:$rCGmask)),
v2i64_sub<(v2i64 VECREG:$rA),
(v2i64 VECREG:$rB),
v2i64_sub_bg<(v2i64 VECREG:$rA),
(v2i64 VECREG:$rB)>.Fragment,
(v4i32 VECREG:$rCGmask)>.Fragment>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// v2i64, i64 multiply
//
// Note: i64 multiply is simply the vector->scalar conversion of the
// full-on v2i64 multiply, since the entire vector has to be manipulated
// anyway.
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
class v2i64_mul_ahi64<dag rA> :
CodeFrag<(SELBv4i32 rA, (ILv4i32 0), (FSMBIv4i32 0x0f0f))>;
class v2i64_mul_bhi64<dag rB> :
CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0x0f0f))>;
class v2i64_mul_alo64<dag rB> :
CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0xf0f0))>;
class v2i64_mul_blo64<dag rB> :
CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0xf0f0))>;
class v2i64_mul_ashlq2<dag rA>:
CodeFrag<(SHLQBYIv4i32 rA, 0x2)>;
class v2i64_mul_ashlq4<dag rA>:
CodeFrag<(SHLQBYIv4i32 rA, 0x4)>;
class v2i64_mul_bshlq2<dag rB> :
CodeFrag<(SHLQBYIv4i32 rB, 0x2)>;
class v2i64_mul_bshlq4<dag rB> :
CodeFrag<(SHLQBYIv4i32 rB, 0x4)>;
class v2i64_highprod<dag rA, dag rB>:
CodeFrag<(Av4i32
(Av4i32
(MPYUv4i32 v2i64_mul_bshlq4<rB>.Fragment, // a1 x b3
v2i64_mul_ahi64<rA>.Fragment),
(MPYHv4i32 v2i64_mul_ahi64<rA>.Fragment, // a0 x b3
v2i64_mul_bshlq4<rB>.Fragment)),
(Av4i32
(MPYHv4i32 v2i64_mul_bhi64<rB>.Fragment,
v2i64_mul_ashlq4<rA>.Fragment),
(Av4i32
(MPYHv4i32 v2i64_mul_ashlq4<rA>.Fragment,
v2i64_mul_bhi64<rB>.Fragment),
(Av4i32
(MPYUv4i32 v2i64_mul_ashlq4<rA>.Fragment,
v2i64_mul_bhi64<rB>.Fragment),
(Av4i32
(MPYHv4i32 v2i64_mul_ashlq2<rA>.Fragment,
v2i64_mul_bshlq2<rB>.Fragment),
(MPYUv4i32 v2i64_mul_ashlq2<rA>.Fragment,
v2i64_mul_bshlq2<rB>.Fragment))))))>;
class v2i64_mul_a3_b3<dag rA, dag rB>:
CodeFrag<(MPYUv4i32 v2i64_mul_alo64<rA>.Fragment,
v2i64_mul_blo64<rB>.Fragment)>;
class v2i64_mul_a2_b3<dag rA, dag rB>:
CodeFrag<(SELBv4i32 (SHLQBYIv4i32
(MPYHHUv4i32 v2i64_mul_alo64<rA>.Fragment,
v2i64_mul_bshlq2<rB>.Fragment), 0x2),
(ILv4i32 0),
(FSMBIv4i32 0xc3c3))>;
class v2i64_mul_a3_b2<dag rA, dag rB>:
CodeFrag<(SELBv4i32 (SHLQBYIv4i32
(MPYHHUv4i32 v2i64_mul_blo64<rB>.Fragment,
v2i64_mul_ashlq2<rA>.Fragment), 0x2),
(ILv4i32 0),
(FSMBIv4i32 0xc3c3))>;
class v2i64_lowsum<dag rA, dag rB, dag rCGmask>:
v2i64_add<v2i64_add<v2i64_mul_a3_b3<rA, rB>.Fragment,
v2i64_mul_a2_b3<rA, rB>.Fragment, rCGmask>.Fragment,
v2i64_mul_a3_b2<rA, rB>.Fragment, rCGmask>;
class v2i64_mul<dag rA, dag rB, dag rCGmask>:
v2i64_add<v2i64_lowsum<rA, rB, rCGmask>.Fragment,
(SELBv4i32 v2i64_highprod<rA, rB>.Fragment,
(ILv4i32 0),
(FSMBIv4i32 0x0f0f)),
rCGmask>;
def : Pat<(SPUmul64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
(ORi64_v2i64 v2i64_mul<(ORv2i64_i64 R64C:$rA),
(ORv2i64_i64 R64C:$rB),
(v4i32 VECREG:$rCGmask)>.Fragment)>;
def : Pat<(SPUmul64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
(v4i32 VECREG:$rCGmask)),
v2i64_mul<(v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
(v4i32 VECREG:$rCGmask)>.Fragment>;
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// f64 comparisons
//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
// selb instruction definition for i64. Note that the selection mask is
// a vector, produced by various forms of FSM:
def SELBf64_cond:
SELBInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB, R32C:$rC),
[(set R64FP:$rT,
(select R32C:$rC, R64FP:$rB, R64FP:$rA))]>;
|