aboutsummaryrefslogtreecommitdiffstats
path: root/lib/AsmParser/Lexer.l
blob: faddcb16d51d3801e1d1fc9acbf5a832bf69a1b8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
/*===-- Lexer.l - Scanner for llvm assembly files ----------------*- C++ -*--=//
//
//  This file implements the flex scanner for LLVM assembly language files.
//
//===------------------------------------------------------------------------=*/

/* Scanner generation options.  prefix="llvmAsm" makes flex emit
 * llvmAsm-prefixed symbols (e.g. llvmAsmlval, used below) in place of the
 * default yy-prefixed ones, yylineno enables line counting, and outfile names
 * the generated scanner source.  The remaining options tailor the generated
 * scanner: batch, non-interactive input; no yywrap() call at end of input; no
 * default echo rule (unmatched input is an error, hence the catch-all rule at
 * the end of the rules section); 8-bit input; equivalence classes; and no
 * REJECT or yymore() support.
 */
%option prefix="llvmAsm"
%option yylineno
%option nostdinit
%option never-interactive
%option batch
%option noyywrap
%option nodefault
%option 8bit
%option outfile="Lexer.cpp"
%option ecs
%option noreject
%option noyymore

%{
#include "ParserInternals.h"
#include <list>
#include "llvmAsmParser.h"
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

// RET_TOK - Store the opcode Instruction::Enum into the 'type' member of
// llvmAsmlval, then return the token 'sym' from the scanner.  The body is
// wrapped in do/while(0) so that the macro expands to exactly one statement
// and stays correct even when used under an unbraced if/else.
#define RET_TOK(type, Enum, sym) \
  do { llvmAsmlval.type = Instruction::Enum; return sym; } while (0)


// TODO: All of the static identifiers are figured out by the lexer, 
// these should be hashed to reduce the lexer size


// atoull - Convert an ASCII string of decimal digits into its unsigned 64-bit
// value.  The input is not validated here: callers only pass text that has
// already been matched by a digits-only regex.
//
// An exception is thrown (via ThrowException) if the value does not fit in 64
// bits.  The check is made *before* each multiply-and-add step; comparing the
// wrapped result against the previous value is not sufficient, because a
// multiplication by ten can wrap around and still land above the old value
// (e.g. "30000000000000000000").
//
static uint64_t atoull(const char *Buffer) {
  const uint64_t MaxVal = ~(uint64_t)0;
  uint64_t Result = 0;
  for (; *Buffer; Buffer++) {
    uint64_t Digit = (uint64_t)(*Buffer - '0');
    // Result*10 + Digit fits in 64 bits iff Result <= (MaxVal - Digit) / 10
    if (Result > (MaxVal - Digit) / 10)
      ThrowException("constant bigger than 64 bits detected!");
    Result = Result * 10 + Digit;
  }
  return Result;
}

// HexToFP - Convert an ASCII string of hexadecimal digits into a double whose
// bit pattern is the 64-bit integer those digits spell out.
//
// Characters that are not hex digits contribute a zero nibble.  This is what
// lets the scanner pass the whole lexeme, "0x" prefix included: the leading
// '0' and the 'x' both leave the accumulated value at zero.
//
// An exception is thrown (via ThrowException) if the digits describe more than
// 64 bits.  The check looks at the top nibble before shifting; comparing the
// wrapped result with the previous value misses cases such as
// "1FFFFFFFFFFFFFFFF", where the wrapped value is still larger.
//
static double HexToFP(const char *Buffer) {
  uint64_t Result = 0;
  for (; *Buffer; ++Buffer) {
    // Shifting left by four would discard any bit set in the top nibble.
    if (Result >> 60)
      ThrowException("constant bigger than 64 bits detected!");
    Result <<= 4;

    // The low nibble is zero after the shift, so adding a value in [0, 15]
    // cannot overflow.
    char C = *Buffer;
    if (C >= '0' && C <= '9')
      Result += C-'0';
    else if (C >= 'A' && C <= 'F')
      Result += C-'A'+10;
    else if (C >= 'a' && C <= 'f')
      Result += C-'a'+10;
  }

  assert(sizeof(double) == sizeof(Result) &&
         "Data sizes incompatible on this target!");

  // Copy the bits rather than reading the integer through a double pointer:
  // memcpy is the well-defined way to reinterpret an object representation.
  double FP;
  memcpy(&FP, &Result, sizeof(FP));
  return FP;
}


// UnEscapeLexed - Rewrite the NUL-terminated string in Buffer in place,
// replacing every \xx sequence (a backslash followed by two hex digits) with
// the single character whose code is xx.  A backslash that is not followed by
// two hex digits is copied through unchanged.
//
// If AllowNull is false, a \00 escape causes an exception to be thrown.
//
// The return value points one past the last character written.  A NUL is also
// stored at that position, so the shortened result is a valid C string on its
// own; callers that allow embedded \00 characters should use the returned
// pointer, not strlen, to find the end of the data.
//
char *UnEscapeLexed(char *Buffer, bool AllowNull = false) {
  char *BOut = Buffer;
  for (char *BIn = Buffer; *BIn; ) {
    // The <ctype.h> functions require a value representable as unsigned char,
    // hence the casts.
    if (BIn[0] == '\\' && isxdigit((unsigned char)BIn[1]) &&
        isxdigit((unsigned char)BIn[2])) {
      char Tmp = BIn[3]; BIn[3] = 0;       // Terminate after the two digits
      *BOut = (char)strtol(BIn+1, 0, 16);  // Convert to number
      if (!AllowNull && !*BOut)
        ThrowException("String literal cannot accept \\00 escape!");

      BIn[3] = Tmp;                        // Restore character
      BIn += 3;                            // Skip over handled chars
      ++BOut;
    } else {
      *BOut++ = *BIn++;
    }
  }

  // BOut never runs ahead of BIn, so this store stays inside the original
  // string (at worst it rewrites the existing terminator).
  *BOut = 0;
  return BOut;
}

#define YY_NEVER_INTERACTIVE 1
%}



/* Comments start with a ; and run to the end of the line */
Comment    ;.*

/* Variable (Value) identifiers start with a % sign; the first character after
 * the % may not be a digit.
 */
VarID       %[-a-zA-Z$._][-a-zA-Z$._0-9]*

/* Label identifiers end with a colon */
Label       [-a-zA-Z$._0-9]+:

/* Quoted strings: one or more characters between double quotes.  Any character
 * other than a double quote may appear inside, so an empty pair of quotes is
 * not matched by this pattern.
 */
StringConstant \"[^\"]+\"


/* E[PN]Integer: match positive and negative literal integer values that
 * are preceded by a '%' character.  These represent unnamed variable slots.
 */
EPInteger     %[0-9]+
ENInteger    %-[0-9]+


/* [PN]Integer: match positive and negative literal integer values */
PInteger   [0-9]+
NInteger  -[0-9]+

/* FPConstant - A floating point constant: an optional sign, at least one
 * digit, a mandatory decimal point, optional fraction digits and an optional
 * exponent.
 */
FPConstant [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?

/* HexFPConstant - Floating point constant represented in IEEE format as a
 *  hexadecimal number for when exponential notation is not precise enough.
 */
HexFPConstant 0x[0-9A-Fa-f]+
%%

{Comment}       { /* Ignore comments for now */ }

  /* Plain keywords: each one just returns its token and sets no semantic
   * value.  flex prefers the longest match, so "constant" is never lexed as
   * "const", and a keyword immediately followed by ':' is lexed as a Label.
   */
begin           { return BEGINTOK; }
end             { return ENDTOK; }
true            { return TRUE;  }
false           { return FALSE; }
declare         { return DECLARE; }
global          { return GLOBAL; }
constant        { return CONSTANT; }
const           { return CONST; }
internal        { return INTERNAL; }
uninitialized   { return UNINIT; }
implementation  { return IMPLEMENTATION; }
\.\.\.          { return DOTDOTDOT; }
string          { return STRING; }
null            { return NULL_TOK; }
to              { return TO; }
except          { return EXCEPT; }

  /* Primitive type names: store the matching Type::*Ty value in
   * llvmAsmlval.PrimType and return the type's token.  "opaque" sets no
   * semantic value.
   */
void            { llvmAsmlval.PrimType = Type::VoidTy  ; return VOID;   }
bool            { llvmAsmlval.PrimType = Type::BoolTy  ; return BOOL;   }
sbyte           { llvmAsmlval.PrimType = Type::SByteTy ; return SBYTE;  }
ubyte           { llvmAsmlval.PrimType = Type::UByteTy ; return UBYTE;  }
short           { llvmAsmlval.PrimType = Type::ShortTy ; return SHORT;  }
ushort          { llvmAsmlval.PrimType = Type::UShortTy; return USHORT; }
int             { llvmAsmlval.PrimType = Type::IntTy   ; return INT;    }
uint            { llvmAsmlval.PrimType = Type::UIntTy  ; return UINT;   }
long            { llvmAsmlval.PrimType = Type::LongTy  ; return LONG;   }
ulong           { llvmAsmlval.PrimType = Type::ULongTy ; return ULONG;  }
float           { llvmAsmlval.PrimType = Type::FloatTy ; return FLOAT;  }
double          { llvmAsmlval.PrimType = Type::DoubleTy; return DOUBLE; }
type            { llvmAsmlval.PrimType = Type::TypeTy  ; return TYPE;   }
label           { llvmAsmlval.PrimType = Type::LabelTy ; return LABEL;  }
opaque          { return OPAQUE; }


  /* Instruction mnemonics: RET_TOK(field, Opcode, TOKEN) stores
   * Instruction::Opcode in llvmAsmlval.field and returns TOKEN.  The rules
   * are grouped by the llvmAsmlval field they set.
   */

  /* Unary operators (UnaryOpVal) */
not             { RET_TOK(UnaryOpVal, Not, NOT); }

  /* Binary operators, including the set* comparisons (BinaryOpVal) */
add             { RET_TOK(BinaryOpVal, Add, ADD); }
sub             { RET_TOK(BinaryOpVal, Sub, SUB); }
mul             { RET_TOK(BinaryOpVal, Mul, MUL); }
div             { RET_TOK(BinaryOpVal, Div, DIV); }
rem             { RET_TOK(BinaryOpVal, Rem, REM); }
and             { RET_TOK(BinaryOpVal, And, AND); }
or              { RET_TOK(BinaryOpVal, Or , OR ); }
xor             { RET_TOK(BinaryOpVal, Xor, XOR); }
setne           { RET_TOK(BinaryOpVal, SetNE, SETNE); }
seteq           { RET_TOK(BinaryOpVal, SetEQ, SETEQ); }
setlt           { RET_TOK(BinaryOpVal, SetLT, SETLT); }
setgt           { RET_TOK(BinaryOpVal, SetGT, SETGT); }
setle           { RET_TOK(BinaryOpVal, SetLE, SETLE); }
setge           { RET_TOK(BinaryOpVal, SetGE, SETGE); }

  /* Other instructions (OtherOpVal) */
phi             { RET_TOK(OtherOpVal, PHINode, PHI); }
call            { RET_TOK(OtherOpVal, Call, CALL); }
cast            { RET_TOK(OtherOpVal, Cast, CAST); }
shl             { RET_TOK(OtherOpVal, Shl, SHL); }
shr             { RET_TOK(OtherOpVal, Shr, SHR); }

  /* Terminator instructions (TermOpVal) */
ret             { RET_TOK(TermOpVal, Ret, RET); }
br              { RET_TOK(TermOpVal, Br, BR); }
switch          { RET_TOK(TermOpVal, Switch, SWITCH); }
invoke          { RET_TOK(TermOpVal, Invoke, INVOKE); }


  /* Memory instructions (MemOpVal) */
malloc          { RET_TOK(MemOpVal, Malloc, MALLOC); }
alloca          { RET_TOK(MemOpVal, Alloca, ALLOCA); }
free            { RET_TOK(MemOpVal, Free, FREE); }
load            { RET_TOK(MemOpVal, Load, LOAD); }
store           { RET_TOK(MemOpVal, Store, STORE); }
getelementptr   { RET_TOK(MemOpVal, GetElementPtr, GETELEMENTPTR); }


{VarID}         {
                  // The name proper starts after the leading '%'.  Decode any
                  // \xx escapes in place, then hand the parser its own copy.
                  char *Name = yytext + 1;
                  UnEscapeLexed(Name);
                  llvmAsmlval.StrVal = strdup(Name);
                  return VAR_ID;
                }
{Label}         {
                  // The lexeme ends with ':'; overwrite it with a terminator
                  // so that only the label name remains.
                  char *Colon = yytext + strlen(yytext) - 1;
                  *Colon = 0;

                  // Decode \xx escapes in place and give the parser a copy.
                  UnEscapeLexed(yytext);
                  llvmAsmlval.StrVal = strdup(yytext);
                  return LABELSTR;
                }

{StringConstant} { // Escapes are deliberately left undecoded here: the text
                   // may contain a \00, which a plain C string cannot carry
                   // once it has been decoded.  A constant such as
                   // [sbyte] c"Hello World\00" is valid, for example.
                   //
                  char *EndQuote = yytext + strlen(yytext) - 1;
                  *EndQuote = 0;                          // Drop closing quote
                  llvmAsmlval.StrVal = strdup(yytext+1);  // Skip opening quote
                  return STRINGCONSTANT;
                 }


{PInteger}      {
                  // Unsigned literal: the full 64-bit value is kept.
                  llvmAsmlval.UInt64Val = atoull(yytext);
                  return EUINT64VAL;
                }
{NInteger}      {
                  // Parse the magnitude that follows the '-' sign.
                  uint64_t Magnitude = atoull(yytext+1);

                  // The negative range reaches one further than the positive
                  // one, hence the +1 on the limit.
                  if (Magnitude > (uint64_t)INT64_MAX+1)
                    ThrowException("Constant too large for signed 64 bits!");

                  llvmAsmlval.SInt64Val = -Magnitude;
                  return ESINT64VAL;
                }


{EPInteger}     {
                  // Parse the slot number that follows the '%'.
                  uint64_t Val = atoull(yytext+1);
                  llvmAsmlval.UIntVal = Val;

                  // atoull yields 64 bits; if the stored value differs from
                  // what was parsed, the number did not fit in UIntVal.
                  // Reject it instead of silently using a truncated slot.
                  if (llvmAsmlval.UIntVal != Val)
                    ThrowException("Invalid value number (too large)!");
                  return UINTVAL;
                }
{ENInteger}     {
                  // Parse the magnitude that follows the "%-" prefix.
                  const char *Digits = yytext + 2;
                  uint64_t Magnitude = atoull(Digits);

                  // The negative range reaches one further than the positive
                  // one, hence the +1 on the limit.
                  if (Magnitude > (uint64_t)INT32_MAX+1)
                    ThrowException("Constant too large for signed 32 bits!");

                  llvmAsmlval.SIntVal = -Magnitude;
                  return SINTVAL;
                }

{FPConstant}    {
                  // Decimal floating point literal, converted with atof.
                  llvmAsmlval.FPVal = atof(yytext);
                  return FPVAL;
                }
{HexFPConstant} {
                  // The whole lexeme, "0x" prefix included, is handed to
                  // HexToFP.
                  llvmAsmlval.FPVal = HexToFP(yytext);
                  return FPVAL;
                }

[ \t\n]         { /* Blanks, tabs and newlines produce no token */ }
.               { /* Any other character is returned as its own token */
                  return yytext[0];
                }

%%