1 /**
2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 *
5 * Lexer for Polemy programming language.
6 */
7 module polemy.lex;
8 import polemy._common;
9
10 import std.file : readText;
11 import std.string : munch;
12 import std.ctype;
13
/// A location inside a source text: file name plus line and column numbers.

class LexPosition
{
    immutable string filename; /// name of the source file
    immutable int    lineno;   /// line number, 1, 2, ...
    immutable int    column;   /// column, 1, 2, ...

    /// Renders the position as "filename:lineno:column".
    override string toString() const
    {
        return sprintf!"%s:%d:%d"(filename, lineno, column);
    }

    // Both mixins come from polemy._common.  The unittests rely on them for a
    // (filename, lineno, column) constructor and for ==, != and < on positions;
    // presumably they are generated from the fields in declaration order --
    // confirm against polemy._common before reordering the fields.
    mixin SimpleConstructor;
    mixin SimpleCompare;
}
28
unittest
{
    // Two positions that differ only in their column.
    auto first  = new LexPosition("hello.cpp", 123, 45);
    auto second = new LexPosition("hello.cpp", 123, 46);

    // The constructor stores its arguments verbatim.
    assert( first.filename == "hello.cpp" );
    assert( first.lineno == 123 );
    assert( first.column == 45 );
    assert( to!string(first) == "hello.cpp:123:45" );

    // Positions are ordered and compared by value.
    assert( first < second );
    assert( first != second );

    // There is no default constructor, and the fields cannot be reassigned.
    assert( !__traits(compiles, new LexPosition) );
    assert( !__traits(compiles, first.filename="foo") );
    assert( !__traits(compiles, first.lineno =789) );
    assert( !__traits(compiles, first.column =222) );
}
46
/// A single token produced by the lexer.

class Token
{
    /// Classification of a token.  The lexer in this file tags symbols and
    /// non-numeric words as identifier, quoted text as stringLiteral, and
    /// all-digit words as number.
    enum Kind
    {
        identifier,
        stringLiteral,
        number
    }

    immutable LexPosition pos;  /// position where the token occurred in the source
    immutable string      str;  /// the token string itself
    immutable Kind        kind; /// which kind of token?

    // Both mixins come from polemy._common.  The unittests rely on them for a
    // (pos, str, kind) constructor and for == and < on tokens; presumably they
    // follow the field declaration order -- confirm before reordering fields.
    mixin SimpleConstructor;
    mixin SimpleCompare;
}
59
unittest
{
    auto where = new immutable(LexPosition)("hello.cpp", 123, 45);
    auto tok   = new Token(where, "class", Token.Kind.identifier);

    // Fields hold what the constructor was given.
    assert( tok.pos == where );
    assert( tok.str == "class" );

    // Tokens compare by value: an identical token is equal, and with the same
    // position "class" sorts before "struct".
    assert( tok == new Token(where, "class", Token.Kind.identifier) );
    assert( tok < new Token(where, "struct", Token.Kind.identifier) );

    // There is no default constructor, and the fields cannot be reassigned.
    assert( !__traits(compiles, new Token) );
    assert( !__traits(compiles, tok.pos=where) );
    assert( !__traits(compiles, tok.str=789) );
}
74
/// Named constructor for Lexer: lexes the contents of a file.
///
/// Params:
///   filename = path of the file to read; also used as the file name
///              recorded in token positions
///   rest     = optional extra arguments forwarded to lexerFromString after
///              the file name (its lineno and column parameters)
///
/// Returns: a Lexer positioned on the first token of the file.

Lexer lexerFromFile(T...)( string filename, T rest )
{
    // Call readText through the selective import (`import std.file : readText;`).
    // The fully qualified spelling std.file.readText is only reachable through a
    // selective import on compilers that still have the old import-leak bug.
    return lexerFromString( readText(filename), filename, rest );
}
81
/// Named constructor for Lexer: lexes an in-memory string.
///
/// Params:
///   str      = the source text to tokenize
///   filename = file name recorded in token positions
///   lineno   = line number assigned to the start of str
///   column   = column number assigned to the start of str
///
/// Returns: a Lexer positioned on the first token of str.

Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 )
{
    auto lexer = new Lexer(str, filename, lineno, column);
    return lexer;
}
88
/// Lexer is a forward range of Tokens
///
/// The lexer always holds one token of lookahead (`current`); `buffer` is the
/// input that follows that token, with leading whitespace already removed.

class Lexer
{
    /// Range primitive: true once every token has been consumed.
    bool empty() /*@property*/
    {
        return current is null;
    }

    /// Range primitive: the token at the head of the range.
    /// Throws (through enforce) if the lexer has already reached the end.
    Token front() /*@property*/
    {
        return std.exception.enforce(current, "Lexer has already reached the end");
    }

    /// Range primitive: advances to the next token.
    /// Throws (through enforce) if the lexer has already reached the end.
    void popFront() /*@property*/
    {
        std.exception.enforce(current, "Lexer has already reached the end");
        current = readNext();
    }

    /// Range primitive: returns a new Lexer that continues from the same point.
    /// Advancing either lexer does not affect the other.
    Lexer save() /*@property*/
    {
        return new Lexer(buffer, filename, lineno, column, current);
    }

private: // implementation

    string buffer;   // input not yet consumed; never starts with whitespace
    string filename; // file name recorded in every token position
    int    lineno;   // line number of buffer[0]
    int    column;   // column number of buffer[0]
    Token  current;  // lookahead token; null once the input is exhausted

    invariant()
    {
        assert( buffer.empty || !std.ctype.isspace(buffer[0]) );
    }

    // Starts lexing `buffer` at the given position.  When `current` is supplied
    // (as save() does) it is reused as the lookahead token instead of reading a
    // new one from the buffer.
    this( string buffer, string filename, int lineno, int column, Token current=null )
    {
        this.buffer   = buffer;
        this.filename = filename;
        this.lineno   = lineno;
        this.column   = column;
        skipws();
        this.current = (current is null ? readNext() : current);
    }

    // Consumes every leading whitespace character, keeping lineno/column in sync.
    // "\r\n", a lone "\r" and a lone "\n" each count as one line break.
    void skipws()
    {
        bool progress = false;
        do
        {
            // Blanks that stay on the current line.  '\v' and '\f' must be eaten
            // here as well: std.ctype.isspace() accepts them, so leaving them in
            // place would break the class invariant and make readWord() produce
            // an empty token without ever advancing.
            string ws = buffer.munch(" \t\v\f");
            column += ws.length;
            progress = !ws.empty;

            while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') )
            {
                progress = true;
                if( buffer[0] == '\n' )
                    buffer = buffer[1..$];
                else // buffer[0] == '\r'
                {
                    buffer = buffer[1..$];
                    if( !buffer.empty && buffer[0]=='\n' )
                        buffer = buffer[1..$];
                }
                lineno ++;
                column = 1;
            }
        }while( progress );
    }

    // Removes and returns the first character of the buffer, advancing the
    // column by one.  The caller must make sure the buffer is not empty.
    char readChar()
    {
        scope(exit) {
            buffer = buffer[1..$];
            column ++;
        }
        return buffer[0];
    }

    /// This is the main lexing routine
    ///
    /// Returns the next token, or null when the input is exhausted.  On entry
    /// the buffer must not start with whitespace; the same holds on exit.
    Token readNext()
    {
        // Drop any run of '#' comments (each extends to the end of its line).
        // This is a loop rather than a recursive call so that a long block of
        // comment lines cannot exhaust the stack.
        while( !buffer.empty && buffer[0] == '#' )
        {
            while( !buffer.empty && buffer[0]!='\n' && buffer[0]!='\r' )
                readChar();
            skipws();
        }

        if( buffer.empty )
            return null;
        scope(exit)
            skipws();

        if( buffer[0] == '"' )
            return readStringLiteral();

        if( isSymbol(buffer[0]) )
        {
            // normal symbol: always a token of exactly one character
            auto pos = currentPosition();
            auto str = ""~readChar();
            return new Token(pos, str, Token.Kind.identifier);
        }

        return readWord();
    }

    // Reads a double-quoted string literal; buffer[0] must be the opening quote.
    // The token text excludes the quotes.  \\ and \" are the only escapes; line
    // breaks inside the literal are normalized to '\n'.  A literal that is not
    // closed before the end of input is returned as-is, without an error.
    Token readStringLiteral()
    {
        auto pos = currentPosition();
        string lit;
        readChar(); // opening quote
        while( !buffer.empty && buffer[0]!='"' )
        {
            char c = readChar();
            if( c == '\\' )
            {
                // A backslash before anything but \ or " is kept literally.
                if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') )
                    lit ~= readChar();
                else
                    lit ~= c;
            }
            else if( c == '\n' )
            {
                lit ~= c;
                lineno++;
                column = 1;
            }
            else if( c == '\r' )
            {
                // "\r\n" and a lone "\r" both become a single '\n'
                if( !buffer.empty && buffer[0]=='\n' )
                    readChar();
                lit ~= '\n';
                lineno++;
                column = 1;
            }
            else
                lit ~= c;
        }
        if( !buffer.empty )
            readChar(); // closing quote
        return new Token(pos, lit, Token.Kind.stringLiteral);
    }

    // Reads a maximal run of characters that are neither whitespace nor symbols.
    // The run is a number token when it consists of ASCII digits only, and an
    // identifier token otherwise.
    Token readWord()
    {
        auto pos = currentPosition();
        size_t len = 0;
        while( len<buffer.length && !std.ctype.isspace(buffer[len]) && !isSymbol(buffer[len]) )
            ++len;
        auto str = buffer[0 .. len];
        buffer = buffer[len .. $];
        column += len;
        bool isNumber = find!(`a<'0' || '9'<a`)(str).empty;
        return new Token(pos, str, isNumber ? Token.Kind.number : Token.Kind.identifier);
    }

    // A symbol is any character in 0x21..0x7f that is not alphanumeric and not '_'.
    bool isSymbol(char c)
    {
        return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_');
    }

    // Snapshot of the position of buffer[0].
    immutable(LexPosition) currentPosition()
    {
        return new immutable(LexPosition)(filename, lineno, column);
    }
}
263
unittest
{
    // Lexer must provide empty/front/popFront/save with forward-range semantics.
    bool lexerIsForwardRange = std.range.isForwardRange!(Lexer);
    assert( lexerIsForwardRange );
}
268
unittest
{
    // Words separated by blanks and a line break, followed by three symbols.
    auto lexer = lexerFromString("this is a \t\n pen :-( ");
    Token[] toks = std.array.array(lexer);

    // Tokens 0..3 are words: check position, kind and text.
    string[] words     = ["this", "is", "a", "pen"];
    int[]    wordLines = [1, 1, 1, 2];
    int[]    wordCols  = [1, 6, 9, 2];
    foreach( i, word; words )
    {
        assert( toks[i].pos.lineno == wordLines[i] );
        assert( toks[i].pos.column == wordCols[i] );
        assert( toks[i].kind == Token.Kind.identifier );
        assert( toks[i].str == word );
    }

    // consecutive symbols are always separated
    // hence, no "++" or "<<" or ...

    // Tokens 4..6 are one-character symbols, all on line 2.
    string[] symbols    = [":", "-", "("];
    int[]    symbolCols = [6, 7, 8];
    foreach( i, symbol; symbols )
    {
        assert( toks[4+i].pos.lineno == 2 );
        assert( toks[4+i].pos.column == symbolCols[i] );
        assert( toks[4+i].str == symbol );
    }

    assert( toks.length == 7 );
}
311
unittest
{
    auto lexer = lexerFromString(" a12\n3a 5 ");

    // A word that merely contains digits is still an identifier.
    assert( lexer.front.str == "a12" );
    assert( lexer.front.kind == Token.Kind.identifier );
    lexer.popFront;

    // Take a snapshot while "3a" is at the front.
    auto snapshot = lexer.save;
    assert( lexer.front.str == "3a" );
    assert( lexer.front.kind == Token.Kind.identifier );
    lexer.popFront;

    // Advancing the original must not move the snapshot.
    assert( snapshot.front.str == "3a" );
    assert( snapshot.front.kind == Token.Kind.identifier );
    assert( lexer.front.str == "5" );
    assert( lexer.front.kind == Token.Kind.number );

    lexer.popFront;
    snapshot.popFront;

    // The original is exhausted; the snapshot is one token behind.
    assert( lexer.empty );
    assert( !snapshot.empty );
    assert( snapshot.front.str == "5" );
    assert( snapshot.front.kind == Token.Kind.number );
}
333
unittest
{
    //!! be sure to run the unittest on the root of the source directory
    // Lexes this very source file and checks the tokens of its module
    // declaration and the start of the following import line.
    auto src = lexerFromFile("polemy/lex.d");
    src = find!`a.str == "module"`(src);

    assert( src.front.str == "module", src.front.str );
    assert( src.front.pos.filename == "polemy/lex.d" );
    assert( src.front.pos.lineno == 7 );
    assert( src.front.pos.column == 1 );

    src.popFront;
    assert( src.front.str == "polemy" );
    assert( src.front.pos.lineno == 7 );
    assert( src.front.pos.column == 8 );

    src.popFront;
    assert( src.front.str == "." );

    src.popFront;
    assert( src.front.str == "lex" );

    src.popFront;
    assert( src.front.str == ";" );

    src.popFront;
    assert( src.front.str == "import" );
    assert( src.front.pos.lineno == 8 );
    assert( src.front.pos.column == 1 );
}
358
unittest
{
    // Exercises '#' comments, back-to-back string literals, the \\ and \"
    // escapes, and a string literal that spans several lines.
    auto lexer = lexerFromString(`my # comment should
# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa
bbb # 123
eee"
zzz
`);
    Token[] toks = std.array.array(lexer);

    // Line 1: everything after '#' is dropped; line 2 is a comment only.
    assert( toks[0].str == "my" );
    assert( toks[0].pos.lineno == 1 );

    // Line 3.
    assert( toks[1].str == "be" );
    assert( toks[1].pos.lineno == 3 );
    assert( toks[2].str == "ignored" );
    assert( toks[3].str == "." );

    // Line 4: a word, two adjacent literals, then a trailing comment.
    assert( toks[4].str == "hahaha" );
    assert( toks[4].pos.lineno == 4 );
    assert( toks[4].kind == Token.Kind.identifier );
    assert( toks[5].str == "hihihi" );
    assert( toks[5].pos.lineno == 4 );
    assert( toks[5].kind == Token.Kind.stringLiteral );
    assert( toks[6].str == `hu\"huhu` );
    assert( toks[6].kind == Token.Kind.stringLiteral );
    assert( toks[6].pos.lineno == 4 );

    // Line 5: number, word, and a literal that runs on to line 7.  A '#'
    // inside a literal does not start a comment.
    assert( toks[7].str == "123" );
    assert( toks[7].pos.lineno == 5 );
    assert( toks[7].kind == Token.Kind.number );
    assert( toks[8].str == "aa" );
    assert( toks[9].pos.lineno == 5 );
    assert( toks[9].str == "aaa\nbbb # 123\neee" );
    assert( toks[9].kind == Token.Kind.stringLiteral );

    // Line counting continues correctly after the multi-line literal.
    assert( toks[10].pos.lineno == 8 );
    assert( toks.length == 11 );
}