1 /**
2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 *
5 * Lexer for Polemy programming language.
6 */
7 module polemy.lex;
8 import polemy._common;
9 import std.file : readText;
10 import std.ctype : isspace, isalnum;
11
12 /*mixin*/
template ExceptionWithPosition()
{
    /// Source position this exception refers to (null when unknown).
    const LexPosition pos;

    /// Builds the exception message as "[<pos>] <msg>", falling back to
    /// "[??] <msg>" when no position is available, and forwards the
    /// remaining arguments to the base-class constructor unchanged.
    this( const LexPosition pos, string msg, string file=null, size_t line=0, Throwable next=null )
    {
        if( pos !is null )
            super(sprintf!"[%s] %s"(pos, msg), file, line, next);
        else
            super(sprintf!"[??] %s"(msg), file, line, next);

        // Keep the position around so that handlers can inspect it.
        this.pos = pos;
    }
}
25
/// Thrown when an EOF is encountered in the middle of a lexical token.
/// The constructor and the `pos` member come from the
/// ExceptionWithPosition mixin.

class UnexpectedEOF : Exception
{
    mixin ExceptionWithPosition;
}
32
/// Thrown when a lexical error is encountered.
/// The constructor and the `pos` member come from the
/// ExceptionWithPosition mixin.

class LexException : Exception
{
    mixin ExceptionWithPosition;
}
39
/// Represents a position in source code

class LexPosition
{
    immutable string filename; /// name of the source file
    immutable int lineno;      /// 1-origin
    immutable int column;      /// 1-origin

    // Project-provided mixin. The unit tests in this module construct
    // positions as LexPosition(filename, lineno, column) and compare them,
    // so it presumably supplies the constructor and comparison operators.
    mixin SimpleClass;

    /// Formats the position as "filename:lineno:column".
    override string toString() const
    {
        return sprintf!"%s:%d:%d"(filename, lineno, column);
    }

    /// Placeholder position ("<unnamed>", line 0, column 0).
    static immutable LexPosition dummy;

    // `dummy` is immutable and therefore shared by all threads, so it must be
    // initialized exactly once per process. A plain `static this()` runs once
    // per thread and would re-assign the immutable variable.
    shared static this(){ dummy = new immutable(LexPosition)("<unnamed>",0,0); }
}
55
unittest
{
    // Construction, field access and string conversion.
    auto first = new LexPosition("hello.cpp", 123, 45);

    assert_eq( first.filename, "hello.cpp" );
    assert_eq( first.lineno, 123 );
    assert_eq( first.column, 45 );
    assert_eq( text(first), "hello.cpp:123:45" );

    // No default constructor, and every field is read-only.
    assert( !__traits(compiles, new LexPosition) );
    assert( !__traits(compiles, first.filename="foo") );
    assert( !__traits(compiles, first.lineno =789) );
    assert( !__traits(compiles, first.column =222) );

    // A position one column further to the right compares greater.
    auto second = new LexPosition("hello.cpp", 123, 46);
    assert_lt( first, second );
    assert_ne( first, second );
}
74
/// Represents a lexer token: its text, where it occurred, and whether it
/// was written as a "quoted" string in the source.

class Token
{
    immutable LexPosition pos; /// Position where the token occurred in the source
    immutable string str; /// The token string itself
    immutable bool quoted; /// Was it a "quoted" token or unquoted?

    // Project-provided mixin; the unit tests in this module construct tokens
    // as Token(pos, str, quoted) and compare them with each other.
    mixin SimpleClass;
}
85
unittest
{
    auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
    auto t = new Token(p, "class", false);
    auto u = new Token(p, "class", true);

    // Field access and comparison.
    assert_eq( t.pos, p );
    assert_eq( t.str, "class" );
    assert( !t.quoted );
    assert_eq( t, new Token(p, "class", false) );
    assert_lt( t, new Token(p, "struct", false) );
    assert_ne( t, u );
    assert( u.quoted );

    // No default constructor, and every field is read-only.
    // Each assignment uses a value of the field's own type, so a failure to
    // compile can only be caused by immutability, not by a type mismatch.
    assert( !__traits(compiles, new Token) );
    assert( !__traits(compiles, t.pos=p) );
    assert( !__traits(compiles, t.str="struct") );
    assert( !__traits(compiles, t.quoted=true) );
}
105
/// Named Constructor for Lexer: lexes the whole content of a file.
///
/// Params:
///   filename = path of the file to read; also recorded in token positions
///   ln_cn    = optional arguments forwarded verbatim to lexerFromString
///              after the filename (its lineno and column parameters)

Lexer lexerFromFile(T...)( string filename, T ln_cn )
{
    // readText is brought in by the selective import `std.file : readText`,
    // which binds only that name; the fully-qualified form is not guaranteed
    // to be visible, so call it directly.
    return lexerFromString( readText(filename), filename, ln_cn );
}
112
/// Named Constructor for Lexer: lexes the given character sequence.
///
/// Params:
///   str      = the source text
///   filename = name recorded in token positions
///   lineno   = line number assigned to the first character
///   column   = column number assigned to the first character

LexerT!(PositionedReader!CharSeq) /* ddoc doesn't recognize auto return... bugzilla:2581 */
lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
{
    alias PositionedReader!CharSeq Reader;

    // Wrap the raw characters in a position-tracking reader first,
    // then hand that reader to the lexer.
    auto reader = Reader(str, filename, lineno, column);
    return new LexerT!(Reader)(reader);
}
122
/// Standard Lexer type: LexerT instantiated over a PositionedReader of string.
/// (All you have to know is that this is a forward range of Tokens!)

alias LexerT!(PositionedReader!string) Lexer;
126
/// Lexer Implementation: a forward range of Tokens built on top of a
/// forward range of dchar (the Reader).

class LexerT(Reader)
    if( isForwardRange!(Reader) && is(ElementType!(Reader)==dchar) )
{
    /// Range primitive: true once every token has been consumed
    bool empty() /*@property*/
    {
        return current is null;
    }

    /// Range primitive: the current token (enforce fails if the lexer is empty)
    Token front() /*@property*/
    {
        return std.exception.enforce(current, "Lexer has already reached the end");
    }

    /// Range primitive: advances to the next token (enforce fails if already empty)
    void popFront() /*@property*/
    {
        std.exception.enforce(current, "Lexer has already reached the end");
        current = readNext();
    }

    /// Range primitive: a new lexer over a saved copy of the reader,
    /// positioned on the same current token
    typeof(this) save() /*@property*/
    {
        return new typeof(this)(reader.save, current);
    }

private: // implementation

    Reader reader;  // remaining input
    Token current;  // token returned by front(); null once the input is exhausted

    // Between calls the reader is either exhausted or positioned on a
    // non-space character.
    invariant()
    {
        assert( reader.empty || !isSpace(reader.front) );
    }

    // Skips leading spaces; reads the first token unless one is supplied
    // (save() supplies the already-read current token).
    this( Reader reader, Token current = null )
    {
        this.reader = reader;
        readWhile!isSpace();
        this.current = (current is null ? readNext() : current);
    }

    public static
    {
        // isspace/isalnum are bound by the module's selective import of
        // std.ctype, so they are called by their bare names.

        /// Whitespace as classified by isspace
        bool isSpace (dchar c) { return isspace(c)!=0; }
        /// Characters in 0x21..0x7f that are not alphanumeric, '_' or '\''
        bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !isalnum(c) && c!='_' && c!='\''; }
        /// Symbols that always form a token by themselves
        bool isSSymbol (dchar c) { return "()[]{};@".canFind(c); }
        /// Symbols that may be chained into a multi-character token
        bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
        /// Anything that is neither a space nor a symbol
        bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); }
    }

    /// Reads the rest of a quoted string (the opening '"' already consumed).
    string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}

    /// Reads characters up to and including the closing '"', appending the
    /// string content to buf. `\"` and `\\` are unescaped; any other backslash
    /// is kept literally.
    ///
    /// Throws: UnexpectedEOF (reported at pos) if the input ends first.
    string readQuoted(const LexPosition pos, ref char[] buf)
    {
        // Iterative on purpose: one stack frame per character would make the
        // recursion depth grow with the length of the string literal.
        for(;;)
        {
            if( reader.empty )
                throw genex!UnexpectedEOF(pos, "Quoted string not terminated");
            dchar c = reader.front;
            reader.popFront;
            if( c == '"' )
                return assumeUnique(buf);
            if( c == '\\' && !reader.empty )
            {
                // Only `\"` and `\\` are escapes; consume the escaped character.
                dchar escaped = reader.front;
                if( escaped=='"' || escaped=='\\' )
                {
                    reader.popFront;
                    c = escaped;
                }
            }
            buf ~= c;
        }
    }

    /// Consumes characters while fn holds and returns them as a string.
    string readWhile(alias fn)()
    {
        char[] buf;
        for(; !reader.empty && fn(reader.front); reader.popFront)
            buf ~= reader.front;
        return assumeUnique(buf);
    }

    /// Reads the next token, or returns null at the end of input.
    /// On success the trailing whitespace is consumed as well.
    Token readNext()
    {
        // Skip any number of '#' comments (each runs to the end of its line).
        // A loop keeps the stack depth constant for long runs of comment lines.
        while( !reader.empty && reader.front == '#' )
        {
            reader = find(reader, '\n');
            readWhile!isSpace();
        }
        if( reader.empty )
            return null;

        scope(success)
            readWhile!isSpace();

        auto pos = reader.currentPosition();
        dchar first = reader.front;

        if( first == '"' ) // quoted
        {
            reader.popFront;
            return new Token(pos, readQuoted(pos), true);
        }
        if( isSSymbol(first) ) // paren
        {
            string s; s~=first; reader.popFront;
            return new Token(pos, s, false);
        }
        if( isMSymbol(first) ) // symbol
            return new Token(pos, readWhile!isMSymbol(), false);

        // identifier, number, or any other run of "letters"
        return new Token(pos, readWhile!isLetter(), false);
    }
}
249
unittest
{
    // Lexer must be a forward range whose elements are Tokens.
    assert( is(ElementType!Lexer == Token) );
    assert( isForwardRange!Lexer );
}
255
unittest
{
    // Plain words, a multi-character symbol and single-character symbols,
    // spread over two lines (the "\r\n" counts as one line break).
    Token[] toks = std.array.array( lexerFromString("this is a \t\r\n pen :-( @@; ") );

    // "this"
    assert_eq( toks[0].pos.lineno, 1 );
    assert_eq( toks[0].pos.column, 1 );
    assert( !toks[0].quoted );
    assert_eq( toks[0].str, "this" );

    // "is"
    assert_eq( toks[1].pos.lineno, 1 );
    assert_eq( toks[1].pos.column, 6 );
    assert( !toks[1].quoted );
    assert_eq( toks[1].str, "is" );

    // "a"
    assert_eq( toks[2].pos.lineno, 1 );
    assert_eq( toks[2].pos.column, 9 );
    assert( !toks[2].quoted );
    assert_eq( toks[2].str, "a" );

    // "pen" on the second line
    assert_eq( toks[3].pos.lineno, 2 );
    assert_eq( toks[3].pos.column, 2 );
    assert( !toks[3].quoted );
    assert_eq( toks[3].str, "pen" );

    // ":-" is one token, "(" is split off
    assert_eq( toks[4].pos.lineno, 2 );
    assert_eq( toks[4].pos.column, 6 );
    assert_eq( toks[4].str, ":-" );

    assert_eq( toks[5].pos.lineno, 2 );
    assert_eq( toks[5].pos.column, 8 );
    assert_eq( toks[5].str, "(" );

    // parens, semicolons and at-marks are always split
    assert_eq( toks[6].str, "@" );
    assert_eq( toks[7].str, "@" );
    assert_eq( toks[8].str, ";" );

    assert_eq( toks.length, 9 );
}
294
unittest
{
    // !! be sure to run the unittest on the root of the source directory
    auto fileLexer = lexerFromFile("polemy/lex.d");

    // Skip ahead to the module declaration of this very file.
    fileLexer = find!`a.str == "module"`(fileLexer);
    assert_eq( fileLexer.front.str, "module" );
    assert_eq( fileLexer.front.pos.filename, "polemy/lex.d" );
    assert_eq( fileLexer.front.pos.lineno, 7 );
    assert_eq( fileLexer.front.pos.column, 1 );

    fileLexer.popFront;
    assert_eq( fileLexer.front.str, "polemy" );
    assert_eq( fileLexer.front.pos.lineno, 7 );
    assert_eq( fileLexer.front.pos.column, 8 );

    // Step over four tokens to reach the first import on the next line.
    foreach(step; 0 .. 4)
        fileLexer.popFront;
    assert_eq( fileLexer.front.str, "import" );
    assert_eq( fileLexer.front.pos.lineno, 8 );
    assert_eq( fileLexer.front.pos.column, 1 );
}
316
unittest
{
    // A lone opening quote is an unterminated string; the lexer reads its
    // first token on construction, so creating it already throws.
    assert_throw!UnexpectedEOF( lexerFromString(`"`) );
}
321
unittest
{
    // Comments, quoted strings with escapes, and a quoted string spanning lines.
    auto lexer = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`);
    Token[] toks = std.array.array(lexer);

    // Line 1: everything after '#' is dropped.
    assert_eq( toks[0].str, "my" );
    assert_eq( toks[0].pos.lineno, 1 );
    assert( !toks[0].quoted );

    // Line 2 is a comment only; line 3 holds three tokens.
    assert_eq( toks[1].str, "be" );
    assert_eq( toks[1].pos.lineno, 3 );
    assert( !toks[1].quoted );
    assert_eq( toks[2].str, "ignored" );
    assert( !toks[2].quoted );
    assert_eq( toks[3].str, "." );
    assert( !toks[3].quoted );

    // Line 4: a word immediately followed by two quoted strings, then a comment.
    assert_eq( toks[4].str, "hahaha" );
    assert_eq( toks[4].pos.lineno, 4 );
    assert( !toks[4].quoted );
    assert_eq( toks[5].str, "hihihi" );
    assert_eq( toks[5].pos.lineno, 4 );
    assert( toks[5].quoted );
    assert_eq( toks[6].str, `hu\"huhu` );
    assert_eq( toks[6].pos.lineno, 4 );
    assert( toks[6].quoted );

    // Line 5: two words, then a quoted string running over three lines
    // ('#' inside quotes is not a comment; "\r\n" is read as "\n").
    assert_eq( toks[7].str, "123" );
    assert_eq( toks[7].pos.lineno, 5 );
    assert_eq( toks[8].str, "aa" );
    assert_eq( toks[9].pos.lineno, 5 );
    assert_eq( toks[9].str, "aaa\nbbb # 123\neee" );
    assert( toks[9].quoted );

    // Last token sits on line 8.
    assert_eq( toks[10].pos.lineno, 8 );
    assert( !toks[10].quoted );
    assert_eq( toks.length, 11 );
}
360
unittest
{
    // save() must yield a lexer that advances independently of the original.
    auto original = lexerFromString(" a12\n3a 5 ");
    assert_eq( original.front.str, "a12" );
    original.popFront;

    auto snapshot = original.save;
    assert_eq( original.front.str, "3a" );
    original.popFront;

    // The snapshot is still at "3a" while the original moved on to "5".
    assert_eq( snapshot.front.str, "3a" );
    assert_eq( original.front.str, "5" );

    original.popFront;
    snapshot.popFront;
    assert( original.empty );
    assert( !snapshot.empty );
    assert_eq( snapshot.front.str, "5" );
}
377
unittest
{
    // A symbol run stops at a quote, and an empty quoted string is a token.
    auto lexer = lexerFromString(`=""`);
    assert_eq( lexer.front.str, "=" );
    lexer.popFront;
    assert_eq( lexer.front.str, "" );
    lexer.popFront;
    assert( lexer.empty );

    // A symbol run also stops at a single-character symbol such as '@'.
    auto minusAt = lexerFromString(`-@`);
    assert_eq( minusAt.front.str, "-" );
}
386
/// Forward range that reads a character sequence one character at a time,
/// keeping track of position information and converting "\r\n" (and a lone
/// '\r') into '\n'.

struct PositionedReader(CharSeq)
    if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq)==dchar) )
{
    CharSeq buffer;  /// remaining input
    string filename; /// file name reported by currentPosition
    int lineno;      /// line number of the front character
    int column;      /// column number of the front character

    /// Range primitive
    bool empty() /*@property*/
    {
        return buffer.empty;
    }

    /// Range primitive: the front character, with '\r' reported as '\n'
    dchar front() /*@property*/
    {
        dchar c = buffer.front;
        return (c=='\r' ? '\n' : c);
    }

    /// Range primitive: consumes one logical character and updates the position
    void popFront() /*@property*/
    {
        dchar c = buffer.front;
        buffer.popFront;
        if( c=='\r' )
        {
            // "\r\n" is a single line break: swallow the '\n' as well.
            if( !buffer.empty && buffer.front=='\n' )
                buffer.popFront;
            c = '\n';
        }
        if( c=='\n' )
        {
            lineno ++;
            column = 1;
        }
        else
            column ++;
    }

    /// Range primitive: an independent copy at the same position
    typeof(this) save() /*@property*/
    {
        // A plain struct copy would share the underlying buffer when CharSeq
        // is a reference type, so the buffer itself has to be saved too.
        auto copy = this;
        copy.buffer = buffer.save;
        return copy;
    }

    /// Get the current position
    immutable(LexPosition) currentPosition() const
    {
        return new immutable(LexPosition)(filename, lineno, column);
    }
}
443
unittest
{
    // The reader over a string must itself be a forward range of dchar.
    alias PositionedReader!string StringReader;
    assert( isForwardRange!StringReader );
    assert( is(ElementType!StringReader == dchar) );
}