1 /**
2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 *
5 * Lexer for Polemy programming language.
6 */
7 module polemy.lex;
8 import polemy._common;
9 import std.file : readText;
10 import std.ctype : isspace, isalnum;
11
/// Mixin template that equips an exception class with a source position.
///
/// It declares a `pos` field and a constructor which prefixes the message
/// with the position, formatted as "[pos] msg", before forwarding
/// everything to the base-class constructor.
/*mixin*/
template ExceptionWithPosition()
{
    /// Position in the source the exception refers to
    const LexPosition pos;

    this( const LexPosition pos, string msg, string file=null, size_t line=0, Throwable next=null )
    {
        super(sprintf!"[%s] %s"(pos, msg), file, line, next);
        this.pos = pos;
    }
}
19
/// Thrown when the end of input is reached in the middle of a lexical token
class UnexpectedEOF : Exception
{
    // Supplies the `pos` field and the position-aware constructor.
    mixin ExceptionWithPosition!();
}
26
/// Thrown when a lexical error is encountered
class LexException : Exception
{
    // Supplies the `pos` field and the position-aware constructor.
    mixin ExceptionWithPosition!();
}
33
/// Represents a position in source code: file name, line and column
class LexPosition
{
    immutable string filename; /// name of the source file
    immutable int    lineno;   /// 1-origin
    immutable int    column;   /// 1-origin

    // Project mixin; judging from the unit test below it presumably supplies
    // the field-wise constructor and the comparison operators, so the field
    // order above is kept as is.
    mixin SimpleClass;

    /// Formats the position as "filename:lineno:column".
    override string toString() const
    {
        return sprintf!"%s:%d:%d"(filename, lineno, column);
    }

    /// Placeholder position: file "<unnamed>", line 0, column 0.
    static immutable LexPosition dummy;

    static this()
    {
        dummy = new immutable(LexPosition)("<unnamed>", 0, 0);
    }
}
49
unittest
{
    // Construction and field access.
    auto first = new LexPosition("hello.cpp", 123, 45);

    assert_eq( first.filename, "hello.cpp" );
    assert_eq( first.lineno, 123 );
    assert_eq( first.column, 45 );
    assert_eq( text(first), "hello.cpp:123:45" );

    // No default constructor, and every field is read-only.
    assert( !__traits(compiles, new LexPosition) );
    assert( !__traits(compiles, first.filename="foo") );
    assert( !__traits(compiles, first.lineno =789) );
    assert( !__traits(compiles, first.column =222) );

    // Same file and line, one column further: compares greater and unequal.
    auto second = new LexPosition("hello.cpp", 123, 46);
    assert_lt( first, second );
    assert_ne( first, second );
}
68
/// Represents a single token produced by the lexer
class Token
{
    immutable LexPosition pos;    /// Position where the token occurred in the source
    immutable string      str;    /// The token string itself
    immutable bool        quoted; /// Was it a "quoted" token or unquoted?

    // Project mixin; judging from the unit test below it presumably supplies
    // the field-wise constructor and the comparison operators, so the field
    // order above is kept as is.
    mixin SimpleClass;
}
79
unittest
{
    auto where    = new immutable(LexPosition)("hello.cpp", 123, 45);
    auto bare     = new Token(where, "class", false);
    auto inQuotes = new Token(where, "class", true);

    // Field access.
    assert_eq( bare.pos, where );
    assert_eq( bare.str, "class" );
    assert( !bare.quoted );

    // Equality and ordering take the string and the quoted flag into account.
    assert_eq( bare, new Token(where, "class", false) );
    assert_lt( bare, new Token(where, "struct", false) );
    assert_ne( bare, inQuotes );
    assert( inQuotes.quoted );

    // No default constructor, and every field is read-only.
    assert( !__traits(compiles, new Token) );
    assert( !__traits(compiles, bare.pos=where) );
    assert( !__traits(compiles, bare.str=789) );
    assert( !__traits(compiles, bare.quoted=true) );
}
99
/// Named constructor for Lexer: tokenizes the contents of a file.
///
/// Params:
///   filename = path of the file to read; also passed on as the file name
///              recorded in token positions
///   ln_cn    = optional trailing arguments forwarded unchanged to
///              lexerFromString (its starting line number and column)
///
/// Returns: a Lexer over the text of the file
Lexer lexerFromFile(T...)( string filename, T ln_cn )
{
    // This module imports std.file selectively (`import std.file : readText;`),
    // which does not make the fully qualified name `std.file.readText`
    // available by itself, so the imported symbol is called directly.
    return lexerFromString( readText(filename), filename, ln_cn );
}
106
/// Named constructor for Lexer: tokenizes an in-memory character sequence.
///
/// Params:
///   str      = the text to tokenize
///   filename = file name recorded in token positions
///   lineno   = line number assigned to the first character
///   column   = column number assigned to the first character
LexerT!(PositionedReader!CharSeq) /* ddoc doesn't recognize auto return... bugzilla:2581 */
lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
{
    alias PositionedReader!CharSeq Reader;

    // Wrap the raw text in a position-tracking reader, then hand it to the lexer.
    auto reader = Reader(str, filename, lineno, column);
    return new LexerT!(Reader)(reader);
}
116
/// Standard Lexer Type (all you have to know is that this is a forward range of Tokens!)
alias LexerT!(PositionedReader!string) Lexer;

/// Lexer implementation: a forward range of Tokens built on top of a
/// forward range of dchar.
///
/// Besides the range primitives, the reader must provide currentPosition(),
/// which is used to stamp each token with its source position.
class LexerT(Reader)
    if( isForwardRange!(Reader) && is(ElementType!(Reader)==dchar) )
{
    /// Range primitive: true once every token has been consumed
    bool empty() /*@property*/
    {
        return current is null;
    }

    /// Range primitive: the current token; throws if the lexer is empty
    Token front() /*@property*/
    {
        return std.exception.enforce(current, "Lexer has already reached the end");
    }

    /// Range primitive: advances to the next token; throws if the lexer is empty
    void popFront() /*@property*/
    {
        std.exception.enforce(current, "Lexer has already reached the end");
        current = readNext();
    }

    /// Range primitive: an independent lexer at the same position
    typeof(this) save() /*@property*/
    {
        return new typeof(this)(reader.save, current);
    }

private: // implementation

    Reader reader;  // remaining input, always positioned just after `current`
    Token  current; // the token returned by front; null when exhausted

    invariant()
    {
        // Between calls the reader never rests on whitespace.
        assert( reader.empty || !isSpace(reader.front) );
    }

    this( Reader reader, Token current = null )
    {
        this.reader = reader;
        readWhile!isSpace();
        // When no token is supplied (the normal case), read the first one.
        this.current = (current is null ? readNext() : current);
    }

    public static
    {
        // isspace/isalnum are the names selectively imported from std.ctype at
        // the top of the module; they are called directly because a selective
        // import alone does not provide the fully qualified `std.ctype.*` names.

        /// Whitespace according to std.ctype.isspace
        bool isSpace  (dchar c) { return isspace(c)!=0; }
        /// ASCII 0x21..0x7f, excluding alphanumerics, '_' and '\''
        bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !isalnum(c) && c!='_' && c!='\''; }
        /// Symbols that always form a token of their own
        bool isSSymbol(dchar c) { return "()[]{};@".canFind(c); }
        /// Symbols that may be combined into a multi-character token
        bool isMSymbol(dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
        /// Everything that is neither whitespace nor a symbol
        bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); }
    }

    /// Reads the rest of a quoted string; the opening '"' is already consumed.
    string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}

    /// Reads the rest of a quoted string, appending its contents to `buf`.
    ///
    /// Consumes input up to and including the closing '"'.  The escape
    /// sequences \" and \\ yield a literal '"' and '\'; a backslash followed
    /// by anything else is kept verbatim.
    ///
    /// Params:
    ///   pos = position of the opening quote, used for error reporting
    ///   buf = accumulator for the string contents
    ///
    /// Throws: UnexpectedEOF if the input ends before the closing quote
    string readQuoted(const LexPosition pos, ref char[] buf)
    {
        // Implemented as a loop rather than one recursive call per character,
        // so that stack usage does not grow with the length of the literal.
        for(;;)
        {
            if( reader.empty )
                throw genex!UnexpectedEOF(pos, "Quoted string not terminated");

            dchar c = reader.front;
            reader.popFront;

            if( c == '"' )
                return assumeUnique(buf);

            // Recognized escapes: drop the backslash and take the next
            // character literally.
            if( c == '\\' && !reader.empty && (reader.front=='"' || reader.front=='\\') )
            {
                c = reader.front;
                reader.popFront;
            }
            buf ~= c;
        }
    }

    /// Consumes characters while `fn` holds and returns them as a string.
    string readWhile(alias fn)()
    {
        char[] buf;
        for(; !reader.empty && fn(reader.front); reader.popFront)
            buf ~= reader.front;
        return assumeUnique(buf);
    }

    /// Reads the next token, or returns null at the end of input.
    ///
    /// Expects the reader to be positioned on a non-space character (or at
    /// the end) and leaves it in the same state on successful return.
    Token readNext()
    {
        // Skip comments ('#' up to the end of the line) together with the
        // whitespace that follows them.  A loop is used so that a long run of
        // comment lines does not cost one stack frame per line.
        while( !reader.empty && reader.front == '#' )
        {
            reader = find(reader, '\n');
            readWhile!isSpace();
        }

        if( reader.empty )
            return null;

        // After a token has been read successfully, skip trailing whitespace.
        scope(success)
            readWhile!isSpace();

        auto pos = reader.currentPosition();

        if( reader.front == '"' ) // quoted
        {
            reader.popFront;
            return new Token(pos, readQuoted(pos), true);
        }
        if( isSSymbol(reader.front) ) // paren
        {
            string s; s~=reader.front; reader.popFront;
            return new Token(pos, s, false);
        }
        if( isMSymbol(reader.front) ) // symbol
            return new Token(pos, readWhile!isMSymbol(), false);

        // identifier, number, or anything else made of "letters"
        return new Token(pos, readWhile!isLetter(), false);
    }
}
243
unittest
{
    // The standard Lexer is a forward range whose elements are Tokens.
    assert( std.range.isForwardRange!Lexer );
    assert( is(ElementType!Lexer == Token) );
}
249
unittest
{
    // Words, whitespace (including a \r\n line break) and symbols.
    Token[] toks = std.array.array( lexerFromString("this is a \t\r\n pen :-( @@; ") );

    // Checks position and text of the i-th token.
    void expect(size_t i, int line, int col, string s)
    {
        assert_eq( toks[i].pos.lineno, line );
        assert_eq( toks[i].pos.column, col );
        assert_eq( toks[i].str, s );
    }

    expect(0, 1, 1, "this");
    assert( !toks[0].quoted );

    expect(1, 1, 6, "is");
    assert( !toks[1].quoted );

    expect(2, 1, 9, "a");
    assert( !toks[2].quoted );

    expect(3, 2, 2, "pen");
    assert( !toks[3].quoted );

    expect(4, 2, 6, ":-");
    expect(5, 2, 8, "(");

    // parens, semicolons and at-marks are always split into single tokens
    assert_eq( toks[6].str, "@" );
    assert_eq( toks[7].str, "@" );
    assert_eq( toks[8].str, ";" );

    assert_eq( toks.length, 9 );
}
288
unittest
{
    // !! be sure to run the unittest on the root of the source directory
    // Lexes this very source file and checks the positions of its first
    // few real tokens.
    auto src = find!`a.str == "module"`( lexerFromFile("polemy/lex.d") );

    assert_eq( src.front.str, "module" );
    assert_eq( src.front.pos.filename, "polemy/lex.d" );
    assert_eq( src.front.pos.lineno, 7 );
    assert_eq( src.front.pos.column, 1 );

    src.popFront;
    assert_eq( src.front.str, "polemy" );
    assert_eq( src.front.pos.lineno, 7 );
    assert_eq( src.front.pos.column, 8 );

    // Skip the four remaining tokens of the module declaration.
    foreach( _; 0 .. 4 )
        src.popFront;

    assert_eq( src.front.str, "import" );
    assert_eq( src.front.pos.lineno, 8 );
    assert_eq( src.front.pos.column, 1 );
}
310
unittest
{
    // An opening quote without a closing one must be reported as UnexpectedEOF.
    assert_throw!UnexpectedEOF(
        lexerFromString(`"`)
    );
}
315
unittest
{
    // Comments, quoted strings with escapes, and mixed line endings.
    auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`);
    Token[] toks = std.array.array(lex);

    // Line 1: everything after '#' is a comment; line 2 is a comment as a whole.
    assert_eq( toks[0].str, "my" );
    assert_eq( toks[0].pos.lineno, 1 );
    assert( !toks[0].quoted );

    // Line 3
    assert_eq( toks[1].str, "be" );
    assert_eq( toks[1].pos.lineno, 3 );
    assert( !toks[1].quoted );
    assert_eq( toks[2].str, "ignored" );
    assert( !toks[2].quoted );
    assert_eq( toks[3].str, "." );
    assert( !toks[3].quoted );

    // Line 4: adjacent unquoted and quoted tokens, then a trailing comment.
    assert_eq( toks[4].str, "hahaha" );
    assert_eq( toks[4].pos.lineno, 4 );
    assert( !toks[4].quoted );
    assert_eq( toks[5].str, "hihihi" );
    assert_eq( toks[5].pos.lineno, 4 );
    assert( toks[5].quoted );
    assert_eq( toks[6].str, `hu\"huhu` );
    assert_eq( toks[6].pos.lineno, 4 );
    assert( toks[6].quoted );

    // Line 5 onwards: a quoted string spanning several lines; '#' inside the
    // quotes is not a comment and \r\n is normalized to \n.
    assert_eq( toks[7].str, "123" );
    assert_eq( toks[7].pos.lineno, 5 );
    assert_eq( toks[8].str, "aa" );
    assert_eq( toks[9].pos.lineno, 5 );
    assert_eq( toks[9].str, "aaa\nbbb # 123\neee" );
    assert( toks[9].quoted );

    // Line 8
    assert_eq( toks[10].pos.lineno, 8 );
    assert( !toks[10].quoted );

    assert_eq( toks.length, 11 );
}
354
unittest
{
    // save() must yield a lexer that advances independently of the original.
    auto original = lexerFromString(" a12\n3a 5 ");
    assert_eq( original.front.str, "a12" );
    original.popFront;

    auto snapshot = original.save;

    assert_eq( original.front.str, "3a" );
    original.popFront;
    assert_eq( snapshot.front.str, "3a" );
    assert_eq( original.front.str, "5" );

    original.popFront;
    snapshot.popFront;

    assert( original.empty );
    assert( !snapshot.empty );
    assert_eq( snapshot.front.str, "5" );
}
371
unittest
{
    // A symbol immediately followed by an empty quoted string.
    auto toks = lexerFromString(`=""`);

    assert_eq(toks.front.str, "=");
    toks.popFront;

    assert_eq(toks.front.str, "");
    toks.popFront;

    assert( toks.empty );

    // '@' is never merged into a multi-character symbol.
    assert_eq( lexerFromString(`-@`).front.str, "-" );
}
380
/// Forward range that reads its input character by character, keeping track
/// of position information and converting "\r\n" (as well as a lone '\r')
/// into a single '\n'.
struct PositionedReader(CharSeq)
    if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq)==dchar) )
{
    CharSeq buffer;   /// the remaining input
    string  filename; /// file name reported by currentPosition
    int     lineno;   /// line number of the front character
    int     column;   /// column number of the front character

    /// Range primitive: true when no input is left
    bool empty() /*@property*/
    {
        return buffer.empty;
    }

    /// Range primitive: the current character, with '\r' reported as '\n'
    dchar front() /*@property*/
    {
        dchar c = buffer.front;
        if( c == '\r' )
            return '\n';
        return c;
    }

    /// Range primitive: consumes one character (a "\r\n" pair counts as one)
    /// and updates the line/column counters accordingly.
    void popFront() /*@property*/
    {
        dchar c = buffer.front;
        buffer.popFront;

        // Swallow the '\n' of a "\r\n" pair so that it is seen only once.
        if( c=='\r' && !buffer.empty && buffer.front=='\n' )
            buffer.popFront;

        if( c=='\r' || c=='\n' )
        {
            lineno ++;
            column = 1;
        }
        else
            column ++;
    }

    /// Range primitive: an independent copy of this reader.
    typeof(this) save() /*@property*/
    {
        // A plain struct copy would share the underlying sequence whenever
        // CharSeq is a reference type, so the buffer is saved explicitly.
        auto copy = this;
        copy.buffer = buffer.save;
        return copy;
    }

    /// Get the current position
    immutable(LexPosition) currentPosition() const
    {
        return new immutable(LexPosition)(filename, lineno, column);
    }
}
437
unittest
{
    // A PositionedReader over a string is a forward range of dchar.
    alias PositionedReader!string StringReader;

    assert( isForwardRange!StringReader );
    assert( is(ElementType!StringReader == dchar) );
}