1 /**
2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 *
5 * Lexer for Polemy programming language.
6 */
7 module polemy.lex;
8 import polemy._common;
9 import std.file : readText;
10 import std.ctype : isspace, isalnum;
11
/*mixin*/
/// Mixin template that equips an exception class with a source position
/// and a constructor that prefixes the message with that position.
template ExceptionWithPosition()
{
    /// Source position this exception refers to.
    const LexPosition pos;

    /// Builds the exception. The message handed to the base class is
    /// "[<pos>] <msg>"; file, line and next are forwarded unchanged.
    this( const LexPosition pos, string msg, string file=null, size_t line=0, Throwable next=null )
    {
        auto text = sprintf!"[%s] %s"(pos, msg);
        super(text, file, line, next);
        this.pos = pos;
    }
}
19
/// Exception carrying a source position; the lexer throws it when the input
/// ends inside a quoted string that was never terminated.
class UnexpectedEOF : Exception
{
    mixin ExceptionWithPosition;
}
25
/// Exception type for lexing errors; carries a source position
/// through ExceptionWithPosition.
class LexException : Exception
{
    mixin ExceptionWithPosition;
}
31
/// A location inside a piece of source code: file name, line and column.
class LexPosition
{
    immutable string filename; /// name of the source file
    immutable int    lineno;   /// line number, 1-origin
    immutable int    column;   /// column number, 1-origin

    mixin SimpleClass;

    /// Formats the position as "filename:lineno:column".
    override string toString() const
    {
        return sprintf!"%s:%d:%d"(filename, lineno, column);
    }

    /// Placeholder position ("<unnamed>", line 0, column 0),
    /// assigned by the static constructor below.
    static immutable LexPosition dummy;

    static this()
    {
        dummy = new immutable(LexPosition)("<unnamed>",0,0);
    }
}
47
unittest
{
    // Construction, field access and string conversion.
    auto here = new LexPosition("hello.cpp", 123, 45);

    assert_eq( here.filename, "hello.cpp" );
    assert_eq( here.lineno, 123 );
    assert_eq( here.column, 45 );
    assert_eq( to!string(here), "hello.cpp:123:45" );

    // There is no default constructor and the fields cannot be reassigned.
    assert( !__traits(compiles, new LexPosition) );
    assert( !__traits(compiles, here.filename="foo") );
    assert( !__traits(compiles, here.lineno =789) );
    assert( !__traits(compiles, here.column =222) );

    // A position one column further compares as greater and unequal.
    auto there = new LexPosition("hello.cpp", 123, 46);
    assert_lt( here, there );
    assert_ne( here, there );
}
66
/// A single token produced by the lexer.
class Token
{
    immutable LexPosition pos;    /// where in the source the token started
    immutable string      str;    /// the text of the token
    immutable bool        quoted; /// true when the token was written as a "quoted" string

    mixin SimpleClass;
}
77
unittest
{
    auto where    = new immutable(LexPosition)("hello.cpp", 123, 45);
    auto plain    = new Token(where, "class", false);
    auto inQuotes = new Token(where, "class", true);

    // Field access.
    assert_eq( plain.pos, where );
    assert_eq( plain.str, "class" );
    assert( !plain.quoted );
    assert( inQuotes.quoted );

    // Equality and ordering take all fields into account.
    assert_eq( plain, new Token(where, "class", false) );
    assert_lt( plain, new Token(where, "struct", false) );
    assert_ne( plain, inQuotes );

    // There is no default constructor and the fields cannot be reassigned.
    assert( !__traits(compiles, new Token) );
    assert( !__traits(compiles, plain.pos=where) );
    assert( !__traits(compiles, plain.str=789) );
    assert( !__traits(compiles, plain.quoted=true) );
}
97
/// Named constructor for Lexer: lexes the contents of a file.
///
/// Params:
///   filename = path of the file to read; also recorded as the file name
///              in the positions of the produced tokens
///   rest     = optional extra arguments forwarded to lexerFromString
///              (its lineno and column parameters)
///
/// The file is read in full with readText before lexing starts.
Lexer lexerFromFile(T...)( string filename, T rest )
{
    // Call readText through the selective import at the top of the module;
    // that import binds only `readText`, not the `std.file` package name.
    return lexerFromString( readText(filename), filename, rest );
}
104
/// Named constructor for Lexer: lexes an in-memory character sequence.
///
/// Params:
///   str      = the characters to lex
///   filename = file name recorded in token positions
///   lineno   = line number assigned to the first character
///   column   = column number assigned to the first character
LexerT!(PositionedReader!CharSeq) /* ddoc doesn't recognize auto return... bugzilla:2581 */
lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
{
    alias PositionedReader!CharSeq Source;
    auto source = Source(str, filename, lineno, column);
    return new LexerT!(Source)(source);
}
114
/// The standard lexer type: a forward range of Tokens reading from a string.
/// Callers normally only need to treat it as such a range.
alias LexerT!(PositionedReader!(string)) Lexer;
118
/// Lexer implementation: a forward range of Tokens built on top of a
/// forward range of dchar (the Reader).
///
/// Token kinds recognized by readNext:
///   - '#' starts a comment that runs up to the next newline
///   - '"' starts a quoted token (\" and \\ are the recognized escapes)
///   - each of ( ) [ ] { } ; @ is a token on its own
///   - a run of other symbol characters forms one token
///   - a run of any remaining non-space characters forms one token
class LexerT(Reader)
    if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
{
    /// Range primitive: true once all tokens have been consumed.
    bool empty() /*@property*/
    {
        return current is null;
    }

    /// Range primitive: the current token.
    /// Fails through enforce when the lexer is already empty.
    Token front() /*@property*/
    {
        return std.exception.enforce(current, "Lexer has already reached the end");
    }

    /// Range primitive: advances to the next token.
    /// Fails through enforce when the lexer is already empty.
    void popFront() /*@property*/
    {
        std.exception.enforce(current, "Lexer has already reached the end");
        current = readNext();
    }

    /// Range primitive: an independent lexer positioned at the same token.
    typeof(this) save() /*@property*/
    {
        return new typeof(this)(reader.save, current);
    }

private: // implementation

    Reader reader;  // remaining input; never positioned on whitespace (see invariant)
    Token  current; // token returned by front; null once the input is exhausted

    invariant()
    {
        assert( reader.empty || !isspace(reader.front) );
    }

    /// Takes over `reader`, skips leading whitespace and, unless a current
    /// token is supplied (as save() does), reads the first token.
    this( Reader reader, Token current = null )
    {
        this.reader = reader;
        readWhile!isSpace();
        this.current = (current is null ? readNext() : current);
    }

    public static {
        /// Whitespace, as classified by isspace.
        bool isSpace  (dchar c) { return isspace(c)!=0; }
        /// Character in 0x21..0x7f that is not alphanumeric, '_' or '\''.
        bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !isalnum(c) && c!='_' && c!='\''; }
        /// Symbols that always form a single-character token.
        bool isSSymbol (dchar c) { return "()[]{};@".canFind(c); }
        /// Symbols that may be combined into a multi-character token.
        bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
        /// Anything that is neither whitespace nor a symbol.
        bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); }
    }

    /// Reads the rest of a quoted string; the opening '"' must already have
    /// been consumed. `pos` is only used for the error report.
    string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}

    /// As above, appending to `buf`. Consumes input up to and including the
    /// closing '"' and returns the accumulated text.
    /// Throws: UnexpectedEOF when the input ends before the closing quote.
    string readQuoted(const LexPosition pos, ref char[] buf)
    {
        // Iterative on purpose: one stack frame per character would make
        // long quoted strings overflow the stack.
        for(;;)
        {
            if( reader.empty )
                throw genex!UnexpectedEOF(pos, "Quoted string not terminated");
            dchar c = reader.front;
            reader.popFront;
            if( c == '"' )
                return assumeUnique(buf);
            // Only \" and \\ are escapes; any other backslash is kept as is
            // and the character after it is handled by the next iteration.
            if( c == '\\' && !reader.empty && (reader.front=='"' || reader.front=='\\') )
            {
                c = reader.front;
                reader.popFront;
            }
            buf ~= c;
        }
    }

    /// Consumes characters while `fn` accepts them and returns them as a string.
    string readWhile(alias fn)()
    {
        char[] buf;
        for(; !reader.empty && fn(reader.front); reader.popFront)
            buf ~= reader.front;
        return assumeUnique(buf);
    }

    /// Reads the next token, or returns null at the end of the input.
    /// On success the reader is left on the first non-space character
    /// after the token.
    Token readNext()
    {
        // Skip any number of consecutive comments without recursing.
        while( !reader.empty && reader.front == '#' )
        {
            reader = find(reader, '\n');
            readWhile!isSpace();
        }
        if( reader.empty )
            return null;

        scope(success)
            readWhile!isSpace();

        auto pos = reader.currentPosition();
        dchar c = reader.front;
        if( c == '"' ) // quoted
        {
            reader.popFront;
            return new Token(pos, readQuoted(pos), true);
        }
        if( isSSymbol(c) ) // paren
        {
            string s; s ~= c; reader.popFront;
            return new Token(pos, s, false);
        }
        if( isMSymbol(c) ) // symbol
            return new Token(pos, readWhile!isMSymbol(), false);
        return new Token(pos, readWhile!isLetter(), false);
    }
}
240
unittest
{
    // Lexer must be usable as a forward range whose elements are Tokens.
    alias ElementType!(Lexer) Elem;
    assert( std.range.isForwardRange!(Lexer) );
    assert( is(Elem == Token) );
}
246
unittest
{
    // Words, a line break in the middle, a multi-char symbol and single-char symbols.
    Token[] toks = std.array.array( lexerFromString("this is a \t\r\n pen :-( @@; ") );

    // "this" at 1:1
    assert_eq( toks[0].str, "this" );
    assert_eq( toks[0].pos.lineno, 1 );
    assert_eq( toks[0].pos.column, 1 );
    assert( !toks[0].quoted );

    // "is" at 1:6
    assert_eq( toks[1].str, "is" );
    assert_eq( toks[1].pos.lineno, 1 );
    assert_eq( toks[1].pos.column, 6 );
    assert( !toks[1].quoted );

    // "a" at 1:9
    assert_eq( toks[2].str, "a" );
    assert_eq( toks[2].pos.lineno, 1 );
    assert_eq( toks[2].pos.column, 9 );
    assert( !toks[2].quoted );

    // "pen" at 2:2, i.e. "\r\n" counted as one line break
    assert_eq( toks[3].str, "pen" );
    assert_eq( toks[3].pos.lineno, 2 );
    assert_eq( toks[3].pos.column, 2 );
    assert( !toks[3].quoted );

    // ":-" at 2:6
    assert_eq( toks[4].str, ":-" );
    assert_eq( toks[4].pos.lineno, 2 );
    assert_eq( toks[4].pos.column, 6 );

    // "(" at 2:8
    assert_eq( toks[5].str, "(" );
    assert_eq( toks[5].pos.lineno, 2 );
    assert_eq( toks[5].pos.column, 8 );

    // Parentheses, semicolons and at-marks are always split into single tokens.
    assert_eq( toks[6].str, "@" );
    assert_eq( toks[7].str, "@" );
    assert_eq( toks[8].str, ";" );

    assert_eq( toks.length, 9 );
}
285
unittest
{
    // NOTE: this test reads "polemy/lex.d" by relative path, so it must be
    // run from the root of the source directory.
    auto fileLexer = find!`a.str == "module"`( lexerFromFile("polemy/lex.d") );

    // The `module` keyword of this very file.
    assert_eq( fileLexer.front.str, "module" );
    assert_eq( fileLexer.front.pos.filename, "polemy/lex.d" );
    assert_eq( fileLexer.front.pos.lineno, 7 );
    assert_eq( fileLexer.front.pos.column, 1 );

    // Followed by `polemy` on the same line.
    fileLexer.popFront;
    assert_eq( fileLexer.front.str, "polemy" );
    assert_eq( fileLexer.front.pos.lineno, 7 );
    assert_eq( fileLexer.front.pos.column, 8 );

    // Advance four tokens to reach the `import` on the next line.
    foreach( step; 0 .. 4 )
        fileLexer.popFront;
    assert_eq( fileLexer.front.str, "import" );
    assert_eq( fileLexer.front.pos.lineno, 8 );
    assert_eq( fileLexer.front.pos.column, 1 );
}
307
unittest
{
    // A lone opening quote must be reported as UnexpectedEOF when the lexer is built.
    assert_throw!(UnexpectedEOF)( lexerFromString(`"`) );
}
312
unittest
{
    // Comments, adjacent quoted strings, escapes and line breaks inside quotes.
    auto source = `my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`;
    Token[] toks = std.array.array( lexerFromString(source) );

    // Line 1: everything after '#' is dropped, as is the whole of line 2.
    assert_eq( toks[0].str, "my" );
    assert_eq( toks[0].pos.lineno, 1 );
    assert( !toks[0].quoted );

    // Line 3.
    assert_eq( toks[1].str, "be" );
    assert_eq( toks[1].pos.lineno, 3 );
    assert( !toks[1].quoted );
    assert_eq( toks[2].str, "ignored" );
    assert( !toks[2].quoted );
    assert_eq( toks[3].str, "." );
    assert( !toks[3].quoted );

    // Line 4: a word directly followed by two quoted strings, then a comment.
    assert_eq( toks[4].str, "hahaha" );
    assert_eq( toks[4].pos.lineno, 4 );
    assert( !toks[4].quoted );
    assert_eq( toks[5].str, "hihihi" );
    assert_eq( toks[5].pos.lineno, 4 );
    assert( toks[5].quoted );
    assert_eq( toks[6].str, `hu\"huhu` );
    assert_eq( toks[6].pos.lineno, 4 );
    assert( toks[6].quoted );

    // Line 5: the quoted string spans three lines; '#' inside it is not a comment.
    assert_eq( toks[7].str, "123" );
    assert_eq( toks[7].pos.lineno, 5 );
    assert_eq( toks[8].str, "aa" );
    assert_eq( toks[9].pos.lineno, 5 );
    assert_eq( toks[9].str, "aaa\nbbb # 123\neee" );
    assert( toks[9].quoted );

    // Line 8: the last token.
    assert_eq( toks[10].pos.lineno, 8 );
    assert( !toks[10].quoted );

    assert_eq( toks.length, 11 );
}
351
unittest
{
    // A saved lexer advances independently of the one it was saved from.
    auto original = lexerFromString(" a12\n3a 5 ");
    assert_eq( original.front.str, "a12" );
    original.popFront;

    auto snapshot = original.save;
    assert_eq( original.front.str, "3a" );
    original.popFront;

    // The snapshot still sits on the token where it was taken.
    assert_eq( snapshot.front.str, "3a" );
    assert_eq( original.front.str, "5" );

    original.popFront;
    snapshot.popFront;
    assert( original.empty );
    assert( !snapshot.empty );
    assert_eq( snapshot.front.str, "5" );
}
368
unittest
{
    // An empty quoted string right after a symbol yields an empty token.
    auto tokens = lexerFromString(`=""`);
    assert_eq(tokens.front.str, "=");
    tokens.popFront;
    assert_eq(tokens.front.str, "");
    tokens.popFront;
    assert( tokens.empty );

    // '@' is never merged into a preceding symbol run.
    auto mixed = lexerFromString(`-@`);
    assert_eq( mixed.front.str, "-" );
}
377
/// Forward range that reads a character sequence one character at a time,
/// keeping track of the current position and presenting both "\r\n" and a
/// lone "\r" as a single '\n'.
private
struct PositionedReader(CharSeq)
    if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
{
    CharSeq buffer;   // remaining input
    string  filename; // file name reported in positions
    int     lineno;   // line of the character at the front
    int     column;   // column of the character at the front

    /// Range primitive: true when no input is left.
    bool empty() /*@property*/
    {
        return buffer.empty;
    }

    /// Range primitive: the current character, with '\r' reported as '\n'.
    dchar front() /*@property*/
    {
        dchar c = buffer.front;
        return (c=='\r' ? '\n' : c);
    }

    /// Range primitive: consumes one character and updates lineno/column.
    void popFront() /*@property*/
    {
        dchar c = buffer.front;
        buffer.popFront;
        if( c=='\r' )
        {
            // Swallow the '\n' of a "\r\n" pair so it counts as one line break.
            if( !buffer.empty && buffer.front=='\n' )
                buffer.popFront;
            c = '\n';
        }
        if( c=='\n' )
        {
            lineno ++;
            column = 1;
        }
        else
            column ++;
    }

    /// Range primitive: an independent copy positioned at the same character.
    typeof(this) save() /*@property*/
    {
        // A plain struct copy would share the underlying range when CharSeq
        // is a reference type, so the wrapped range is saved explicitly.
        auto copy = this;
        copy.buffer = buffer.save;
        return copy;
    }

    /// Get the current position (file name, line and column of the front character)
    immutable(LexPosition) currentPosition() const
    {
        return new immutable(LexPosition)(filename, lineno, column);
    }
}
435
unittest
{
    // The string-backed reader must be a forward range of dchar.
    alias PositionedReader!(string) StringReader;
    assert( isForwardRange!(StringReader) );
    assert( is(ElementType!(StringReader) == dchar) );
}