1 /**
2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 *
5 * Lexer for Polemy programming language.
6 */
7 module polemy.lex;
8 import polemy._common;
9 import std.file : readText;
10 import std.ctype : isspace, isalnum;
11
12 /// Exception from this module
13
class LexException : Exception
{
    /// Source position the error is reported for
    const LexPosition pos;

    /// Builds the exception. The message passed to the base class is
    /// formatted as "[pos] msg"; file and line are forwarded unchanged.
    this( const LexPosition pos, string msg, string file="", int line=0 )
    {
        super( sprintf!"[%s] %s"(pos, msg), file, line );
        this.pos = pos;
    }
}
21
22 /// Represents a position in a source code
23
class LexPosition
{
    // NOTE: the declaration order of the fields is kept as-is; the mixed-in
    // SimpleClass presumably derives the constructor and comparison from it.
    immutable string filename; /// name of the source file
    immutable int    lineno;   /// 1-origin
    immutable int    column;   /// 1-origin

    mixin SimpleClass;

    /// Renders the position as "filename:lineno:column"
    override string toString() const
    {
        return sprintf!"%s:%d:%d"(filename, lineno, column);
    }

    /// Placeholder position named "<unnamed>" with line 0 and column 0
    static immutable LexPosition dummy;

    static this()
    {
        dummy = new immutable(LexPosition)("<unnamed>",0,0);
    }
}
37
unittest
{
    auto first  = new LexPosition("hello.cpp", 123, 45);
    auto second = new LexPosition("hello.cpp", 123, 46);

    // field access and string conversion
    assert_eq( first.filename, "hello.cpp" );
    assert_eq( first.lineno, 123 );
    assert_eq( first.column, 45 );
    assert_eq( to!string(first), "hello.cpp:123:45" );

    // ordering and inequality between positions differing only in column
    assert_lt( first, second );
    assert_ne( first, second );

    // no default constructor, and the fields cannot be reassigned
    assert( !__traits(compiles, new LexPosition) );
    assert( !__traits(compiles, first.filename="foo") );
    assert( !__traits(compiles, first.lineno =789) );
    assert( !__traits(compiles, first.column =222) );
}
55
56 /// Represents a lexer token
57
class Token
{
    // NOTE: the declaration order of the fields is kept as-is; the mixed-in
    // SimpleClass presumably derives the constructor and comparison from it.
    immutable LexPosition pos;    /// Position where the token occurred in the source
    immutable string      str;    /// The token string itself
    immutable bool        quoted; /// Was it a "quoted" token or unquoted?

    mixin SimpleClass;
}
66
unittest
{
    auto where    = new immutable(LexPosition)("hello.cpp", 123, 45);
    auto plain    = new Token(where, "class", false);
    auto inQuotes = new Token(where, "class", true);

    // field access
    assert_eq( plain.pos, where );
    assert_eq( plain.str, "class" );
    assert( !plain.quoted );

    // equality and ordering
    assert_eq( plain, new Token(where, "class", false) );
    assert_lt( plain, new Token(where, "struct", false) );
    assert_ne( plain, inQuotes );
    assert( inQuotes.quoted );

    // no default constructor, and the fields cannot be reassigned
    assert( !__traits(compiles, new Token) );
    assert( !__traits(compiles, plain.pos=where) );
    assert( !__traits(compiles, plain.str=789) );
    assert( !__traits(compiles, plain.quoted=true) );
}
86
/// Named constructors for Lexer
88
/// Reads the whole file `filename` as text and builds a lexer over it.
/// `filename` is also used as the file name recorded in token positions;
/// any extra arguments in `rest` are forwarded to lexerFromString.
auto lexerFromFile(T...)( string filename, T rest )
{
    auto sourceText = std.file.readText(filename);
    return lexerFromString( sourceText, filename, rest );
}
93
/// Builds a lexer over the character sequence `str`.
/// `filename`, `lineno` and `column` give the position reported for the
/// first character of `str`.
auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
{
    alias PositionedReader!CharSeq Source;

    auto source = Source(str, filename, lineno, column);
    return new LexerT!(Source)(source);
}
100
101 /// Standard Lexer Type (all you have to know is that this is a forward range of Tokens)
102
// LexerT instantiated over a PositionedReader of `string`; this is the type
// lexerFromString yields when it is given a `string`.
alias LexerT!(PositionedReader!string) Lexer;
104
105 /// Lexer Implementation
106
class LexerT(Reader)
    if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
{
    /// Range primitive: true once there is no token left
    bool empty() /*@property*/
    {
        return current is null;
    }

    /// Range primitive: the current token.
    /// Throws (via enforce) if the lexer has already reached the end.
    Token front() /*@property*/
    {
        return std.exception.enforce(current, "Lexer has already reached the end");
    }

    /// Range primitive: advances to the next token.
    /// Throws (via enforce) if the lexer has already reached the end.
    void popFront() /*@property*/
    {
        std.exception.enforce(current, "Lexer has already reached the end");
        current = readNext();
    }

    /// Range primitive: an independent lexer at the same position
    typeof(this) save() /*@property*/
    {
        return new typeof(this)(reader.save, current);
    }

private: // implementation

    Reader reader;  // remaining input, always positioned on a non-space character (or at the end)
    Token  current; // the token returned by front; null once the input is exhausted

    invariant()
    {
        assert( reader.empty || !std.ctype.isspace(reader.front) );
    }

    // When `current` is given (as save() does), `reader` is expected to be
    // positioned just after that token; otherwise the first token is read here.
    this( Reader reader, Token current = null )
    {
        this.reader = reader;
        readWhile!isSpace();
        this.current = (current is null ? readNext() : current);
    }

    public static {
        /// Whitespace according to std.ctype.isspace
        bool isSpace  (dchar c) { return std.ctype.isspace(c)!=0; }
        /// Characters in 0x21..0x7f that are not alphanumeric, '_' or '\''
        bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
        /// Symbols that always form a one-character token
        bool isSSymbol(dchar c) { return !find("()[]{};", c).empty; }
        /// Symbols that may be combined into a multi-character token
        bool isMSymbol(dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
        /// Everything that is neither a space nor a symbol
        bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); }
    }

    /// Reads the body of a quoted string whose opening '"' has already been
    /// consumed, and consumes the closing '"'.
    /// Throws LexException (reported at `pos`) if the input ends first.
    string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}

    /// ditto; characters are appended to `buf`
    string readQuoted(const LexPosition pos, ref char[] buf)
    {
        // Iterative on purpose: recursing once per character can exhaust the
        // stack on long string literals, since tail calls are not guaranteed
        // to be eliminated.
        for(;;)
        {
            if( reader.empty )
                throw new LexException(pos, "EOF found while lexing a quoted-string");
            dchar c = reader.front;
            reader.popFront;
            if( c == '"' )
                return assumeUnique(buf);
            // Only \" and \\ are escape sequences; a backslash followed by
            // anything else (or by the end of input) is kept literally.
            if( c == '\\' && !reader.empty && (reader.front=='"' || reader.front=='\\') )
            {
                c = reader.front;
                reader.popFront;
            }
            buf ~= c;
        }
    }

    /// Consumes the longest prefix of the input whose characters all satisfy
    /// `fn` and returns it
    string readWhile(alias fn)()
    {
        char[] buf;
        for(; !reader.empty && fn(reader.front); reader.popFront)
            buf ~= reader.front;
        return assumeUnique(buf);
    }

    /// Reads one token, or returns null at the end of the input.
    /// On success the whitespace following the token is consumed as well.
    Token readNext()
    {
        // Skip any number of consecutive comments. A comment runs from '#'
        // up to the end of the line. (A loop rather than recursion, so that
        // long runs of comment lines do not deepen the call stack.)
        while( !reader.empty && reader.front == '#' )
        {
            reader = find(reader, '\n');
            readWhile!isSpace();
        }
        if( reader.empty )
            return null;

        scope(success)
            readWhile!isSpace();

        auto pos = reader.currentPosition();
        if( reader.front == '"' ) // quoted
        {
            reader.popFront;
            return new Token(pos, readQuoted(pos), true);
        }
        else if( isSSymbol(reader.front) ) // paren
        {
            string s; s~=reader.front; reader.popFront;
            return new Token(pos, s, false);
        }
        else if( isMSymbol(reader.front) ) // symbol
        {
            return new Token(pos, readWhile!isMSymbol(), false);
        }
        else
        {
            return new Token(pos, readWhile!isLetter(), false);
        }
    }
}
226
unittest
{
    // The standard Lexer type has to satisfy the forward-range interface
    immutable bool lexerIsForwardRange = std.range.isForwardRange!(Lexer);
    assert( lexerIsForwardRange );
}
231
unittest
{
    Token[] toks = std.array.array( lexerFromString("this is a \t\r\n pen :-( @@; ") );

    // plain words on the first line
    auto w0 = toks[0];
    assert_eq( w0.pos.lineno, 1 );
    assert_eq( w0.pos.column, 1 );
    assert( !w0.quoted );
    assert_eq( w0.str, "this" );

    auto w1 = toks[1];
    assert_eq( w1.pos.lineno, 1 );
    assert_eq( w1.pos.column, 6 );
    assert( !w1.quoted );
    assert_eq( w1.str, "is" );

    auto w2 = toks[2];
    assert_eq( w2.pos.lineno, 1 );
    assert_eq( w2.pos.column, 9 );
    assert( !w2.quoted );
    assert_eq( w2.str, "a" );

    // "\r\n" counts as a single line break
    auto w3 = toks[3];
    assert_eq( w3.pos.lineno, 2 );
    assert_eq( w3.pos.column, 2 );
    assert( !w3.quoted );
    assert_eq( w3.str, "pen" );

    // consecutive symbols are merged into one token...
    auto s4 = toks[4];
    assert_eq( s4.pos.lineno, 2 );
    assert_eq( s4.pos.column, 6 );
    assert_eq( s4.str, ":-" );

    // ...except for parens and semicolons, which are always split off
    auto s5 = toks[5];
    assert_eq( s5.pos.lineno, 2 );
    assert_eq( s5.pos.column, 8 );
    assert_eq( s5.str, "(" );
    assert_eq( toks[6].str, "@@" );
    assert_eq( toks[7].str, ";" );

    assert_eq( toks.length, 8 );
}
269
unittest
{
    // !! be sure to run the unittest on the root of the source directory
    // (this test lexes the module's own source file)
    auto self = lexerFromFile("polemy/lex.d");
    self = find!`a.str == "module"`(self);

    auto tok = self.front;
    assert_eq( tok.str, "module" );
    assert_eq( tok.pos.filename, "polemy/lex.d" );
    assert_eq( tok.pos.lineno, 7 );
    assert_eq( tok.pos.column, 1 );

    self.popFront;
    tok = self.front;
    assert_eq( tok.str, "polemy" );
    assert_eq( tok.pos.lineno, 7 );
    assert_eq( tok.pos.column, 8 );

    // advance four tokens to reach the first token of the next line
    foreach(i; 0 .. 4)
        self.popFront;
    tok = self.front;
    assert_eq( tok.str, "import" );
    assert_eq( tok.pos.lineno, 8 );
    assert_eq( tok.pos.column, 1 );
}
291
unittest
{
    // An unterminated quoted string is reported as a LexException
    assert_throw!LexException(
        lexerFromString(`"`)
    );
}
296
unittest
{
    auto lexer = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`);
    Token[] toks = std.array.array(lexer);

    // line 1: everything after '#' is a comment; line 2 is a comment as a whole
    assert_eq( toks[0].str, "my" );
    assert_eq( toks[0].pos.lineno, 1 );
    assert( !toks[0].quoted );

    // line 3
    assert_eq( toks[1].str, "be" );
    assert_eq( toks[1].pos.lineno, 3 );
    assert( !toks[1].quoted );
    assert_eq( toks[2].str, "ignored" );
    assert( !toks[2].quoted );
    assert_eq( toks[3].str, "." );
    assert( !toks[3].quoted );

    // line 4: a word, two adjacent quoted strings (the second one with
    // escapes), then a trailing comment
    assert_eq( toks[4].str, "hahaha" );
    assert_eq( toks[4].pos.lineno, 4 );
    assert( !toks[4].quoted );
    assert_eq( toks[5].str, "hihihi" );
    assert_eq( toks[5].pos.lineno, 4 );
    assert( toks[5].quoted );
    assert_eq( toks[6].str, `hu\"huhu` );
    assert_eq( toks[6].pos.lineno, 4 );
    assert( toks[6].quoted );

    // line 5: the quoted string spans several lines; '#' inside it is not a
    // comment, and "\r\n" inside it is read as "\n"
    assert_eq( toks[7].str, "123" );
    assert_eq( toks[7].pos.lineno, 5 );
    assert_eq( toks[8].str, "aa" );
    assert_eq( toks[9].pos.lineno, 5 );
    assert_eq( toks[9].str, "aaa\nbbb # 123\neee" );
    assert( toks[9].quoted );

    // line 8: the last token
    assert_eq( toks[10].pos.lineno, 8 );
    assert( !toks[10].quoted );
    assert_eq( toks.length, 11 );
}
335
unittest
{
    auto original = lexerFromString(" a12\n3a 5 ");
    assert_eq( original.front.str, "a12" );
    original.popFront;

    // A saved copy advances independently of the lexer it was taken from
    auto snapshot = original.save;
    assert_eq( original.front.str, "3a" );
    original.popFront;
    assert_eq( snapshot.front.str, "3a" );
    assert_eq( original.front.str, "5" );

    original.popFront;
    snapshot.popFront;
    assert( original.empty );
    assert( !snapshot.empty );
    assert_eq( snapshot.front.str, "5" );
}
352
unittest
{
    // An empty quoted string directly after a symbol is still a token
    auto lexer = lexerFromString(`=""`);

    assert_eq( lexer.front.str, "=" );
    lexer.popFront;

    assert_eq( lexer.front.str, "" );
    lexer.popFront;

    assert( lexer.empty );
}
360
/// Forward range that reads its input character by character,
/// keeping track of position information and taking care of the \r\n -> \n conversion.
363
private
struct PositionedReader(CharSeq)
    if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
{
    CharSeq buffer;   // remaining input
    string  filename; // file name recorded in the positions handed out
    int     lineno;   // line of the front character
    int     column;   // column of the front character

    /// Range primitive
    bool empty() /*@property*/
    {
        return buffer.empty;
    }

    /// Range primitive: the front character, with '\r' presented as '\n'
    dchar front() /*@property*/
    {
        dchar c = buffer.front;
        return (c=='\r' ? '\n' : c);
    }

    /// Range primitive: drops the front character and updates the position.
    /// "\r\n" is dropped as a whole, so that it counts as a single line break;
    /// a lone '\r' counts as a line break too.
    void popFront() /*@property*/
    {
        dchar c = buffer.front;
        buffer.popFront;
        if( c=='\r' )
        {
            if( !buffer.empty && buffer.front=='\n' )
                buffer.popFront;
            c = '\n';
        }
        if( c=='\n' )
        {
            lineno ++;
            column = 1;
        }
        else
            column ++;
    }

    /// Range primitive: an independent copy of this reader
    typeof(this) save() /*@property*/
    {
        // A plain struct copy would share `buffer` with the original whenever
        // CharSeq has reference semantics, so the underlying range is saved
        // explicitly.
        auto copy = this;
        copy.buffer = buffer.save;
        return copy;
    }

    /// Get the current position
    immutable(LexPosition) currentPosition() const
    {
        return new immutable(LexPosition)(filename, lineno, column);
    }
}
418
unittest
{
    // PositionedReader over a string must be a forward range of dchar,
    // which is what LexerT's template constraint asks for
    alias PositionedReader!string StringReader;

    assert( isForwardRange!(StringReader) );
    assert( is(ElementType!(StringReader) == dchar) );
}