1 /**
2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 *
5 * Lexer for Polemy programming language.
6 */
7 module polemy.lex;
8 import polemy._common;
9
10 import std.file : readText;
11 import std.string : munch;
12 import std.ctype;
13
/// A location in a source text: the file name together with a line number
/// and a column number, both counted from 1.

class LexPosition
{
	immutable string filename; /// name of the source file
	immutable int    lineno;   /// line number, 1, 2, ...
	immutable int    column;   /// column, 1, 2, ...

	/// Renders the position in the form "filename:lineno:column".
	override string toString() const
	{
		auto text = sprintf!"%s:%d:%d"(filename, lineno, column);
		return text;
	}

	// Constructor and comparison operators are supplied by the project mixins.
	mixin SimpleConstructor;
	mixin SimpleCompare;
}
28
unittest
{
	auto first  = new LexPosition("hello.cpp", 123, 45);
	auto second = new LexPosition("hello.cpp", 123, 46);

	// The fields hold what was passed to the constructor.
	assert_eq( first.filename, "hello.cpp" );
	assert_eq( first.lineno, 123 );
	assert_eq( first.column, 45 );

	// String form is "file:line:column".
	assert_eq( to!string(first), "hello.cpp:123:45" );

	// Positions differing only in column are unequal and ordered by column.
	assert_ne( first, second );
	assert_lt( first, second );

	// There is no default constructor, and the fields cannot be reassigned.
	assert( !__traits(compiles, new LexPosition) );
	assert( !__traits(compiles, first.filename="foo") );
	assert( !__traits(compiles, first.lineno  =789) );
	assert( !__traits(compiles, first.column  =222) );
}
46
/// A single token produced by the lexer: its text, where it was found,
/// and whether it was written as a "quoted" literal.

class Token
{
	immutable LexPosition pos;    /// Position where the token occurred in the source
	immutable string      str;    /// The token string itself
	immutable bool        quoted; /// Was it a "quoted" token or unquoted?

	// Constructor and comparison operators are supplied by the project mixins.
	mixin SimpleConstructor;
	mixin SimpleCompare;
}
58
unittest
{
	auto where    = new immutable(LexPosition)("hello.cpp", 123, 45);
	auto plain    = new Token(where, "class", false);
	auto inQuotes = new Token(where, "class", true);

	// The fields hold what was passed to the constructor.
	assert_eq( plain.pos, where );
	assert_eq( plain.str, "class" );
	assert( !plain.quoted );
	assert( inQuotes.quoted );

	// Equality and ordering take every field into account.
	assert_eq( plain, new Token(where, "class", false) );
	assert_ne( plain, inQuotes );
	assert_lt( plain, new Token(where, "struct", false) );

	// There is no default constructor, and the fields cannot be reassigned.
	assert( !__traits(compiles, new Token) );
	assert( !__traits(compiles, plain.pos=where) );
	assert( !__traits(compiles, plain.str=789) );
	assert( !__traits(compiles, plain.quoted=true) );
}
78
/// Named constructor for Lexer: reads the whole file `filename` as text and
/// lexes it. Any extra arguments in `rest` are forwarded to lexerFromString
/// after the file name.

Lexer lexerFromFile(T...)( string filename, T rest )
{
	string contents = readText(filename);
	return lexerFromString( contents, filename, rest );
}
85
/// Named constructor for Lexer: lexes the text `str`. `filename`, `lineno`
/// and `column` give the position reported for the first character of `str`.

Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 )
{
	auto lexer = new Lexer(str, filename, lineno, column);
	return lexer;
}
92
/// Lexer is a forward range of Tokens.
///
/// The input is split into three kinds of tokens:
///   - a string literal enclosed in double quotes (reported with quoted == true),
///   - a single symbol character (see isSymbol), one token per character,
///   - a maximal run of characters that are neither whitespace nor symbols.
/// A '#' outside of a string literal starts a comment that runs to the end of
/// the line. Whitespace between tokens is discarded.

class Lexer
{
	/// Range primitive: true once every token has been consumed.
	bool empty() /*@property*/
	{
		return current is null;
	}

	/// Range primitive: the token at the head of the range.
	/// Throws: via std.exception.enforce when the lexer is already empty.
	Token front() /*@property*/
	{
		return std.exception.enforce(current, "Lexer has already reached the end");
	}

	/// Range primitive: drops the head token and lexes the next one.
	/// Throws: via std.exception.enforce when the lexer is already empty.
	void popFront() /*@property*/
	{
		std.exception.enforce(current, "Lexer has already reached the end");
		current = readNext();
	}

	/// Range primitive: returns a new Lexer in the same state as this one.
	/// Advancing either lexer afterwards does not affect the other.
	Lexer save() /*@property*/
	{
		// The field order below matches the constructor's parameter order.
		return new Lexer(this.tupleof);
	}

private: // implementation

	string buffer;   // source text that has not been consumed yet
	string filename; // file name reported in token positions
	int    lineno;   // line number of buffer[0]
	int    column;   // column number of buffer[0]
	Token  current;  // head token of the range, null when exhausted

	// Between public calls the unread text never starts with whitespace.
	invariant()
	{
		assert( buffer.empty || !std.ctype.isspace(buffer[0]) );
	}

	// Starts lexing `buffer`, whose first character is at lineno:column of
	// `filename`. When `current` is given (used by save) it becomes the head
	// token; otherwise the first token is read from the buffer.
	this( string buffer, string filename, int lineno, int column, Token current=null )
	{
		this.buffer   = buffer;
		this.filename = filename;
		this.lineno   = lineno;
		this.column   = column;
		skipws();
		this.current  = (current is null ? readNext() : current);
	}

	// Discards leading whitespace and keeps lineno/column in sync.
	// "\n", "\r" and "\r\n" each count as one line break. Every other
	// character accepted by std.ctype.isspace advances the column by one.
	// Using the same predicate as the invariant and the word scanner in
	// readNext ensures that no whitespace character is left at the head of
	// the buffer, so readNext always makes progress.
	void skipws()
	{
		while( !buffer.empty )
		{
			immutable char c = buffer[0];
			if( c == '\n' || c == '\r' )
			{
				buffer = buffer[1..$];
				// treat "\r\n" as a single line break
				if( c == '\r' && !buffer.empty && buffer[0] == '\n' )
					buffer = buffer[1..$];
				lineno ++;
				column = 1;
			}
			else if( std.ctype.isspace(c) )
			{
				buffer = buffer[1..$];
				column ++;
			}
			else
				break;
		}
	}

	// Consumes and returns the first character of the buffer, advancing the
	// column by one. The caller must make sure the buffer is not empty.
	char readChar()
	{
		immutable char c = buffer[0];
		buffer = buffer[1..$];
		column ++;
		return c;
	}

	/// This is the main lexing routine.
	/// Returns the next token, or null when the input is exhausted.
	Token readNext()
	{
		// Skip comments in a loop rather than by recursion, so that a long
		// run of comment lines does not grow the call stack.
		while( !buffer.empty && buffer[0] == '#' )
		{
			while( !buffer.empty && buffer[0] != '\n' && buffer[0] != '\r' )
				readChar();
			skipws();
		}

		if( buffer.empty )
			return null;

		// Whatever token is read, leave the buffer at the next non-space char.
		scope(exit)
			skipws();

		auto pos = currentPosition();

		// string literal
		if( buffer[0] == '"' )
		{
			string lit = readQuoted();
			return new Token(pos, lit, true);
		}

		// normal symbol: always exactly one character per token
		if( isSymbol(buffer[0]) )
		{
			auto sym = ""~readChar();
			return new Token(pos, sym, false);
		}

		// word: everything up to the next whitespace or symbol
		size_t len = 0;
		while( len < buffer.length && !std.ctype.isspace(buffer[len]) && !isSymbol(buffer[len]) )
			++len;
		auto word = buffer[0 .. len];
		buffer = buffer[len .. $];
		column += cast(int) len;
		return new Token(pos, word, false);
	}

	// Reads a string literal whose opening quote is at buffer[0] and returns
	// its contents without the surrounding quotes.
	//   - \\ and \" produce a single backslash / double quote,
	//   - a backslash before any other character is kept as it is,
	//   - line breaks inside the literal ("\n", "\r", "\r\n") are stored as "\n".
	// A literal that is still open at the end of the input is accepted as if
	// it had been closed there.
	string readQuoted()
	{
		string lit;
		readChar(); // opening quote
		while( !buffer.empty && buffer[0] != '"' )
		{
			char c = readChar();
			if( c == '\\' )
			{
				if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') )
					lit ~= readChar();
				else
					lit ~= c;
			}
			else if( c == '\n' )
			{
				lit ~= c;
				lineno ++;
				column = 1;
			}
			else if( c == '\r' )
			{
				if( !buffer.empty && buffer[0]=='\n' )
					readChar();
				lit ~= '\n';
				lineno ++;
				column = 1;
			}
			else
				lit ~= c;
		}
		if( !buffer.empty )
			readChar(); // closing quote
		return lit;
	}

	// A symbol is an ASCII character in the range 0x21..0x7f that is neither
	// alphanumeric nor an underscore.
	bool isSymbol(char c)
	{
		return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_');
	}

	// Position of the first unread character.
	immutable(LexPosition) currentPosition()
	{
		return new immutable(LexPosition)(filename, lineno, column);
	}
}
266
unittest
{
	// Lexer must provide every forward range primitive.
	bool lexerIsForwardRange = std.range.isForwardRange!(Lexer);
	assert( lexerIsForwardRange );
}
271
unittest
{
	Token[] toks = std.array.array( lexerFromString("this is a \t\r\n pen :-( ") );

	// Checks position and text of the idx-th token.
	void checkAt(size_t idx, int line, int col, string text)
	{
		assert_eq( toks[idx].pos.lineno, line );
		assert_eq( toks[idx].pos.column, col );
		assert_eq( toks[idx].str, text );
	}

	// words on the first line, and one after the line break
	checkAt(0, 1, 1, "this");
	checkAt(1, 1, 6, "is");
	checkAt(2, 1, 9, "a");
	checkAt(3, 2, 2, "pen");
	foreach( idx; 0 .. 4 )
		assert( !toks[idx].quoted );

	// consecutive symbols are always separated
	// hence, no "++" or "<<" or ...
	checkAt(4, 2, 6, ":");
	checkAt(5, 2, 7, "-");
	checkAt(6, 2, 8, "(");

	assert_eq( toks.length, 7 );
}
314
unittest
{
	auto original = lexerFromString(" a12\n3a 5 ");
	assert_eq( original.front.str, "a12" );
	original.popFront();

	// Take a snapshot while "3a" is the head token.
	auto snapshot = original.save();
	assert_eq( original.front.str, "3a" );

	// Advancing the original must not move the snapshot.
	original.popFront();
	assert_eq( snapshot.front.str, "3a" );
	assert_eq( original.front.str, "5" );

	// The original runs out one token before the snapshot does.
	original.popFront();
	snapshot.popFront();
	assert( original.empty );
	assert( !snapshot.empty );
	assert_eq( snapshot.front.str, "5" );
}
331
unittest
{
	//!! be sure to run the unittest on the root of the source directory
	auto src = lexerFromFile("polemy/lex.d");

	// Jump to the module declaration of this very file.
	src = find!`a.str == "module"`(src);
	assert_eq( src.front.str, "module" );
	assert_eq( src.front.pos.filename, "polemy/lex.d" );
	assert_eq( src.front.pos.lineno, 7 );
	assert_eq( src.front.pos.column, 1 );

	src.popFront();
	assert_eq( src.front.str, "polemy" );
	assert_eq( src.front.pos.lineno, 7 );
	assert_eq( src.front.pos.column, 8 );

	// The rest of the module declaration, one token at a time.
	foreach( expected; [".", "lex", ";"] )
	{
		src.popFront();
		assert_eq( src.front.str, expected );
	}

	// The first import statement follows on the next line.
	src.popFront();
	assert_eq( src.front.str, "import" );
	assert_eq( src.front.pos.lineno, 8 );
	assert_eq( src.front.pos.column, 1 );
}
356
unittest
{
	// Input mixing comments, string literals with escapes, and a literal
	// that spans several lines with "\r\n" line breaks.
	auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\r\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`);
	Token[] tokens = std.array.array(lex);

	// token texts
	assert_eq( tokens[0].str, "my" );
	assert_eq( tokens[1].str, "be" );
	assert_eq( tokens[2].str, "ignored" );
	assert_eq( tokens[3].str, "." );
	assert_eq( tokens[4].str, "hahaha" );
	assert_eq( tokens[5].str, "hihihi" );
	assert_eq( tokens[6].str, `hu\"huhu` );
	assert_eq( tokens[7].str, "123" );
	assert_eq( tokens[8].str, "aa" );
	assert_eq( tokens[9].str, "aaa\nbbb # 123\neee" );

	// line numbers
	assert_eq( tokens[0].pos.lineno, 1 );
	assert_eq( tokens[1].pos.lineno, 3 );
	assert_eq( tokens[4].pos.lineno, 4 );
	assert_eq( tokens[5].pos.lineno, 4 );
	assert_eq( tokens[6].pos.lineno, 4 );
	assert_eq( tokens[7].pos.lineno, 5 );
	assert_eq( tokens[9].pos.lineno, 5 );
	assert_eq( tokens[10].pos.lineno, 8 );

	// quoted flags
	foreach( idx; [0, 1, 2, 3, 4, 10] )
		assert( !tokens[idx].quoted );
	foreach( idx; [5, 6, 9] )
		assert( tokens[idx].quoted );

	assert_eq( tokens.length, 11 );
}