1 /**
2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 *
5 * Lexer for Polemy programming language.
6 */
7 module polemy.lex;
8 import polemy._common;
9
10 import std.file : readText;
11 import std.string : munch;
12 import std.ctype;
13
/// Represents a position in a source code
class LexPosition
{
	immutable string filename; /// name of the source file
	immutable int    lineno;   /// line number, 1, 2, ...
	immutable int    column;   /// column, 1, 2, ...

	// Both mixins come from polemy._common.
	// NOTE(review): presumably they generate a field-wise constructor
	// (filename, lineno, column) and field-wise comparison operators,
	// as exercised by the unittest below -- confirm against polemy._common.
	mixin SimpleConstructor;
	mixin SimpleCompare;

	/// Renders the position as "filename:lineno:column"
	override string toString() const
	{
		return sprintf!"%s:%d:%d"(filename, lineno, column);
	}
}
28
unittest
{
	auto first  = new LexPosition("hello.cpp", 123, 45);
	auto second = new LexPosition("hello.cpp", 123, 46);

	// fields carry exactly what was handed to the constructor
	assert_eq( first.filename, "hello.cpp" );
	assert_eq( first.lineno, 123 );
	assert_eq( first.column, 45 );

	// string form is "file:line:column"
	assert_eq( to!string(first), "hello.cpp:123:45" );

	// positions differing only in column are ordered by that column
	assert_lt( first, second );
	assert_ne( first, second );

	// no default constructor, and no field can be reassigned
	assert( !__traits(compiles, new LexPosition) );
	assert( !__traits(compiles, first.filename="foo") );
	assert( !__traits(compiles, first.lineno  =789) );
	assert( !__traits(compiles, first.column  =222) );
}
46
/// Represents a lexer token
class Token
{
	/// The kinds of token the lexer distinguishes (three at the moment)
	enum Kind
	{
		identifier,    /// anything other than others
		stringLiteral, /// "string literal"
		number         /// 42
	}

	immutable LexPosition pos;  /// position where the token occurred in the source
	immutable string      str;  /// the token string itself
	immutable Kind        kind; /// which kind of token?

	// Both mixins come from polemy._common.
	// NOTE(review): presumably they generate a constructor taking
	// (pos, str, kind) and field-wise comparison -- confirm against polemy._common.
	mixin SimpleConstructor;
	mixin SimpleCompare;
}
64
unittest
{
	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
	auto t = new Token(p, "class", Token.Kind.identifier);

	// fields carry exactly what was handed to the constructor
	assert_eq( t.pos, p );
	assert_eq( t.str, "class" );
	assert( t.kind == Token.Kind.identifier );

	// comparison looks at the token text
	assert_eq( t, new Token(p, "class", Token.Kind.identifier) );
	assert_lt( t, new Token(p, "struct", Token.Kind.identifier) );

	// no default constructor, and no field can be reassigned.
	// The right-hand sides are deliberately of the field's own type, so that
	// the only reason the assignment can fail to compile is immutability
	// (assigning an int to `str` would be rejected for any string field).
	assert( !__traits(compiles, new Token) );
	assert( !__traits(compiles, t.pos=p) );
	assert( !__traits(compiles, t.str="foo") );
	assert( !__traits(compiles, t.kind=Token.Kind.number) );
}
79
/// Named constructor for Lexer: lexes the contents of a file.
///
/// Params:
///   filename = path of the file to read; also used as the file name
///              recorded in token positions
///   rest     = optional extra arguments forwarded unchanged to
///              lexerFromString (its lineno and column parameters)
///
/// Returns: a Lexer positioned on the first token of the file
Lexer lexerFromFile(T...)( string filename, T rest )
{
	// Call readText through the selective import at the top of this module.
	// A selective import does not bring the package name `std.file` into
	// scope, so the fully qualified spelling is not guaranteed to resolve.
	return lexerFromString( readText(filename), filename, rest );
}
86
/// Named constructor for Lexer: lexes an in-memory string.
///
/// Params:
///   str      = the source text to tokenize
///   filename = name recorded in token positions
///   lineno   = line number assigned to the first line of str
///   column   = column number assigned to the first character of str
Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 )
{
	auto lexer = new Lexer(str, filename, lineno, column);
	return lexer;
}
93
/// Lexer is a forward range of Tokens
///
/// Tokens are produced lazily: the lexer always holds one already-read token
/// (`current`) plus the not-yet-consumed remainder of the source text.
/// '#' starts a comment that runs to the end of the line.
class Lexer
{
	/// Range primitive: true once every token has been consumed
	bool empty() /*@property*/
	{
		return current is null;
	}

	/// Range primitive: the token the lexer is positioned on.
	/// Fails via enforce when the lexer is already empty.
	Token front() /*@property*/
	{
		return std.exception.enforce(current, "Lexer has already reached the end");
	}

	/// Range primitive: advances to the next token.
	/// Fails via enforce when the lexer is already empty.
	void popFront() /*@property*/
	{
		std.exception.enforce(current, "Lexer has already reached the end");
		current = readNext();
	}

	/// Range primitive: returns a new Lexer in the same state as this one,
	/// which can then be advanced independently.
	Lexer save() /*@property*/
	{
		return new Lexer(buffer, filename, lineno, column, current);
	}

private: // implementation

	string buffer;   // not-yet-consumed rest of the source text
	string filename; // file name recorded in token positions
	int    lineno;   // line number of buffer[0]
	int    column;   // column number of buffer[0]
	Token  current;  // token returned by front(); null at end of input

	// Between public calls the buffer never starts with whitespace.
	invariant()
	{
		assert( buffer.empty || !std.ctype.isspace(buffer[0]) );
	}

	// When `current` is given (used by save()), it is adopted as the front
	// token; otherwise the first token is read from the buffer.
	this( string buffer, string filename, int lineno, int column, Token current=null )
	{
		this.buffer   = buffer;
		this.filename = filename;
		this.lineno   = lineno;
		this.column   = column;
		skipws();
		this.current  = (current is null ? readNext() : current);
	}

	// Skips all leading whitespace, keeping lineno/column up to date.
	// "\n", "\r" and "\r\n" each count as one line break.
	void skipws()
	{
		bool progress = false;
		do
		{
			// Blanks that stay on the same line. '\v' and '\f' are included
			// because std.ctype.isspace (used by the invariant and by the
			// word scanner) treats them as whitespace too; leaving them in
			// the buffer would break the invariant and make readNext return
			// empty tokens forever.
			string ws = buffer.munch(" \t\v\f");
			column += cast(int) ws.length;
			progress = !ws.empty;

			while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') )
			{
				progress = true;
				if( buffer[0] == '\n' )
					buffer = buffer[1..$];
				else // buffer[0] == '\r'
				{
					buffer = buffer[1..$];
					// "\r\n" is a single line break
					if( !buffer.empty && buffer[0]=='\n' )
						buffer = buffer[1..$];
				}
				lineno ++;
				column = 1;
			}
		}while( progress );
	}

	// Consumes and returns one character; the buffer must not be empty.
	// Only the column is advanced; callers handle line breaks themselves.
	char readChar()
	{
		scope(exit) {
			buffer = buffer[1..$];
			column ++;
		}
		return buffer[0];
	}

	/// This is the main lexing routine.
	/// Returns the next token, or null when the input is exhausted.
	Token readNext()
	{
		// Skip any number of consecutive '#' comments. This is a loop rather
		// than a recursive call so that long runs of comment lines do not
		// grow the call stack.
		while( !buffer.empty && buffer[0]=='#' )
		{
			while( !buffer.empty && (buffer[0]!='\n' && buffer[0]!='\r') )
				readChar();
			skipws();
		}

		if( buffer.empty )
			return null;

		// whatever token is read, leave the buffer at the start of the next one
		scope(exit)
			skipws();

		if( !isSymbol(buffer[0]) )
			return readWord();
		if( buffer[0] == '"' )
			return readStringLiteral();

		// normal symbol: always a single character,
		// so consecutive symbols become separate tokens
		auto pos = currentPosition();
		auto str = ""~readChar();
		return new Token(pos, str, Token.Kind.identifier);
	}

	// Reads a string literal; buffer[0] must be the opening '"'.
	// Inside the literal, \\ and \" stand for a backslash and a quote, any
	// other backslash is kept as is, and every line break ("\n", "\r" or
	// "\r\n") is stored as '\n'. A literal that reaches the end of input
	// without a closing quote is accepted as is.
	Token readStringLiteral()
	{
		auto pos = currentPosition();
		string lit;
		readChar(); // opening quote
		while( !buffer.empty && buffer[0]!='"' )
		{
			char c = readChar();
			if( c == '\\' )
			{
				if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') )
					lit ~= readChar();
				else
					lit ~= c;
			}
			else if( c == '\n' )
			{
				lit ~= c;
				lineno++;
				column = 1;
			}
			else if( c == '\r' )
			{
				if( !buffer.empty && buffer[0]=='\n' )
					readChar();
				lit ~= '\n';
				lineno++;
				column = 1;
			}
			else
				lit ~= c;
		}
		if( !buffer.empty )
			readChar(); // closing quote
		return new Token(pos, lit, Token.Kind.stringLiteral);
	}

	// Reads a maximal run of characters that are neither whitespace nor
	// symbols. The run is a number token when it consists of the digits
	// '0'..'9' only, and an identifier token otherwise.
	Token readWord()
	{
		auto pos = currentPosition();
		size_t len = 0;
		while( len<buffer.length && !std.ctype.isspace(buffer[len]) && !isSymbol(buffer[len]) )
			++len;
		auto str = buffer[0 .. len];
		buffer = buffer[len .. $];
		column += cast(int) len;
		bool isNumber = find!(`a<'0' || '9'<a`)(str).empty;
		return new Token(pos, str, isNumber ? Token.Kind.number : Token.Kind.identifier);
	}

	// A symbol is any character in 0x21..0x7f that is neither alphanumeric
	// nor '_'.
	bool isSymbol(char c)
	{
		return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_');
	}

	// Position of buffer[0]
	immutable(LexPosition) currentPosition()
	{
		return new immutable(LexPosition)(filename, lineno, column);
	}
}

unittest
{
	// form feed and vertical tab are whitespace: they separate tokens and
	// each advances the column by one
	Token[] ts = std.array.array( lexerFromString("a\f\vb") );
	assert( ts.length == 2 );
	assert( ts[0].str == "a" );
	assert( ts[0].pos.column == 1 );
	assert( ts[1].str == "b" );
	assert( ts[1].pos.lineno == 1 );
	assert( ts[1].pos.column == 4 );
}
268
unittest
{
	// Lexer must provide empty/front/popFront/save with the right signatures
	assert( std.range.isForwardRange!Lexer );
}
273
unittest
{
	Token[] toks = std.array.array( lexerFromString("this is a \t\n pen :-( ") );

	// asserts that token `t` starts at the given line and column
	void startsAt(Token t, int line, int col)
	{
		assert( t.pos.lineno == line );
		assert( t.pos.column == col );
	}

	startsAt( toks[0], 1, 1 );
	assert( toks[0].kind == Token.Kind.identifier );
	assert( toks[0].str == "this" );

	startsAt( toks[1], 1, 6 );
	assert( toks[1].kind == Token.Kind.identifier );
	assert( toks[1].str == "is" );

	startsAt( toks[2], 1, 9 );
	assert( toks[2].kind == Token.Kind.identifier );
	assert( toks[2].str == "a" );

	// the line break resets the column
	startsAt( toks[3], 2, 2 );
	assert( toks[3].kind == Token.Kind.identifier );
	assert( toks[3].str == "pen" );

	// consecutive symbols are always separated
	// hence, no "++" or "<<" or ...

	startsAt( toks[4], 2, 6 );
	assert( toks[4].str == ":" );

	startsAt( toks[5], 2, 7 );
	assert( toks[5].str == "-" );

	startsAt( toks[6], 2, 8 );
	assert( toks[6].str == "(" );

	// trailing whitespace yields no further token
	assert( toks.length == 7 );
}
316
unittest
{
	auto primary = lexerFromString(" a12\n3a 5 ");

	// a word that merely contains digits is an identifier
	assert( primary.front.str == "a12" );
	assert( primary.front.kind == Token.Kind.identifier );
	primary.popFront();

	// take a snapshot, then advance the original past it
	auto snapshot = primary.save();
	assert( primary.front.str == "3a" );
	assert( primary.front.kind == Token.Kind.identifier );
	primary.popFront();

	// the snapshot has not moved
	assert( snapshot.front.str == "3a" );
	assert( snapshot.front.kind == Token.Kind.identifier );

	// a word made of digits only is a number
	assert( primary.front.str == "5" );
	assert( primary.front.kind == Token.Kind.number );

	// the two lexers reach the end independently
	primary.popFront();
	snapshot.popFront();
	assert( primary.empty );
	assert( !snapshot.empty );
	assert( snapshot.front.str == "5" );
	assert( snapshot.front.kind == Token.Kind.number );
}
338
unittest
{
	//!! be sure to run the unittest on the root of the source directory
	// This test lexes this very source file, so the expected positions
	// below depend on the layout of its first lines.
	auto src = lexerFromFile("polemy/lex.d");

	// jump to the module declaration
	src = find!`a.str == "module"`(src);
	assert( src.front.str == "module", src.front.str );
	assert( src.front.pos.filename == "polemy/lex.d" );
	assert( src.front.pos.lineno == 7 );
	assert( src.front.pos.column == 1 );

	src.popFront();
	assert( src.front.str == "polemy" );
	assert( src.front.pos.lineno == 7 );
	assert( src.front.pos.column == 8 );

	// the dotted module name comes out as three tokens
	src.popFront();
	assert( src.front.str == "." );
	src.popFront();
	assert( src.front.str == "lex" );
	src.popFront();
	assert( src.front.str == ";" );

	// first token of the following line
	src.popFront();
	assert( src.front.str == "import" );
	assert( src.front.pos.lineno == 8 );
	assert( src.front.pos.column == 1 );
}
363
unittest
{
	// NOTE: the continuation lines of this raw string must start in the
	// first column; the expected text of the multi-line literal depends on it.
	auto source = lexerFromString(`my # comment should
# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa
bbb # 123
eee"
zzz
`);
	Token[] toks = std.array.array(source);

	// comments are dropped, both trailing and on a line of their own
	assert( toks[0].str == "my" );
	assert( toks[0].pos.lineno == 1 );
	assert( toks[1].str == "be" );
	assert( toks[1].pos.lineno == 3 );
	assert( toks[2].str == "ignored" );
	assert( toks[3].str == "." );

	// an identifier directly followed by two string literals
	assert( toks[4].str == "hahaha" );
	assert( toks[4].pos.lineno == 4 );
	assert( toks[4].kind == Token.Kind.identifier );
	assert( toks[5].str == "hihihi" );
	assert( toks[5].pos.lineno == 4 );
	assert( toks[5].kind == Token.Kind.stringLiteral );

	// \\ and \" inside a literal become a backslash and a quote
	assert( toks[6].str == `hu\"huhu` );
	assert( toks[6].kind == Token.Kind.stringLiteral );
	assert( toks[6].pos.lineno == 4 );

	// the '#' right after the literal starts a comment
	assert( toks[7].str == "123" );
	assert( toks[7].pos.lineno == 5 );
	assert( toks[7].kind == Token.Kind.number );
	assert( toks[8].str == "aa" );

	// a literal may span lines, and '#' inside it is not a comment
	assert( toks[9].pos.lineno == 5 );
	assert( toks[9].str == "aaa\nbbb # 123\neee" );
	assert( toks[9].kind == Token.Kind.stringLiteral );

	// line counting continues correctly after the multi-line literal
	assert( toks[10].pos.lineno == 8 );
	assert( toks.length == 11 );
}