1 /**
2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 *
5 * Lexer for Polemy programming language.
6 */
7 module polemy.lex;
8 import polemy._common;
9 import std.file : readText;
10 import std.ctype : isspace, isalnum;
11
/// Exception type thrown by this module; it carries the source position it refers to.

class LexException : Exception
{
    /// Position in the source code that the message refers to
    const LexPosition pos;

    /// Creates the exception. The final message has the form "msg [pos]".
    this( const LexPosition pos, string msg )
    {
        super( sprintf!"%s [%s]"(msg, pos) );
        this.pos = pos;
    }
}
20
/// Represents a position in a source code

class LexPosition
{
    immutable string filename; /// name of the source file
    immutable int    lineno;   /// line number, 1, 2, ...
    immutable int    column;   /// column, 1, 2, ...

    /// Formats the position as "filename:lineno:column".
    override string toString() const
        { return sprintf!"%s:%d:%d"(filename, lineno, column); }

    // NOTE(review): project mixins from polemy._common. Presumably they generate
    // a constructor taking (filename, lineno, column) and the comparison
    // operators from the fields in declaration order -- confirm there.
    mixin SimpleConstructor;
    mixin SimpleCompare;

    /// Placeholder position ("<unnamed>", line 0, column 0).
    static immutable LexPosition dummy;

    // Immutable static data is implicitly shared by all threads, so it has to be
    // initialized exactly once per process. A thread-local `static this` would
    // run again (and overwrite the immutable reference) in every new thread.
    shared static this(){ dummy = new immutable(LexPosition)("<unnamed>",0,0); }
}
38
unittest
{
    auto first  = new LexPosition("hello.cpp", 123, 45);
    auto second = new LexPosition("hello.cpp", 123, 46);

    // fields and string formatting
    assert_eq( first.filename, "hello.cpp" );
    assert_eq( first.lineno, 123 );
    assert_eq( first.column, 45 );
    assert_eq( to!string(first), "hello.cpp:123:45" );

    // ordering and (in)equality: the positions differ only in the column
    assert_lt( first, second );
    assert_ne( first, second );

    // no default constructor, and every field is read-only
    assert( !__traits(compiles, new LexPosition) );
    assert( !__traits(compiles, first.filename="foo") );
    assert( !__traits(compiles, first.lineno =789) );
    assert( !__traits(compiles, first.column =222) );
}
56
/// Represents a lexer token

class Token
{
    immutable LexPosition pos; /// Position where the token occurred in the source
    immutable string str; /// The token string itself
    immutable bool quoted; /// Was it a "quoted" token or unquoted?

    // NOTE(review): project mixins from polemy._common. The unit test in this
    // module constructs tokens as new Token(pos, str, quoted) and compares them
    // with assert_eq / assert_lt, so these presumably generate the constructor,
    // the comparison operators and toString from the fields above in
    // declaration order -- confirm in polemy._common before reordering fields.
    mixin SimpleConstructor;
    mixin SimpleCompare;
    mixin SimpleToString;
}
69
unittest
{
    auto where    = new immutable(LexPosition)("hello.cpp", 123, 45);
    auto bare     = new Token(where, "class", false);
    auto inQuotes = new Token(where, "class", true);

    // field access
    assert_eq( bare.pos, where );
    assert_eq( bare.str, "class" );
    assert( !bare.quoted );

    // comparison takes the string and the quoted flag into account
    assert_eq( bare, new Token(where, "class", false) );
    assert_lt( bare, new Token(where, "struct", false) );
    assert_ne( bare, inQuotes );
    assert( inQuotes.quoted );

    // no default constructor, and every field is read-only
    assert( !__traits(compiles, new Token) );
    assert( !__traits(compiles, bare.pos=where) );
    assert( !__traits(compiles, bare.str=789) );
    assert( !__traits(compiles, bare.quoted=true) );
}
89
/// Named Constructor for Lexer: reads the whole file as text and lexes it.
/// The file name is also used as the name reported in token positions;
/// any extra arguments are forwarded to lexerFromString.

auto lexerFromFile(T...)( string filename, T rest )
{
    auto source = readText(filename);
    return lexerFromString( source, filename, rest );
}
96
/// Named Constructor for Lexer: lexes the given character sequence.
/// filename, lineno and column give the position assigned to the first character.

auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
{
    alias PositionedReader!CharSeq Reader;

    auto reader = Reader(str, filename, lineno, column);
    return new LexerT!Reader(reader);
}
105
/// Standard Lexer Type (all users have to know is that this is a forward range of Tokens).
/// It is LexerT instantiated over a PositionedReader of string.

alias LexerT!(PositionedReader!string) Lexer;
109
/// Lexer Implementation: a forward range of Tokens built on top of a
/// forward range of dchar that can also report its current position.

class LexerT(Reader)
    if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
{
    /// Range primitive: true when there is no token left
    bool empty() /*@property*/
    {
        return current is null;
    }

    /// Range primitive: the current token.
    /// Throws (through enforce) if the lexer is already empty.
    Token front() /*@property*/
    {
        return std.exception.enforce(current, "Lexer has already reached the end");
    }

    /// Range primitive: advances to the next token.
    /// Throws (through enforce) if the lexer is already empty.
    void popFront() /*@property*/
    {
        std.exception.enforce(current, "Lexer has already reached the end");
        current = readNext();
    }

    /// Range primitive: an independent lexer at the same position
    typeof(this) save() /*@property*/
    {
        return new typeof(this)(reader.save, current);
    }

private: // implementation

    Reader reader;  // remaining input; always positioned just after `current`
    Token  current; // token returned by front, or null at the end of input

    // Between public calls the reader never rests on whitespace.
    invariant()
    {
        assert( reader.empty || !std.ctype.isspace(reader.front) );
    }

    // When `current` is given (as save() does), `reader` is expected to be
    // already positioned after that token; otherwise the first token is read here.
    this( Reader reader, Token current = null )
    {
        this.reader = reader;
        readWhile!isSpace();
        this.current = (current is null ? readNext() : current);
    }

    // Character classes used to split the input into tokens
    public static {
        bool isSpace   (dchar c) { return std.ctype.isspace(c)!=0; }
        bool isSymbol  (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
        bool isSSymbol (dchar c) { return !find("()[]{};", c).empty; } // always a token of its own
        bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c); } // may form multi-character tokens
        bool isLetter  (dchar c) { return !isSpace(c) && !isSymbol(c); }
    }

    // Reads the body of a quoted string; the opening '"' must already be consumed.
    // pos is only used for the error report.
    string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}

    // Same as above, appending to buf. Consumes up to and including the closing '"'.
    // \" and \\ are unescaped; any other backslash is kept literally.
    // Throws LexException if the input ends before the closing quote.
    string readQuoted(const LexPosition pos, ref char[] buf)
    {
        // Written as a loop on purpose: one recursive call per character
        // would need stack space proportional to the length of the literal.
        for(;;)
        {
            if( reader.empty )
                throw new LexException(pos, "EOF found while lexing a quoted-string");
            dchar c = reader.front;
            reader.popFront;
            if( c == '"' )
                return assumeUnique(buf);
            if( c == '\\' && !reader.empty && (reader.front=='"' || reader.front=='\\') )
            {
                // escape sequence: keep only the escaped character
                c = reader.front;
                reader.popFront;
            }
            buf ~= c;
        }
    }

    // Consumes and returns the longest prefix whose characters all satisfy fn.
    string readWhile(alias fn)()
    {
        char[] buf;
        for(; !reader.empty && fn(reader.front); reader.popFront)
            buf ~= reader.front;
        return assumeUnique(buf);
    }

    // Reads one token, then skips the whitespace following it.
    // Returns null at the end of input.
    Token readNext()
    {
        // Skip any number of consecutive '#' comments (each runs to the end of
        // its line) together with the whitespace after them. Done in a loop so
        // that many comment lines in a row do not deepen the call stack.
        while( !reader.empty && reader.front == '#' )
        {
            reader = find(reader, '\n');
            readWhile!isSpace();
        }
        if( reader.empty )
            return null;

        scope(success)
            readWhile!isSpace();

        auto pos = reader.currentPosition();
        if( reader.front == '"' ) // quoted
        {
            reader.popFront;
            return new Token(pos, readQuoted(pos), true);
        }
        else if( isSSymbol(reader.front) ) // paren
        {
            string s; s~=reader.front; reader.popFront;
            return new Token(pos, s, false);
        }
        else if( isMSymbol(reader.front) ) // symbol
        {
            return new Token(pos, readWhile!isMSymbol(), false);
        }
        else
        {
            return new Token(pos, readWhile!isLetter(), false);
        }
    }
}
231
unittest
{
    // The standard Lexer type must satisfy the forward range interface.
    bool lexerIsForwardRange = std.range.isForwardRange!(Lexer);
    assert( lexerIsForwardRange );
}
236
unittest
{
    auto lexer = lexerFromString("this is a \t\r\n pen :-( @@; ");
    Token[] toks = std.array.array(lexer);

    // first line: three plain words
    assert_eq( toks[0].pos.lineno, 1 );
    assert_eq( toks[0].pos.column, 1 );
    assert( !toks[0].quoted );
    assert_eq( toks[0].str, "this" );

    assert_eq( toks[1].pos.lineno, 1 );
    assert_eq( toks[1].pos.column, 6 );
    assert( !toks[1].quoted );
    assert_eq( toks[1].str, "is" );

    assert_eq( toks[2].pos.lineno, 1 );
    assert_eq( toks[2].pos.column, 9 );
    assert( !toks[2].quoted );
    assert_eq( toks[2].str, "a" );

    // "\r\n" counts as a single line break
    assert_eq( toks[3].pos.lineno, 2 );
    assert_eq( toks[3].pos.column, 2 );
    assert( !toks[3].quoted );
    assert_eq( toks[3].str, "pen" );

    // consecutive symbol characters are joined into one token ...
    assert_eq( toks[4].pos.lineno, 2 );
    assert_eq( toks[4].pos.column, 6 );
    assert_eq( toks[4].str, ":-" );

    // ... except parens and semicolons, which are always split
    assert_eq( toks[5].pos.lineno, 2 );
    assert_eq( toks[5].pos.column, 8 );
    assert_eq( toks[5].str, "(" );
    assert_eq( toks[6].str, "@@" );
    assert_eq( toks[7].str, ";" );

    assert_eq( toks.length, 8 );
}
274
unittest
{
    // !! be sure to run the unittest on the root of the source directory
    // This test lexes this very source file, so it depends on the exact
    // layout of its first lines (the module declaration and the first import).
    auto fileLexer = lexerFromFile("polemy/lex.d");
    fileLexer = find!`a.str == "module"`(fileLexer);

    auto moduleTok = fileLexer.front;
    assert_eq( moduleTok.str, "module" );
    assert_eq( moduleTok.pos.filename, "polemy/lex.d" );
    assert_eq( moduleTok.pos.lineno, 7 );
    assert_eq( moduleTok.pos.column, 1 );

    fileLexer.popFront;
    assert_eq( fileLexer.front.str, "polemy" );
    assert_eq( fileLexer.front.pos.lineno, 7 );
    assert_eq( fileLexer.front.pos.column, 8 );

    // step over the rest of the module declaration
    foreach( step; 0 .. 4 )
        fileLexer.popFront;

    assert_eq( fileLexer.front.str, "import" );
    assert_eq( fileLexer.front.pos.lineno, 8 );
    assert_eq( fileLexer.front.pos.column, 1 );
}
296
unittest
{
    // A quote that is never closed must be reported as a LexException.
    string unterminated = `"`;
    assert_throw!LexException( lexerFromString(unterminated) );
}
301
unittest
{
    // Input mixing '#' comments, quoted strings with escapes, and both
    // "\r\n" and "\n" line endings.
    auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`);
    Token[] ts = std.array.array(lex);
    // everything after '#' up to the end of the line is skipped
    assert_eq( ts[0].str, "my" );
    assert_eq( ts[0].pos.lineno, 1 );
    assert( !ts[0].quoted );
    assert_eq( ts[1].str, "be" );
    assert_eq( ts[1].pos.lineno, 3 );
    assert( !ts[1].quoted );
    assert_eq( ts[2].str, "ignored" );
    assert( !ts[2].quoted );
    assert_eq( ts[3].str, "." );
    assert( !ts[3].quoted );
    // a quote ends the preceding word even without whitespace in between
    assert_eq( ts[4].str, "hahaha" );
    assert_eq( ts[4].pos.lineno, 4 );
    assert( !ts[4].quoted );
    assert_eq( ts[5].str, "hihihi" );
    assert_eq( ts[5].pos.lineno, 4 );
    assert( ts[5].quoted );
    // \\ and \" inside quotes are unescaped to \ and "
    assert_eq( ts[6].str, `hu\"huhu` );
    assert_eq( ts[6].pos.lineno, 4 );
    assert( ts[6].quoted );
    assert_eq( ts[7].str, "123" );
    assert_eq( ts[7].pos.lineno, 5 );
    assert_eq( ts[8].str, "aa" );
    // inside quotes '#' is ordinary text, and "\r\n" is read as "\n"
    assert_eq( ts[9].pos.lineno, 5 );
    assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
    assert( ts[9].quoted );
    // line numbers keep counting through the multi-line quoted string
    assert_eq( ts[10].pos.lineno, 8 );
    assert( !ts[10].quoted );
    assert_eq( ts.length, 11 );
}
340
unittest
{
    auto original = lexerFromString(" a12\n3a 5 ");
    assert_eq( original.front.str, "a12" );
    original.popFront;

    // take a snapshot while both lexers are positioned at "3a"
    auto snapshot = original.save;
    assert_eq( original.front.str, "3a" );

    // advancing the original must not move the snapshot
    original.popFront;
    assert_eq( snapshot.front.str, "3a" );
    assert_eq( original.front.str, "5" );

    // the original is now exhausted, the snapshot is one token behind
    original.popFront;
    snapshot.popFront;
    assert( original.empty );
    assert( !snapshot.empty );
    assert_eq( snapshot.front.str, "5" );
}
357
/// Forward range that reads a source character by character,
/// keeping track of position information and doing \r\n -> \n conversion
/// (a lone \r is also reported as \n).

private
struct PositionedReader(CharSeq)
    if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
{
    CharSeq buffer;   /// characters not consumed yet
    string  filename; /// file name reported in positions
    int     lineno;   /// line number of the front character
    int     column;   /// column of the front character

    /// Range primitive: true when no character is left
    bool empty() /*@property*/
    {
        return buffer.empty;
    }

    /// Range primitive: the current character, with '\r' reported as '\n'
    dchar front() /*@property*/
    {
        dchar c = buffer.front;
        return (c=='\r' ? '\n' : c);
    }

    /// Range primitive: consumes one character and updates lineno/column.
    /// A "\r\n" pair is consumed as a single line break.
    void popFront() /*@property*/
    {
        dchar c = buffer.front;
        buffer.popFront;
        if( c=='\r' )
        {
            // swallow the '\n' of a "\r\n" pair so it counts as one newline
            if( !buffer.empty && buffer.front=='\n' )
                buffer.popFront;
            c = '\n';
        }
        if( c=='\n' )
        {
            lineno ++;
            column = 1;
        }
        else
            column ++;
    }

    /// Range primitive: an independent copy of this reader
    typeof(this) save() /*@property*/
    {
        // A plain struct copy would share `buffer` with the original when
        // CharSeq is a reference type, so the underlying range has to be
        // saved explicitly to get an independent copy.
        auto copy = this;
        copy.buffer = buffer.save;
        return copy;
    }

    /// Get the current position
    immutable(LexPosition) currentPosition() const
    {
        return new immutable(LexPosition)(filename, lineno, column);
    }
}
415
unittest
{
    // The reader over a plain string must be a forward range of dchar,
    // which is what LexerT requires of its Reader parameter.
    alias PositionedReader!string StringReader;

    assert( isForwardRange!(StringReader) );
    assert( is(ElementType!(StringReader) == dchar) );
}