1 /**
2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 *
5 * Lexer for Polemy programming language.
6 */
7 module polemy.lex;
8 import polemy._common;
9 import std.file : readText;
10 import std.ctype : isspace, isalnum;
11
12 /// Exception from this module
13
class LexException : Exception
{
	/// Source position that was passed to the constructor.
	const LexPosition pos;

	/// Builds the exception message as "msg [pos]" and remembers pos.
	this( const LexPosition pos, string msg )
	{
		super( sprintf!"%s [%s]"(msg, pos) );
		this.pos = pos;
	}
}
20
/// Represents a position in source code
22
class LexPosition
{
	// Field order is kept as-is: the mixed-in constructor and comparison
	// presumably walk the fields in declaration order (the unit test below
	// constructs with (filename, lineno, column)).
	immutable
	{
		string filename; /// name of the source file
		int    lineno;   /// line number, 1, 2, ...
		int    column;   /// column, 1, 2, ...
	}

	mixin SimpleConstructor;
	mixin SimpleCompare;

	/// Renders the position as "filename:lineno:column".
	override string toString() const
	{
		return sprintf!"%s:%d:%d"(filename, lineno, column);
	}
}
35
unittest
{
	// Two positions on the same line of the same file, one column apart.
	auto first  = new LexPosition("hello.cpp", 123, 45);
	auto second = new LexPosition("hello.cpp", 123, 46);

	// Fields carry the constructor arguments; toString joins them with ':'.
	assert_eq( first.filename, "hello.cpp" );
	assert_eq( first.lineno, 123 );
	assert_eq( first.column, 45 );
	assert_eq( to!string(first), "hello.cpp:123:45" );

	// Ordering and equality take the column into account.
	assert_lt( first, second );
	assert_ne( first, second );

	// No default constructor, and no field may be reassigned.
	assert( !__traits(compiles, new LexPosition) );
	assert( !__traits(compiles, first.filename="foo") );
	assert( !__traits(compiles, first.lineno  =789) );
	assert( !__traits(compiles, first.column  =222) );
}
53
54 /// Represents a lexer token
55
class Token
{
	// Field order is kept as-is: the unit test below constructs tokens
	// with (pos, str, quoted) through the mixed-in constructor.
	immutable
	{
		LexPosition pos;    /// Position where the token occurred in the source
		string      str;    /// The token string itself
		bool        quoted; /// Was it a "quoted" token or unquoted?
	}

	mixin SimpleConstructor;
	mixin SimpleCompare;
}
65
unittest
{
	auto where    = new immutable(LexPosition)("hello.cpp", 123, 45);
	auto plain    = new Token(where, "class", false);
	auto inQuotes = new Token(where, "class", true);

	// Fields carry the constructor arguments.
	assert_eq( plain.pos, where );
	assert_eq( plain.str, "class" );
	assert( !plain.quoted );

	// Comparison looks at the string and at the quoted flag.
	assert_eq( plain, new Token(where, "class", false) );
	assert_lt( plain, new Token(where, "struct", false) );
	assert_ne( plain, inQuotes );
	assert( inQuotes.quoted );

	// No default constructor, and no field may be reassigned.
	assert( !__traits(compiles, new Token) );
	assert( !__traits(compiles, plain.pos=where) );
	assert( !__traits(compiles, plain.str=789) );
	assert( !__traits(compiles, plain.quoted=true) );
}
85
/// Named constructor for Lexer: lexes the contents of a file
87
auto lexerFromFile(T...)( string filename, T rest )
{
	// Read the whole file up front, then delegate; the file name is also
	// passed on as the name used for position reporting.
	auto contents = std.file.readText(filename);
	return lexerFromString( contents, filename, rest );
}
92
/// Named constructor for Lexer: lexes an in-memory character sequence
94
auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
{
	// Wrap the raw characters in a position-tracking reader, then hand
	// that reader to a lexer instantiated for exactly this reader type.
	alias PositionedReader!CharSeq SourceReader;
	auto source = SourceReader(str, filename, lineno, column);
	return new LexerT!(SourceReader)(source);
}
101
/// Standard Lexer type (all that users need to know is that it is a forward range of Tokens)
103
104 alias LexerT!(PositionedReader!string) Lexer;
105
106 /// Lexer Implementation
107
class LexerT(Reader)
	if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
{
	/// Range primitive: true once every token has been consumed.
	bool empty() /*@property*/
	{
		return current is null;
	}

	/// Range primitive: the token currently at the head of the stream.
	/// Throws: (via enforce) if the lexer has already reached the end.
	Token front() /*@property*/
	{
		return std.exception.enforce(current, "Lexer has already reached the end");
	}

	/// Range primitive: discards the current token and lexes the next one.
	/// Throws: (via enforce) if the lexer has already reached the end;
	///         LexException if the next token is an unterminated quoted string.
	void popFront()
	{
		std.exception.enforce(current, "Lexer has already reached the end");
		current = readNext();
	}

	/// Range primitive: a lexer that continues from the same point
	/// using a saved copy of the underlying reader.
	typeof(this) save() /*@property*/
	{
		return new typeof(this)(reader.save, current);
	}

private: // implementation

	Reader reader;  // remaining input, always positioned on a non-space character
	Token  current; // token returned by front; null means "end of input"

	invariant()
	{
		// Whitespace is always skipped eagerly after each token.
		assert( reader.empty || !std.ctype.isspace(reader.front) );
	}

	/// Params:
	///   reader  = input positioned at or before the next token
	///   current = already-lexed head token (used by save); when null the
	///             first token is lexed from reader
	this( Reader reader, Token current = null )
	{
		this.reader = reader;
		readWhile!isSpace();
		this.current = (current is null ? readNext() : current);
	}

	public static {
		/// Whitespace according to std.ctype.isspace.
		bool isSpace  (dchar c) { return std.ctype.isspace(c)!=0; }
		/// Character in 0x21..0x7f that is neither alphanumeric, '_' nor '\''.
		bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
		/// Symbols that always form a single-character token.
		bool isSSymbol(dchar c) { return !find("()[]{};", c).empty; }
		/// Symbols that may be chained into a multi-character token.
		/// '"' and '#' are excluded: they start a quoted string and a comment,
		/// so a symbol run such as the `=` in `x="hi"` must stop before them.
		bool isMSymbol(dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
		/// Anything that is neither whitespace nor a symbol.
		bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); }
	}

	/// Reads the body of a quoted string; the opening '"' is already consumed.
	/// Params:
	///   pos = position of the opening quote, reported on failure
	/// Throws: LexException if the input ends before the closing quote.
	string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}

	/// ditto; characters are appended to buf, which is returned as the result.
	string readQuoted(const LexPosition pos, ref char[] buf)
	{
		// Iterative on purpose: recursing once per character can exhaust the
		// stack on long string literals.
		for(;;)
		{
			if( reader.empty )
				throw new LexException(pos, "EOF found while lexing a quoted-string");
			dchar c = reader.front;
			reader.popFront;
			if( c == '"' )
				return assumeUnique(buf);
			// Only \" and \\ are escapes; any other backslash is kept literally.
			if( c == '\\' && !reader.empty && (reader.front=='"' || reader.front=='\\') )
			{
				c = reader.front;
				reader.popFront;
			}
			buf ~= c;
		}
	}

	/// Consumes the longest prefix of the input whose characters all satisfy
	/// fn and returns it.
	string readWhile(alias fn)()
	{
		char[] buf;
		for(; !reader.empty && fn(reader.front); reader.popFront)
			buf ~= reader.front;
		return assumeUnique(buf);
	}

	/// Lexes one token, then skips the whitespace that follows it.
	/// Returns: the token, or null when the input is exhausted.
	Token readNext()
	{
		// Skip any number of consecutive '#' comments (each runs up to the
		// next newline) without recursing once per comment.
		while( !reader.empty && reader.front == '#' )
		{
			reader = find(reader, '\n');
			readWhile!isSpace();
		}
		if( reader.empty )
			return null;

		// Re-establish the "positioned on a non-space" invariant on success.
		scope(success)
			readWhile!isSpace();

		auto pos = reader.currentPosition();
		if( reader.front == '"' ) // quoted
		{
			reader.popFront;
			return new Token(pos, readQuoted(pos), true);
		}
		else if( isSSymbol(reader.front) ) // paren
		{
			string s; s~=reader.front; reader.popFront;
			return new Token(pos, s, false);
		}
		else if( isMSymbol(reader.front) ) // symbol
		{
			return new Token(pos, readWhile!isMSymbol(), false);
		}
		else // identifier / number
		{
			return new Token(pos, readWhile!isLetter(), false);
		}
	}
}
227
unittest
{
	// The standard Lexer alias must satisfy the forward-range interface.
	bool lexerIsForwardRange = std.range.isForwardRange!(Lexer);
	assert( lexerIsForwardRange );
}
232
unittest
{
	auto lexer = lexerFromString("this	is a \t\r\n pen :-( @@; ");
	Token[] toks = std.array.array(lexer);

	// "this" -- first token, line 1 column 1
	assert_eq( toks[0].pos.lineno, 1 );
	assert_eq( toks[0].pos.column, 1 );
	assert( !toks[0].quoted );
	assert_eq( toks[0].str, "this" );

	// "is"
	assert_eq( toks[1].pos.lineno, 1 );
	assert_eq( toks[1].pos.column, 6 );
	assert( !toks[1].quoted );
	assert_eq( toks[1].str, "is" );

	// "a"
	assert_eq( toks[2].pos.lineno, 1 );
	assert_eq( toks[2].pos.column, 9 );
	assert( !toks[2].quoted );
	assert_eq( toks[2].str, "a" );

	// "pen" -- after the \r\n, which counts as a single line break
	assert_eq( toks[3].pos.lineno, 2 );
	assert_eq( toks[3].pos.column, 2 );
	assert( !toks[3].quoted );
	assert_eq( toks[3].str, "pen" );

	// ":-" -- multi-character symbol
	assert_eq( toks[4].pos.lineno, 2 );
	assert_eq( toks[4].pos.column, 6 );
	assert_eq( toks[4].str, ":-" );

	// "(" -- parentheses stand alone
	assert_eq( toks[5].pos.lineno, 2 );
	assert_eq( toks[5].pos.column, 8 );
	assert_eq( toks[5].str, "(" );

	assert_eq( toks[6].str, "@@" );
	assert_eq( toks[7].str, ";" ); // parens and semicolons are split

	assert_eq( toks.length, 8 );
}
270
unittest
{
	// !! be sure to run the unittest on the root of the source directory
	auto fileLexer = lexerFromFile("polemy/lex.d");

	// Skip ahead to the module declaration of this very file.
	fileLexer = find!`a.str == "module"`(fileLexer);
	assert_eq( fileLexer.front.str, "module" );
	assert_eq( fileLexer.front.pos.filename, "polemy/lex.d" );
	assert_eq( fileLexer.front.pos.lineno, 7 );
	assert_eq( fileLexer.front.pos.column, 1 );

	fileLexer.popFront();
	assert_eq( fileLexer.front.str, "polemy" );
	assert_eq( fileLexer.front.pos.lineno, 7 );
	assert_eq( fileLexer.front.pos.column, 8 );

	// Step over the next four tokens to land on the first import.
	foreach( step; 0 .. 4 )
		fileLexer.popFront();
	assert_eq( fileLexer.front.str, "import" );
	assert_eq( fileLexer.front.pos.lineno, 8 );
	assert_eq( fileLexer.front.pos.column, 1 );
}
292
unittest
{
	// A lone opening quote never finds its closing partner, so constructing
	// the lexer (which eagerly reads the first token) must throw.
	string unterminated = `"`;
	assert_throw!LexException( lexerFromString(unterminated) );
}
297
unittest
{
	auto lexer = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`);
	Token[] toks = std.array.array(lexer);

	// Line 1: everything after '#' is a comment; line 2 is a comment only.
	assert_eq( toks[0].str, "my" );
	assert_eq( toks[0].pos.lineno, 1 );
	assert( !toks[0].quoted );

	// Line 3: plain words and a trailing symbol.
	assert_eq( toks[1].str, "be" );
	assert_eq( toks[1].pos.lineno, 3 );
	assert( !toks[1].quoted );
	assert_eq( toks[2].str, "ignored" );
	assert( !toks[2].quoted );
	assert_eq( toks[3].str, "." );
	assert( !toks[3].quoted );

	// Line 4: a word directly followed by two quoted strings, then a comment.
	assert_eq( toks[4].str, "hahaha" );
	assert_eq( toks[4].pos.lineno, 4 );
	assert( !toks[4].quoted );
	assert_eq( toks[5].str, "hihihi" );
	assert_eq( toks[5].pos.lineno, 4 );
	assert( toks[5].quoted );
	assert_eq( toks[6].str, `hu\"huhu` );
	assert_eq( toks[6].pos.lineno, 4 );
	assert( toks[6].quoted );

	// Line 5: words, then a quoted string spanning lines 5-7 in which
	// '#' is ordinary text and \r\n is folded to \n.
	assert_eq( toks[7].str, "123" );
	assert_eq( toks[7].pos.lineno, 5 );
	assert_eq( toks[8].str, "aa" );
	assert_eq( toks[9].pos.lineno, 5 );
	assert_eq( toks[9].str, "aaa\nbbb # 123\neee" );
	assert( toks[9].quoted );

	// Line 8: the final word.
	assert_eq( toks[10].pos.lineno, 8 );
	assert( !toks[10].quoted );
	assert_eq( toks.length, 11 );
}
336
unittest
{
	auto original = lexerFromString(" a12\n3a 5 ");
	assert_eq( original.front.str, "a12" );
	original.popFront();

	// Take a snapshot while both are positioned on "3a".
	auto snapshot = original.save;
	assert_eq( original.front.str, "3a" );

	// Advancing the original must not move the snapshot.
	original.popFront();
	assert_eq( snapshot.front.str, "3a" );
	assert_eq( original.front.str, "5" );

	// The original runs dry one step before the snapshot does.
	original.popFront();
	snapshot.popFront();
	assert( original.empty );
	assert( !snapshot.empty );
	assert_eq( snapshot.front.str, "5" );
}
353
/// Forward range that reads its input character by character,
/// keeping track of position information and converting \r\n (and lone \r) to \n.
356
private
struct PositionedReader(CharSeq)
	if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
{
	CharSeq buffer;   // remaining, not yet consumed input
	string  filename; // name reported in positions
	int     lineno;   // line of the character at the front
	int     column;   // column of the character at the front

	/// Range primitive: true when no input is left.
	bool empty() /*@property*/
	{
		return buffer.empty;
	}

	/// Range primitive: the next character, with '\r' reported as '\n'.
	dchar front() /*@property*/
	{
		dchar c = buffer.front;
		return (c=='\r' ? '\n' : c);
	}

	/// Range primitive: consumes one logical character and updates the
	/// position. "\r\n" is consumed as a single line break.
	void popFront()
	{
		dchar c = buffer.front;
		buffer.popFront;
		if( c=='\r' )
		{
			// Swallow the '\n' of a "\r\n" pair so it counts as one newline.
			if( !buffer.empty && buffer.front=='\n' )
				buffer.popFront;
			c = '\n';
		}
		if( c=='\n' )
		{
			lineno ++;
			column = 1;
		}
		else
			column ++;
	}

	/// Range primitive: an independent copy of this reader.
	/// The underlying buffer is duplicated through its own save, so that a
	/// reference-type CharSeq is not shared between the two readers.
	typeof(this) save() /*@property*/
	{
		auto copy = this;
		copy.buffer = buffer.save;
		return copy;
	}

	/// Get the current position
	immutable(LexPosition) currentPosition() const
	{
		return new immutable(LexPosition)(filename, lineno, column);
	}
}
411
unittest
{
	// The reader over plain strings must be a forward range of dchar,
	// which is exactly what LexerT's template constraint demands.
	alias PositionedReader!string StringReader;
	assert( isForwardRange!(StringReader) );
	assert( is(ElementType!(StringReader) == dchar) );
}