1 /**
2 * Authors: k.inaba
3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
4 *
5 * Lexer for Polemy programming language.
6 */
7 module polemy.lex;
8 import polemy._common;
9 import polemy.failure;
10 import std.file : readText;
11 import std.ctype : isspace, isalnum;
12
/// A single token produced by the lexer.
///
/// All fields are immutable; the constructor and the comparison operations
/// exercised by the unittest are supplied by the SimpleClass mixin.
class Token
{
    immutable
    {
        LexPosition pos;    /// Position where the token occurred in the source
        string      str;    /// The token text itself
        bool        quoted; /// True for a "quoted" token, false for an unquoted one
    }

    mixin SimpleClass;
}
23
unittest
{
    auto where     = new LexPosition("hello.cpp", 123, 45);
    auto plain     = new Token(where, "class", false);
    auto quotedTok = new Token(where, "class", true);

    // Field access
    assert_eq( plain.pos, where );
    assert_eq( plain.str, "class" );
    assert( !plain.quoted );
    assert( quotedTok.quoted );

    // Equality and ordering take every field into account
    assert_eq( plain, new Token(where, "class", false) );
    assert_lt( plain, new Token(where, "struct", false) );
    assert_ne( plain, quotedTok );

    // No default construction, and no field may be reassigned
    assert( !__traits(compiles, new Token) );
    assert( !__traits(compiles, plain.pos=where) );
    assert( !__traits(compiles, plain.str="789") );
    assert( !__traits(compiles, plain.quoted=true) );
}
43
/// Named constructor for Lexer: lexes the contents of a file.
///
/// The file is read with readText; its text, the file name and any extra
/// arguments in ln_cn are forwarded unchanged to lexerFromString.

Lexer lexerFromFile(T...)( string filename, T ln_cn )
{
    auto source = readText(filename);
    return lexerFromString( source, filename, ln_cn );
}
50
/// Named constructor for Lexer: lexes an in-memory character sequence.
///
/// Params:
///   str      = the source text
///   filename = name stored in the position of every token
///   lineno   = line number assigned to the first character
///   column   = column number assigned to the first character

LexerT!(PositionedReader!CharSeq) /* ddoc doesn't recognize auto return... bugzilla:2581 */
lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
{
    alias PositionedReader!CharSeq Reader;

    auto input = Reader(str, filename, lineno, column);
    return new LexerT!(Reader)(input);
}
60
/// Standard Lexer type: LexerT reading from a PositionedReader over a string.
/// (All you have to know is that this is a forward range of Tokens!)

alias LexerT!(PositionedReader!(string)) Lexer;
64
/// Lexer implementation: a forward range of Tokens built on top of a
/// forward range of dchar.
///
/// Besides the range primitives, Reader must provide currentPosition(),
/// which is used to stamp each token with the position of its first character.

class LexerT(Reader)
    if( isForwardRange!(Reader) && is(ElementType!(Reader)==dchar) )
{
    /// Range primitive: true once every token has been consumed
    bool empty() /*@property*/
    {
        return current is null;
    }

    /// Range primitive: the current token.
    /// Fails through enforce when the lexer is already empty.
    Token front() /*@property*/
    {
        return std.exception.enforce(current, "Lexer has already reached the end");
    }

    /// Range primitive: advance to the next token.
    /// Fails through enforce when the lexer is already empty.
    void popFront()
    {
        std.exception.enforce(current, "Lexer has already reached the end");
        current = readNext();
    }

    /// Range primitive: a new lexer over a saved copy of the reader,
    /// positioned at the same current token
    typeof(this) save() /*@property*/
    {
        return new typeof(this)(reader.save, current);
    }

private: // implementation

    Reader reader;  // remaining input, located just after `current` and any following whitespace
    Token  current; // token returned by front(); null once the input is exhausted

    invariant()
    {
        // Between public calls the reader never rests on whitespace
        assert( reader.empty || !isSpace(reader.front) );
    }

    // Takes over `reader`, skips leading whitespace, and either adopts the
    // given token as the current one or reads the first token from the input.
    this( Reader reader, Token current = null )
    {
        this.reader = reader;
        skipSpaces();
        this.current = (current is null ? readNext() : current);
    }

    public static
    {
        /// Whitespace according to std.ctype.isspace
        bool isSpace (dchar c) { return std.ctype.isspace(c)!=0; }
        /// Characters in 0x21..0x7f other than alphanumerics, '_' and '\''
        bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
        /// Symbols that always form a one-character token
        bool isSSymbol (dchar c) { return "()[]{};,@".canFind(c); }
        /// Symbols that may be combined into a multi-character token
        bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
        /// Everything that is neither whitespace nor a symbol
        bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); }
    }

    // Reads the rest of a quoted string; the opening '"' has already been consumed.
    // Returns the contents without the closing '"'. The escapes \" and \\ yield
    // '"' and '\'; any other backslash is kept literally.
    // Throws UnexpectedEOF (reported at pos) when the input ends before the closing '"'.
    // Written as a loop so that stack usage does not grow with the literal's length.
    string readQuoted(const LexPosition pos)
    {
        char[] buf;
        for(;;)
        {
            if( reader.empty )
                throw genex!UnexpectedEOF(pos, "Quoted string not terminated");
            dchar c = reader.front;
            reader.popFront();
            if( c == '"' )
                return assumeUnique(buf);
            if( c == '\\' && !reader.empty && (reader.front=='"' || reader.front=='\\') )
            {
                // Recognised escape: emit the escaped character instead of the backslash
                c = reader.front;
                reader.popFront();
            }
            buf ~= c;
        }
    }

    // Consumes and returns the longest prefix of the input whose characters all satisfy fn
    string readWhile(alias fn)()
    {
        char[] buf;
        for(; !reader.empty && fn(reader.front); reader.popFront())
            buf ~= reader.front;
        return assumeUnique(buf);
    }

    // Consumes leading whitespace without building a string for it
    void skipSpaces()
    {
        while( !reader.empty && isSpace(reader.front) )
            reader.popFront();
    }

    // Reads one token starting at the current reader position, then skips the
    // whitespace that follows it. Returns null when no token is left.
    Token readNext()
    {
        // '#' starts a comment that runs up to the next newline
        while( !reader.empty && reader.front == '#' )
        {
            reader = find(reader, '\n');
            skipSpaces();
        }
        if( reader.empty )
            return null;

        // Restore the invariant (reader not on whitespace) after a successful read
        scope(success)
            skipSpaces();

        auto pos = reader.currentPosition();
        if( reader.front == '"' ) // quoted
        {
            reader.popFront();
            return new Token(pos, readQuoted(pos), true);
        }
        else if( isSSymbol(reader.front) ) // paren
        {
            string s; s~=reader.front; reader.popFront();
            return new Token(pos, s, false);
        }
        else if( isMSymbol(reader.front) ) // symbol
        {
            return new Token(pos, readWhile!isMSymbol(), false);
        }
        else
        {
            return new Token(pos, readWhile!isLetter(), false);
        }
    }
}
187
unittest
{
    // Lexer yields Tokens and satisfies the forward-range requirements
    assert( is(ElementType!(Lexer) == Token) );
    assert( std.range.isForwardRange!(Lexer) );
}
193
unittest
{
    Token[] toks = std.array.array( lexerFromString("this is a \t\r\n pen :-( @@; ") );

    // Checks the position and the text of the i-th token
    void expectAt(size_t i, int line, int col, string text)
    {
        assert_eq( toks[i].pos.lineno, line );
        assert_eq( toks[i].pos.column, col );
        assert_eq( toks[i].str, text );
    }

    expectAt(0, 1, 1, "this");
    expectAt(1, 1, 6, "is");
    expectAt(2, 1, 9, "a");
    expectAt(3, 2, 2, "pen");
    foreach(i; 0 .. 4)
        assert( !toks[i].quoted );

    expectAt(4, 2, 6, ":-");
    expectAt(5, 2, 8, "(");

    // paren and semicolons, atmarks are split
    assert_eq( toks[6].str, "@" );
    assert_eq( toks[7].str, "@" );
    assert_eq( toks[8].str, ";" );

    assert_eq( toks.length, 9 );
}
232
unittest
{
    // !! be sure to run the unittest on the root of the source directory
    auto fileLex = find!`a.str == "module"`( lexerFromFile("polemy/lex.d") );

    // The `module` keyword of this very file
    assert_eq( fileLex.front.str, "module" );
    assert_eq( fileLex.front.pos.filename, "polemy/lex.d" );
    assert_eq( fileLex.front.pos.lineno, 7 );
    assert_eq( fileLex.front.pos.column, 1 );

    fileLex.popFront();
    assert_eq( fileLex.front.str, "polemy" );
    assert_eq( fileLex.front.pos.lineno, 7 );
    assert_eq( fileLex.front.pos.column, 8 );

    // Step over four more tokens to reach the first import
    foreach(step; 0 .. 4)
        fileLex.popFront();
    assert_eq( fileLex.front.str, "import" );
    assert_eq( fileLex.front.pos.lineno, 8 );
    assert_eq( fileLex.front.pos.column, 1 );
}
254
unittest
{
    // A quoted string that is never closed is reported as UnexpectedEOF
    assert_throw!UnexpectedEOF(
        lexerFromString(`"`)
    );
}
259
unittest
{
    // Comments run to the end of the line; quoted strings may span lines and contain '#'
    auto commented = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`);
    Token[] toks = std.array.array(commented);

    // Line 1: everything after '#' is dropped
    assert_eq( toks[0].str, "my" );
    assert_eq( toks[0].pos.lineno, 1 );
    assert( !toks[0].quoted );

    // Line 3: line 2 was a comment only
    assert_eq( toks[1].str, "be" );
    assert_eq( toks[1].pos.lineno, 3 );
    assert( !toks[1].quoted );
    assert_eq( toks[2].str, "ignored" );
    assert( !toks[2].quoted );
    assert_eq( toks[3].str, "." );
    assert( !toks[3].quoted );

    // Line 4: unquoted and quoted tokens without separating spaces, with escapes
    assert_eq( toks[4].str, "hahaha" );
    assert_eq( toks[4].pos.lineno, 4 );
    assert( !toks[4].quoted );
    assert_eq( toks[5].str, "hihihi" );
    assert_eq( toks[5].pos.lineno, 4 );
    assert( toks[5].quoted );
    assert_eq( toks[6].str, `hu\"huhu` );
    assert_eq( toks[6].pos.lineno, 4 );
    assert( toks[6].quoted );

    // Line 5 onwards: a quoted string spanning several lines
    assert_eq( toks[7].str, "123" );
    assert_eq( toks[7].pos.lineno, 5 );
    assert_eq( toks[8].str, "aa" );
    assert_eq( toks[9].pos.lineno, 5 );
    assert_eq( toks[9].str, "aaa\nbbb # 123\neee" );
    assert( toks[9].quoted );

    // Last token
    assert_eq( toks[10].pos.lineno, 8 );
    assert( !toks[10].quoted );

    assert_eq( toks.length, 11 );
}
298
unittest
{
    // A saved lexer advances independently of the one it was saved from
    auto main = lexerFromString(" a12\n3a 5 ");
    assert_eq( main.front.str, "a12" );
    main.popFront();

    auto snapshot = main.save;
    assert_eq( main.front.str, "3a" );
    main.popFront();
    assert_eq( snapshot.front.str, "3a" );
    assert_eq( main.front.str, "5" );

    main.popFront();
    snapshot.popFront();
    assert( main.empty );
    assert( !snapshot.empty );
    assert_eq( snapshot.front.str, "5" );
}
315
unittest
{
    // An empty quoted string is still a token of its own
    auto pair = lexerFromString(`=""`);
    assert_eq(pair.front.str, "=");
    pair.popFront();
    assert_eq(pair.front.str, "");
    pair.popFront();
    assert( pair.empty );

    // '@' never joins a multi-character symbol
    auto split = lexerFromString(`-@`);
    assert_eq( split.front.str, "-" );
}
324
/// Forward range that reads a character sequence one character at a time,
/// keeping track of position information and converting \r\n (and a lone \r) to \n.

struct PositionedReader(CharSeq)
    if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq)==dchar) )
{
    CharSeq buffer;   /// Remaining input
    string filename;  /// File name reported by currentPosition
    int lineno;       /// Line number of the front character
    int column;       /// Column number of the front character

    /// Range primitive: true when no input is left
    bool empty() /*@property*/
    {
        return buffer.empty;
    }

    /// Range primitive: the front character, with '\r' reported as '\n'
    dchar front() /*@property*/
    {
        dchar c = buffer.front;
        return (c=='\r' ? '\n' : c);
    }

    /// Range primitive: consume the front character and update lineno/column.
    /// A "\r\n" pair is consumed as a single newline.
    void popFront()
    {
        dchar c = buffer.front;
        buffer.popFront();
        if( c=='\r' )
        {
            // Swallow the '\n' of a "\r\n" pair so the pair counts as one line break
            if( !buffer.empty && buffer.front=='\n' )
                buffer.popFront();
            c = '\n';
        }
        if( c=='\n' )
        {
            lineno ++;
            column = 1;
        }
        else
            column ++;
    }

    /// Range primitive: an independent copy at the same position.
    /// The underlying buffer is saved explicitly, because a plain struct copy
    /// would share state with the original when CharSeq is a reference type.
    typeof(this) save() /*@property*/
    {
        return typeof(this)(buffer.save, filename, lineno, column);
    }

    /// Get the current position
    LexPosition currentPosition() const
    {
        return new LexPosition(filename, lineno, column);
    }
}
381
unittest
{
    assert( isForwardRange!(PositionedReader!string) );
    assert( is(ElementType!(PositionedReader!string) == dchar) );

    // The column advances by one per ordinary character
    {
        auto cols = PositionedReader!string("abc","",1,1);
        foreach(expected; 1 .. 4)
        {
            assert_eq(cols.currentPosition().column, expected);
            cols.popFront();
        }
    }

    // "\n", "\r\n" and "\n" each count as exactly one line break
    {
        auto lines = PositionedReader!string("\n\r\n\n","",1,1);
        foreach(expected; 1 .. 4)
        {
            assert_eq(lines.currentPosition().lineno, expected);
            lines.popFront();
        }
    }
}