Diff
Not logged in

Differences From Artifact [0972f7a454ea8e4f]:

To Artifact [5f52873e3ff7ae30]:


2 * Authors: k.inaba 2 * Authors: k.inaba 3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/ 3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/ 4 * 4 * 5 * Lexer for Polemy programming language. 5 * Lexer for Polemy programming language. 6 */ 6 */ 7 module polemy.lex; 7 module polemy.lex; 8 import polemy._common; 8 import polemy._common; > 9 import std.file : readText; > 10 import std.ctype : isspace, isalnum; 9 11 10 import std.file : readText; | 12 /// Exception from this module 11 import std.string : munch; < > 13 12 import std.ctype; | 14 class LexException : Exception > 15 { > 16 this( const LexPosition pos, string msg ) > 17 { super(sprintf!"%s [%s]"(msg, pos)); this.pos = pos; } > 18 const LexPosition pos; > 19 }; 13 20 14 /// Represents a position in a source code 21 /// Represents a position in a source code 15 22 16 class LexPosition 23 class LexPosition 17 { 24 { 18 immutable string filename; /// name of the source file 25 immutable string filename; /// name of the source file 19 immutable int lineno; /// line number, 1, 2, ... 26 immutable int lineno; /// line number, 1, 2, ... ................................................................................................................................................................................ 74 assert( !__traits(compiles, t.pos=p) ); 81 assert( !__traits(compiles, t.pos=p) ); 75 assert( !__traits(compiles, t.str=789) ); 82 assert( !__traits(compiles, t.str=789) ); 76 assert( !__traits(compiles, t.quoted=true) ); 83 assert( !__traits(compiles, t.quoted=true) ); 77 } 84 } 78 85 79 /// Named Construtor for Lexer 86 /// Named Construtor for Lexer 80 87 81 Lexer lexerFromFile(T...)( string filename, T rest ) | 88 auto lexerFromFile(T...)( string filename, T rest ) 82 { 89 { 83 return lexerFromString( std.file.readText(filename), filename, rest ); 90 return lexerFromString( std.file.readText(filename), filename, rest ); 84 } 91 } 85 92 86 /// Named Construtor for Lexer 93 /// Named Construtor for Lexer 87 94 88 Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, in | 95 auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lin 89 { 96 { > 97 return new LexerT!(PositionedReader!CharSeq)( 90 return new Lexer(str, filename, lineno, column); | 98 PositionedReader!CharSeq(str, filename, lineno, column) > 99 ); 91 } 100 } 92 101 93 /// Lexer is a forward range of Tokens | 102 /// Standard Lexer Type (all users have to know is that this is a forward range 94 103 > 104 alias LexerT!(PositionedReader!string) Lexer; > 105 > 106 /// Lexer Implementation > 107 95 class Lexer | 108 class LexerT(Reader) > 109 if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) ) 96 { 110 { 97 /// Range primitive 111 /// Range primitive 98 bool empty() /*@property*/ 112 bool empty() /*@property*/ 99 { 113 { 100 return current is null; 114 return current is null; 101 } 115 } 102 116 ................................................................................................................................................................................ 110 void popFront() /*@property*/ 124 void popFront() /*@property*/ 111 { 125 { 112 std.exception.enforce(current, "Lexer has already reached the en 126 std.exception.enforce(current, "Lexer has already reached the en 113 current = readNext(); 127 current = readNext(); 114 } 128 } 115 129 116 /// Range primitive 130 /// Range primitive 117 Lexer save() /*@property*/ | 131 typeof(this) save() /*@property*/ 118 { 132 { 119 return new Lexer(this.tupleof); | 133 return new typeof(this)(reader.save, current); 120 } 134 } 121 135 122 private: // implementation 136 private: // implementation 123 137 124 string buffer; | 138 Reader reader; 125 string filename; < 126 int lineno; < 127 int column; < 128 Token current; 139 Token current; 129 140 130 invariant() 141 invariant() 131 { 142 { 132 assert( buffer.empty || !std.ctype.isspace(buffer[0]) ); | 143 assert( reader.empty || !std.ctype.isspace(reader.front) ); > 144 } > 145 > 146 this( Reader reader, Token current = null ) > 147 { > 148 this.reader = reader; > 149 readWhile!isSpace(); > 150 this.current = (current is null ? readNext() : current); > 151 } > 152 > 153 public static { > 154 bool isSpace (dchar c) { return std.ctype.isspace(c)!=0; } > 155 bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !std.cty > 156 bool isSSymbol (dchar c) { return !find("()[]{};", c).empty; } > 157 bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c); > 158 bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); } 133 } 159 } 134 160 135 this( string buffer, string filename, int lineno, int column, Token curr | 161 string readQuoted(const LexPosition pos){char[] buf; return readQuoted(p > 162 string readQuoted(const LexPosition pos, ref char[] buf) 136 { 163 { 137 this.buffer = buffer; | 164 if( reader.empty ) 138 this.filename = filename; | 165 throw new LexException(pos, "EOF found while lexing a qu 139 this.lineno = lineno; | 166 dchar c = reader.front; 140 this.column = column; | 167 reader.popFront; 141 skipws(); | 168 if( c == '"' ) 142 this.current = (current is null ? readNext() : current); | 169 return assumeUnique(buf); > 170 if( c == '\\' && !reader.empty ) { > 171 if( reader.front=='"' ) { > 172 reader.popFront; > 173 return readQuoted(pos,buf ~= '\"'); > 174 } > 175 if( reader.front=='\\' ) { > 176 reader.popFront; > 177 return readQuoted(pos,buf ~= '\\'); > 178 } > 179 } > 180 return readQuoted(pos,buf ~= c); 143 } 181 } 144 182 145 void skipws() | 183 string readWhile(alias fn)() 146 { 184 { 147 bool progress = false; | 185 char[] buf; 148 do | 186 for(; !reader.empty && fn(reader.front); reader.popFront) 149 { < > 187 buf ~= reader.front; 150 string ws = buffer.munch(" \t"); | 188 return assumeUnique(buf); 151 column += ws.length; < 152 progress = !ws.empty; < 153 while( !buffer.empty && (buffer[0]=='\r' || buffer[0]==' < 154 { < 155 progress = true; < 156 if( buffer[0] == '\n' ) < 157 buffer = buffer[1..$]; < 158 else // if( buffer.front == '\r' ) < 159 { < 160 buffer = buffer[1..$]; < 161 if( !buffer.empty && buffer[0]=='\n' ) < 162 buffer = buffer[1..$]; < 163 } < 164 lineno ++; < 165 column = 1; < 166 } < 167 }while( progress ); < 168 } 189 } 169 190 170 char readChar() < 171 { < 172 scope(exit) { < 173 buffer = buffer[1..$]; < 174 column ++; < 175 } < 176 return buffer[0]; < 177 } < 178 < 179 /// This is the main lexing routine < 180 Token readNext() 191 Token readNext() 181 { 192 { 182 if( buffer.empty ) | 193 if( reader.empty ) 183 return null; 194 return null; 184 scope(exit) | 195 scope(success) 185 skipws(); | 196 readWhile!isSpace(); > 197 if( reader.front == '#' ) // comment 186 | 198 { 187 if( isSymbol(buffer[0]) ) < > 199 reader = find(reader, '\n'); > 200 readWhile!isSpace(); > 201 return readNext(); > 202 } > 203 else if( reader.front == '"' ) // quoted > 204 { > 205 auto pos = reader.currentPosition(); > 206 reader.popFront; > 207 return new Token(pos, readQuoted(pos), true); > 208 } > 209 else if( isSSymbol(reader.front) ) // paren > 210 { > 211 auto pos = reader.currentPosition(); > 212 string s; s~=reader.front; reader.popFront; > 213 return new Token(pos, s, false); > 214 } > 215 else if( isMSymbol(reader.front) ) // symbol 188 { 216 { 189 if( buffer[0] == '#' ) < 190 { < 191 // skip comment < 192 while( !buffer.empty && (buffer[0]!='\n' && buff < 193 readChar(); < 194 skipws(); < 195 return readNext(); < 196 } < 197 else if( buffer[0] == '"' ) < 198 { < 199 // string literal < 200 auto pos = currentPosition(); | 217 auto pos = reader.currentPosition(); 201 string lit; < 202 readChar(); < 203 while( !buffer.empty && buffer[0]!='"' ) < 204 { < 205 // read one char < 206 char c = readChar(); < 207 if( c == '\\' ) < 208 { < 209 if( !buffer.empty && (buffer[0]= < 210 lit ~= readChar(); < 211 else < 212 lit ~= c; < 213 } < 214 else if( c == '\n' ) < 215 { < 216 lit ~= c; < 217 lineno++; < 218 column = 1; < 219 } < 220 else if( c == '\r' ) < 221 { < 222 if( !buffer.empty && buffer[0]== < 223 readChar(); < 224 lit ~= '\n'; < 225 lineno++; < 226 column = 1; < 227 } < 228 else < 229 lit ~= c; < 230 } < 231 if( !buffer.empty ) < 232 readChar(); < 233 return new Token(pos, lit, true); | 218 return new Token(pos, readWhile!isMSymbol(), false); 234 } < 235 else < 236 { < 237 // normal symbol < 238 auto pos = currentPosition(); < 239 auto str = ""~readChar(); < 240 return new Token(pos, str, false); < 241 } < 242 } 219 } 243 else 220 else 244 { 221 { 245 auto pos = currentPosition(); | 222 auto pos = reader.currentPosition(); 246 int i = 0; < 247 while( i<buffer.length && !std.ctype.isspace(buffer[i]) < 248 ++i; < 249 auto str = buffer[0 .. i]; < 250 buffer = buffer[i .. $]; < 251 column += i; < 252 return new Token(pos, str, false); | 223 return new Token(pos, readWhile!isLetter(), false); 253 } 224 } 254 } 225 } 255 < 256 bool isSymbol(char c) < 257 { < 258 return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_'); < 259 } < 260 < 261 immutable(LexPosition) currentPosition() < 262 { < 263 return new immutable(LexPosition)(filename, lineno, column); < 264 } < 265 } 226 } 266 227 267 unittest 228 unittest 268 { 229 { 269 assert( std.range.isForwardRange!(Lexer) ); 230 assert( std.range.isForwardRange!(Lexer) ); 270 } 231 } 271 232 272 unittest 233 unittest 273 { 234 { 274 auto lex = lexerFromString("this is a \t\r\n pen :-( "); | 235 auto lex = lexerFromString("this is a \t\r\n pen :-( @@; "); 275 Token[] ts = std.array.array(lex); 236 Token[] ts = std.array.array(lex); 276 237 277 assert_eq( ts[0].pos.lineno, 1 ); 238 assert_eq( ts[0].pos.lineno, 1 ); 278 assert_eq( ts[0].pos.column, 1 ); 239 assert_eq( ts[0].pos.column, 1 ); 279 assert( !ts[0].quoted ); 240 assert( !ts[0].quoted ); 280 assert_eq( ts[0].str, "this" ); 241 assert_eq( ts[0].str, "this" ); 281 242 ................................................................................................................................................................................ 290 assert_eq( ts[2].str, "a" ); 251 assert_eq( ts[2].str, "a" ); 291 252 292 assert_eq( ts[3].pos.lineno, 2 ); 253 assert_eq( ts[3].pos.lineno, 2 ); 293 assert_eq( ts[3].pos.column, 2 ); 254 assert_eq( ts[3].pos.column, 2 ); 294 assert( !ts[3].quoted ); 255 assert( !ts[3].quoted ); 295 assert_eq( ts[3].str, "pen" ); 256 assert_eq( ts[3].str, "pen" ); 296 257 297 // consecutive symbols are always separated < 298 // hence, no "++" or "<<" or ... < 299 < 300 assert_eq( ts[4].pos.lineno, 2 ); 258 assert_eq( ts[4].pos.lineno, 2 ); 301 assert_eq( ts[4].pos.column, 6 ); 259 assert_eq( ts[4].pos.column, 6 ); 302 assert_eq( ts[4].str, ":" ); | 260 assert_eq( ts[4].str, ":-" ); 303 261 304 assert_eq( ts[5].pos.lineno, 2 ); 262 assert_eq( ts[5].pos.lineno, 2 ); 305 assert_eq( ts[5].pos.column, 7 ); | 263 assert_eq( ts[5].pos.column, 8 ); 306 assert_eq( ts[5].str, "-" ); | 264 assert_eq( ts[5].str, "(" ); > 265 assert_eq( ts[6].str, "@@" ); > 266 assert_eq( ts[7].str, ";" ); // paren and simicolons are split 307 267 308 assert_eq( ts[6].pos.lineno, 2 ); < 309 assert_eq( ts[6].pos.column, 8 ); < 310 assert_eq( ts[6].str, "(" ); < 311 < 312 assert_eq( ts.length, 7 ); | 268 assert_eq( ts.length, 8 ); 313 } 269 } 314 270 315 unittest 271 unittest 316 { 272 { 317 auto lex2 = lexerFromString(" a12\n3a 5 "); < 318 assert_eq( lex2.front.str, "a12" ); < 319 lex2.popFront; < 320 auto lex3 = lex2.save; < 321 assert_eq( lex2.front.str, "3a" ); < 322 lex2.popFront; < 323 assert_eq( lex3.front.str, "3a" ); < 324 assert_eq( lex2.front.str, "5" ); < 325 lex2.popFront; < 326 lex3.popFront; < 327 assert( lex2.empty ); < 328 assert( !lex3.empty ); < 329 assert_eq( lex3.front.str, "5" ); < 330 } < 331 < 332 unittest < 333 { < 334 //!! be sure to run the unittest on the root of the source directory | 273 // !! be sure to run the unittest on the root of the source directory 335 auto lexf = lexerFromFile("polemy/lex.d"); 274 auto lexf = lexerFromFile("polemy/lex.d"); 336 lexf = find!`a.str == "module"`(lexf); 275 lexf = find!`a.str == "module"`(lexf); 337 assert_eq( lexf.front.str, "module" ); 276 assert_eq( lexf.front.str, "module" ); 338 assert_eq( lexf.front.pos.filename, "polemy/lex.d" ); 277 assert_eq( lexf.front.pos.filename, "polemy/lex.d" ); 339 assert_eq( lexf.front.pos.lineno, 7 ); 278 assert_eq( lexf.front.pos.lineno, 7 ); 340 assert_eq( lexf.front.pos.column, 1 ); 279 assert_eq( lexf.front.pos.column, 1 ); 341 lexf.popFront; 280 lexf.popFront; 342 assert_eq( lexf.front.str, "polemy" ); 281 assert_eq( lexf.front.str, "polemy" ); 343 assert_eq( lexf.front.pos.lineno, 7 ); 282 assert_eq( lexf.front.pos.lineno, 7 ); 344 assert_eq( lexf.front.pos.column, 8 ); 283 assert_eq( lexf.front.pos.column, 8 ); 345 lexf.popFront; 284 lexf.popFront; 346 assert_eq( lexf.front.str, "." ); < 347 lexf.popFront; 285 lexf.popFront; 348 assert_eq( lexf.front.str, "lex" ); < 349 lexf.popFront; 286 lexf.popFront; 350 assert_eq( lexf.front.str, ";" ); < 351 lexf.popFront; 287 lexf.popFront; 352 assert_eq( lexf.front.str, "import" ); 288 assert_eq( lexf.front.str, "import" ); 353 assert_eq( lexf.front.pos.lineno, 8 ); 289 assert_eq( lexf.front.pos.lineno, 8 ); 354 assert_eq( lexf.front.pos.column, 1 ); 290 assert_eq( lexf.front.pos.column, 1 ); 355 } 291 } > 292 > 293 unittest > 294 { > 295 assert_throw!LexException( lexerFromString(`"`) ); > 296 } 356 297 357 unittest 298 unittest 358 { 299 { 359 auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!! 300 auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!! 360 be ignored. 301 be ignored. 361 hahaha"hihihi""hu\\\"huhu"#123 aa 302 hahaha"hihihi""hu\\\"huhu"#123 aa 362 123 aa "aaa`~"\r\n"~`bbb # 123`~"\r\n"~`eee" | 303 123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee" 363 zzz 304 zzz 364 `); 305 `); 365 Token[] ts = std.array.array(lex); 306 Token[] ts = std.array.array(lex); 366 assert_eq( ts[0].str, "my" ); 307 assert_eq( ts[0].str, "my" ); 367 assert_eq( ts[0].pos.lineno, 1 ); 308 assert_eq( ts[0].pos.lineno, 1 ); 368 assert( !ts[0].quoted ); 309 assert( !ts[0].quoted ); 369 assert_eq( ts[1].str, "be" ); 310 assert_eq( ts[1].str, "be" ); ................................................................................................................................................................................ 388 assert_eq( ts[9].pos.lineno, 5 ); 329 assert_eq( ts[9].pos.lineno, 5 ); 389 assert_eq( ts[9].str, "aaa\nbbb # 123\neee" ); 330 assert_eq( ts[9].str, "aaa\nbbb # 123\neee" ); 390 assert( ts[9].quoted ); 331 assert( ts[9].quoted ); 391 assert_eq( ts[10].pos.lineno, 8 ); 332 assert_eq( ts[10].pos.lineno, 8 ); 392 assert( !ts[10].quoted ); 333 assert( !ts[10].quoted ); 393 assert_eq( ts.length, 11 ); 334 assert_eq( ts.length, 11 ); 394 } 335 } > 336 > 337 unittest > 338 { > 339 auto lex2 = lexerFromString(" a12\n3a 5 "); > 340 assert_eq( lex2.front.str, "a12" ); > 341 lex2.popFront; > 342 auto lex3 = lex2.save; > 343 assert_eq( lex2.front.str, "3a" ); > 344 lex2.popFront; > 345 assert_eq( lex3.front.str, "3a" ); > 346 assert_eq( lex2.front.str, "5" ); > 347 lex2.popFront; > 348 lex3.popFront; > 349 assert( lex2.empty ); > 350 assert( !lex3.empty ); > 351 assert_eq( lex3.front.str, "5" ); > 352 } > 353 > 354 /// Forward range for reader character by character, > 355 /// keeping track of position information and caring \r\n -> \n conversion. > 356 > 357 private > 358 struct PositionedReader(CharSeq) > 359 if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) ) > 360 { > 361 CharSeq buffer; > 362 string filename; > 363 int lineno; > 364 int column; > 365 > 366 /// Range primitive > 367 bool empty() /*@property*/ > 368 { > 369 return buffer.empty; > 370 } > 371 > 372 /// Range primitive > 373 dchar front() /*@property*/ > 374 { > 375 dchar c = buffer.front; > 376 return (c=='\r' ? '\n' : c); > 377 } > 378 > 379 /// Range primitive > 380 void popFront() /*@property*/ > 381 { > 382 dchar c = buffer.front; > 383 buffer.popFront; > 384 if( c=='\r' ) > 385 { > 386 if( !buffer.empty && buffer.front=='\n' ) > 387 buffer.popFront; > 388 c = '\n'; > 389 } > 390 if( c=='\n' ) > 391 { > 392 lineno ++; > 393 column = 1; > 394 } > 395 else > 396 column ++; > 397 } > 398 > 399 /// Range primitive > 400 typeof(this) save() /*@property*/ > 401 { > 402 return this; > 403 } > 404 > 405 /// Get the current position > 406 immutable(LexPosition) currentPosition() const > 407 { > 408 return new immutable(LexPosition)(filename, lineno, column); > 409 } > 410 } > 411 > 412 unittest > 413 { > 414 assert( isForwardRange!(PositionedReader!string) ); > 415 assert( is(ElementType!(PositionedReader!string) == dchar) ); > 416 }