Diff
Not logged in

Differences From Artifact [0972f7a454ea8e4f]:

To Artifact [5f52873e3ff7ae30]:


2 2 * Authors: k.inaba 3 3 * License: NYSL 0.9982 http://www.kmonos.net/nysl/ 4 4 * 5 5 * Lexer for Polemy programming language. 6 6 */ 7 7 module polemy.lex; 8 8 import polemy._common; 9 +import std.file : readText; 10 +import std.ctype : isspace, isalnum; 9 11 10 -import std.file : readText; 11 -import std.string : munch; 12 -import std.ctype; 12 +/// Exception from this module 13 + 14 +class LexException : Exception 15 +{ 16 + this( const LexPosition pos, string msg ) 17 + { super(sprintf!"%s [%s]"(msg, pos)); this.pos = pos; } 18 + const LexPosition pos; 19 +}; 13 20 14 21 /// Represents a position in a source code 15 22 16 23 class LexPosition 17 24 { 18 25 immutable string filename; /// name of the source file 19 26 immutable int lineno; /// line number, 1, 2, ... ................................................................................ 74 81 assert( !__traits(compiles, t.pos=p) ); 75 82 assert( !__traits(compiles, t.str=789) ); 76 83 assert( !__traits(compiles, t.quoted=true) ); 77 84 } 78 85 79 86 /// Named Construtor for Lexer 80 87 81 -Lexer lexerFromFile(T...)( string filename, T rest ) 88 +auto lexerFromFile(T...)( string filename, T rest ) 82 89 { 83 90 return lexerFromString( std.file.readText(filename), filename, rest ); 84 91 } 85 92 86 93 /// Named Construtor for Lexer 87 94 88 -Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 ) 95 +auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 ) 89 96 { 90 - return new Lexer(str, filename, lineno, column); 97 + return new LexerT!(PositionedReader!CharSeq)( 98 + PositionedReader!CharSeq(str, filename, lineno, column) 99 + ); 91 100 } 92 101 93 -/// Lexer is a forward range of Tokens 102 +/// Standard Lexer Type (all users have to know is that this is a forward range of Tokens) 94 103 95 -class Lexer 104 +alias LexerT!(PositionedReader!string) Lexer; 105 + 106 +/// Lexer Implementation 107 + 108 +class LexerT(Reader) 109 + if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) ) 96 110 { 97 111 /// Range primitive 98 112 bool empty() /*@property*/ 99 113 { 100 114 return current is null; 101 115 } 102 116 ................................................................................ 110 124 void popFront() /*@property*/ 111 125 { 112 126 std.exception.enforce(current, "Lexer has already reached the end"); 113 127 current = readNext(); 114 128 } 115 129 116 130 /// Range primitive 117 - Lexer save() /*@property*/ 131 + typeof(this) save() /*@property*/ 118 132 { 119 - return new Lexer(this.tupleof); 133 + return new typeof(this)(reader.save, current); 120 134 } 121 135 122 136 private: // implementation 123 137 124 - string buffer; 125 - string filename; 126 - int lineno; 127 - int column; 138 + Reader reader; 128 139 Token current; 129 140 130 141 invariant() 131 142 { 132 - assert( buffer.empty || !std.ctype.isspace(buffer[0]) ); 143 + assert( reader.empty || !std.ctype.isspace(reader.front) ); 144 + } 145 + 146 + this( Reader reader, Token current = null ) 147 + { 148 + this.reader = reader; 149 + readWhile!isSpace(); 150 + this.current = (current is null ? readNext() : current); 151 + } 152 + 153 + public static { 154 + bool isSpace (dchar c) { return std.ctype.isspace(c)!=0; } 155 + bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; } 156 + bool isSSymbol (dchar c) { return !find("()[]{};", c).empty; } 157 + bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c); } 158 + bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); } 133 159 } 134 160 135 - this( string buffer, string filename, int lineno, int column, Token current=null ) 161 + string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);} 162 + string readQuoted(const LexPosition pos, ref char[] buf) 136 163 { 137 - this.buffer = buffer; 138 - this.filename = filename; 139 - this.lineno = lineno; 140 - this.column = column; 141 - skipws(); 142 - this.current = (current is null ? readNext() : current); 164 + if( reader.empty ) 165 + throw new LexException(pos, "EOF found while lexing a quoted-string"); 166 + dchar c = reader.front; 167 + reader.popFront; 168 + if( c == '"' ) 169 + return assumeUnique(buf); 170 + if( c == '\\' && !reader.empty ) { 171 + if( reader.front=='"' ) { 172 + reader.popFront; 173 + return readQuoted(pos,buf ~= '\"'); 174 + } 175 + if( reader.front=='\\' ) { 176 + reader.popFront; 177 + return readQuoted(pos,buf ~= '\\'); 178 + } 179 + } 180 + return readQuoted(pos,buf ~= c); 143 181 } 144 182 145 - void skipws() 183 + string readWhile(alias fn)() 146 184 { 147 - bool progress = false; 148 - do 149 - { 150 - string ws = buffer.munch(" \t"); 151 - column += ws.length; 152 - progress = !ws.empty; 153 - while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') ) 154 - { 155 - progress = true; 156 - if( buffer[0] == '\n' ) 157 - buffer = buffer[1..$]; 158 - else // if( buffer.front == '\r' ) 159 - { 160 - buffer = buffer[1..$]; 161 - if( !buffer.empty && buffer[0]=='\n' ) 162 - buffer = buffer[1..$]; 163 - } 164 - lineno ++; 165 - column = 1; 166 - } 167 - }while( progress ); 185 + char[] buf; 186 + for(; !reader.empty && fn(reader.front); reader.popFront) 187 + buf ~= reader.front; 188 + return assumeUnique(buf); 168 189 } 169 190 170 - char readChar() 171 - { 172 - scope(exit) { 173 - buffer = buffer[1..$]; 174 - column ++; 175 - } 176 - return buffer[0]; 177 - } 178 - 179 - /// This is the main lexing routine 180 191 Token readNext() 181 192 { 182 - if( buffer.empty ) 193 + if( reader.empty ) 183 194 return null; 184 - scope(exit) 185 - skipws(); 186 - 187 - if( isSymbol(buffer[0]) ) 195 + scope(success) 196 + readWhile!isSpace(); 197 + if( reader.front == '#' ) // comment 198 + { 199 + reader = find(reader, '\n'); 200 + readWhile!isSpace(); 201 + return readNext(); 202 + } 203 + else if( reader.front == '"' ) // quoted 204 + { 205 + auto pos = reader.currentPosition(); 206 + reader.popFront; 207 + return new Token(pos, readQuoted(pos), true); 208 + } 209 + else if( isSSymbol(reader.front) ) // paren 210 + { 211 + auto pos = reader.currentPosition(); 212 + string s; s~=reader.front; reader.popFront; 213 + return new Token(pos, s, false); 214 + } 215 + else if( isMSymbol(reader.front) ) // symbol 188 216 { 189 - if( buffer[0] == '#' ) 190 - { 191 - // skip comment 192 - while( !buffer.empty && (buffer[0]!='\n' && buffer[0]!='\r') ) 193 - readChar(); 194 - skipws(); 195 - return readNext(); 196 - } 197 - else if( buffer[0] == '"' ) 198 - { 199 - // string literal 200 - auto pos = currentPosition(); 201 - string lit; 202 - readChar(); 203 - while( !buffer.empty && buffer[0]!='"' ) 204 - { 205 - // read one char 206 - char c = readChar(); 207 - if( c == '\\' ) 208 - { 209 - if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') ) 210 - lit ~= readChar(); 211 - else 212 - lit ~= c; 213 - } 214 - else if( c == '\n' ) 215 - { 216 - lit ~= c; 217 - lineno++; 218 - column = 1; 219 - } 220 - else if( c == '\r' ) 221 - { 222 - if( !buffer.empty && buffer[0]=='\n' ) 223 - readChar(); 224 - lit ~= '\n'; 225 - lineno++; 226 - column = 1; 227 - } 228 - else 229 - lit ~= c; 230 - } 231 - if( !buffer.empty ) 232 - readChar(); 233 - return new Token(pos, lit, true); 234 - } 235 - else 236 - { 237 - // normal symbol 238 - auto pos = currentPosition(); 239 - auto str = ""~readChar(); 240 - return new Token(pos, str, false); 241 - } 217 + auto pos = reader.currentPosition(); 218 + return new Token(pos, readWhile!isMSymbol(), false); 242 219 } 243 220 else 244 221 { 245 - auto pos = currentPosition(); 246 - int i = 0; 247 - while( i<buffer.length && !std.ctype.isspace(buffer[i]) && !isSymbol(buffer[i]) ) 248 - ++i; 249 - auto str = buffer[0 .. i]; 250 - buffer = buffer[i .. $]; 251 - column += i; 252 - return new Token(pos, str, false); 222 + auto pos = reader.currentPosition(); 223 + return new Token(pos, readWhile!isLetter(), false); 253 224 } 254 225 } 255 - 256 - bool isSymbol(char c) 257 - { 258 - return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_'); 259 - } 260 - 261 - immutable(LexPosition) currentPosition() 262 - { 263 - return new immutable(LexPosition)(filename, lineno, column); 264 - } 265 226 } 266 227 267 228 unittest 268 229 { 269 230 assert( std.range.isForwardRange!(Lexer) ); 270 231 } 271 232 272 233 unittest 273 234 { 274 - auto lex = lexerFromString("this is a \t\r\n pen :-( "); 235 + auto lex = lexerFromString("this is a \t\r\n pen :-( @@; "); 275 236 Token[] ts = std.array.array(lex); 276 237 277 238 assert_eq( ts[0].pos.lineno, 1 ); 278 239 assert_eq( ts[0].pos.column, 1 ); 279 240 assert( !ts[0].quoted ); 280 241 assert_eq( ts[0].str, "this" ); 281 242 ................................................................................ 290 251 assert_eq( ts[2].str, "a" ); 291 252 292 253 assert_eq( ts[3].pos.lineno, 2 ); 293 254 assert_eq( ts[3].pos.column, 2 ); 294 255 assert( !ts[3].quoted ); 295 256 assert_eq( ts[3].str, "pen" ); 296 257 297 - // consecutive symbols are always separated 298 - // hence, no "++" or "<<" or ... 299 - 300 258 assert_eq( ts[4].pos.lineno, 2 ); 301 259 assert_eq( ts[4].pos.column, 6 ); 302 - assert_eq( ts[4].str, ":" ); 260 + assert_eq( ts[4].str, ":-" ); 303 261 304 262 assert_eq( ts[5].pos.lineno, 2 ); 305 - assert_eq( ts[5].pos.column, 7 ); 306 - assert_eq( ts[5].str, "-" ); 263 + assert_eq( ts[5].pos.column, 8 ); 264 + assert_eq( ts[5].str, "(" ); 265 + assert_eq( ts[6].str, "@@" ); 266 + assert_eq( ts[7].str, ";" ); // paren and simicolons are split 307 267 308 - assert_eq( ts[6].pos.lineno, 2 ); 309 - assert_eq( ts[6].pos.column, 8 ); 310 - assert_eq( ts[6].str, "(" ); 311 - 312 - assert_eq( ts.length, 7 ); 268 + assert_eq( ts.length, 8 ); 313 269 } 314 270 315 271 unittest 316 272 { 317 - auto lex2 = lexerFromString(" a12\n3a 5 "); 318 - assert_eq( lex2.front.str, "a12" ); 319 - lex2.popFront; 320 - auto lex3 = lex2.save; 321 - assert_eq( lex2.front.str, "3a" ); 322 - lex2.popFront; 323 - assert_eq( lex3.front.str, "3a" ); 324 - assert_eq( lex2.front.str, "5" ); 325 - lex2.popFront; 326 - lex3.popFront; 327 - assert( lex2.empty ); 328 - assert( !lex3.empty ); 329 - assert_eq( lex3.front.str, "5" ); 330 -} 331 - 332 -unittest 333 -{ 334 -//!! be sure to run the unittest on the root of the source directory 273 + // !! be sure to run the unittest on the root of the source directory 335 274 auto lexf = lexerFromFile("polemy/lex.d"); 336 275 lexf = find!`a.str == "module"`(lexf); 337 276 assert_eq( lexf.front.str, "module" ); 338 277 assert_eq( lexf.front.pos.filename, "polemy/lex.d" ); 339 278 assert_eq( lexf.front.pos.lineno, 7 ); 340 279 assert_eq( lexf.front.pos.column, 1 ); 341 280 lexf.popFront; 342 281 assert_eq( lexf.front.str, "polemy" ); 343 282 assert_eq( lexf.front.pos.lineno, 7 ); 344 283 assert_eq( lexf.front.pos.column, 8 ); 345 284 lexf.popFront; 346 - assert_eq( lexf.front.str, "." ); 347 285 lexf.popFront; 348 - assert_eq( lexf.front.str, "lex" ); 349 286 lexf.popFront; 350 - assert_eq( lexf.front.str, ";" ); 351 287 lexf.popFront; 352 288 assert_eq( lexf.front.str, "import" ); 353 289 assert_eq( lexf.front.pos.lineno, 8 ); 354 290 assert_eq( lexf.front.pos.column, 1 ); 355 291 } 292 + 293 +unittest 294 +{ 295 + assert_throw!LexException( lexerFromString(`"`) ); 296 +} 356 297 357 298 unittest 358 299 { 359 300 auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!! 360 301 be ignored. 361 302 hahaha"hihihi""hu\\\"huhu"#123 aa 362 -123 aa "aaa`~"\r\n"~`bbb # 123`~"\r\n"~`eee" 303 +123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee" 363 304 zzz 364 305 `); 365 306 Token[] ts = std.array.array(lex); 366 307 assert_eq( ts[0].str, "my" ); 367 308 assert_eq( ts[0].pos.lineno, 1 ); 368 309 assert( !ts[0].quoted ); 369 310 assert_eq( ts[1].str, "be" ); ................................................................................ 388 329 assert_eq( ts[9].pos.lineno, 5 ); 389 330 assert_eq( ts[9].str, "aaa\nbbb # 123\neee" ); 390 331 assert( ts[9].quoted ); 391 332 assert_eq( ts[10].pos.lineno, 8 ); 392 333 assert( !ts[10].quoted ); 393 334 assert_eq( ts.length, 11 ); 394 335 } 336 + 337 +unittest 338 +{ 339 + auto lex2 = lexerFromString(" a12\n3a 5 "); 340 + assert_eq( lex2.front.str, "a12" ); 341 + lex2.popFront; 342 + auto lex3 = lex2.save; 343 + assert_eq( lex2.front.str, "3a" ); 344 + lex2.popFront; 345 + assert_eq( lex3.front.str, "3a" ); 346 + assert_eq( lex2.front.str, "5" ); 347 + lex2.popFront; 348 + lex3.popFront; 349 + assert( lex2.empty ); 350 + assert( !lex3.empty ); 351 + assert_eq( lex3.front.str, "5" ); 352 +} 353 + 354 +/// Forward range for reader character by character, 355 +/// keeping track of position information and caring \r\n -> \n conversion. 356 + 357 +private 358 +struct PositionedReader(CharSeq) 359 + if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) ) 360 +{ 361 + CharSeq buffer; 362 + string filename; 363 + int lineno; 364 + int column; 365 + 366 + /// Range primitive 367 + bool empty() /*@property*/ 368 + { 369 + return buffer.empty; 370 + } 371 + 372 + /// Range primitive 373 + dchar front() /*@property*/ 374 + { 375 + dchar c = buffer.front; 376 + return (c=='\r' ? '\n' : c); 377 + } 378 + 379 + /// Range primitive 380 + void popFront() /*@property*/ 381 + { 382 + dchar c = buffer.front; 383 + buffer.popFront; 384 + if( c=='\r' ) 385 + { 386 + if( !buffer.empty && buffer.front=='\n' ) 387 + buffer.popFront; 388 + c = '\n'; 389 + } 390 + if( c=='\n' ) 391 + { 392 + lineno ++; 393 + column = 1; 394 + } 395 + else 396 + column ++; 397 + } 398 + 399 + /// Range primitive 400 + typeof(this) save() /*@property*/ 401 + { 402 + return this; 403 + } 404 + 405 + /// Get the current position 406 + immutable(LexPosition) currentPosition() const 407 + { 408 + return new immutable(LexPosition)(filename, lineno, column); 409 + } 410 +} 411 + 412 +unittest 413 +{ 414 + assert( isForwardRange!(PositionedReader!string) ); 415 + assert( is(ElementType!(PositionedReader!string) == dchar) ); 416 +}