@@ -1,394 +1,416 @@ -/** - * Authors: k.inaba - * License: NYSL 0.9982 http://www.kmonos.net/nysl/ - * - * Lexer for Polemy programming language. - */ +/** + * Authors: k.inaba + * License: NYSL 0.9982 http://www.kmonos.net/nysl/ + * + * Lexer for Polemy programming language. + */ module polemy.lex; import polemy._common; +import std.file : readText; +import std.ctype : isspace, isalnum; -import std.file : readText; -import std.string : munch; -import std.ctype; +/// Exception from this module +class LexException : Exception +{ + this( const LexPosition pos, string msg ) + { super(sprintf!"%s [%s]"(msg, pos)); this.pos = pos; } + const LexPosition pos; +}; + /// Represents a position in a source code - + class LexPosition { immutable string filename; /// name of the source file immutable int lineno; /// line number, 1, 2, ... - immutable int column; /// column, 1, 2, ... - - override string toString() const - { return sprintf!"%s:%d:%d"(filename, lineno, column); } + immutable int column; /// column, 1, 2, ... - mixin SimpleConstructor; + override string toString() const + { return sprintf!"%s:%d:%d"(filename, lineno, column); } + + mixin SimpleConstructor; mixin SimpleCompare; } unittest { - auto p = new LexPosition("hello.cpp", 123, 45); - auto q = new LexPosition("hello.cpp", 123, 46); + auto p = new LexPosition("hello.cpp", 123, 45); + auto q = new LexPosition("hello.cpp", 123, 46); assert_eq( p.filename, "hello.cpp" ); assert_eq( p.lineno, 123 ); - assert_eq( p.column, 45 ); - assert_eq( to!string(p), "hello.cpp:123:45" ); - assert_lt( p, q ); - assert_ne( p, q ); + assert_eq( p.column, 45 ); + assert_eq( to!string(p), "hello.cpp:123:45" ); + assert_lt( p, q ); + assert_ne( p, q ); assert( !__traits(compiles, new LexPosition) ); assert( !__traits(compiles, p.filename="foo") ); assert( !__traits(compiles, p.lineno =789) ); assert( !__traits(compiles, p.column =222) ); } - + /// Represents a lexer token - + class Token -{ +{ immutable LexPosition pos; /// Position where the token occurred in the source - immutable string str; /// The token string itself - immutable bool quoted; /// Was it a "quoted" token or unquoted? - - mixin SimpleConstructor; + immutable string str; /// The token string itself + immutable bool quoted; /// Was it a "quoted" token or unquoted? + + mixin SimpleConstructor; mixin SimpleCompare; } unittest { auto p = new immutable(LexPosition)("hello.cpp", 123, 45); auto t = new Token(p, "class", false); - auto u = new Token(p, "class", true); + auto u = new Token(p, "class", true); assert_eq( t.pos, p ); - assert_eq( t.str, "class" ); - assert( !t.quoted ); - assert_eq( t, new Token(p, "class", false) ); - assert_lt( t, new Token(p, "struct", false) ); - assert_ne( t, u ); - assert( u.quoted ); + assert_eq( t.str, "class" ); + assert( !t.quoted ); + assert_eq( t, new Token(p, "class", false) ); + assert_lt( t, new Token(p, "struct", false) ); + assert_ne( t, u ); + assert( u.quoted ); assert( !__traits(compiles, new Token) ); assert( !__traits(compiles, t.pos=p) ); - assert( !__traits(compiles, t.str=789) ); - assert( !__traits(compiles, t.quoted=true) ); + assert( !__traits(compiles, t.str=789) ); + assert( !__traits(compiles, t.quoted=true) ); +} + +/// Named Construtor for Lexer + +auto lexerFromFile(T...)( string filename, T rest ) +{ + return lexerFromString( std.file.readText(filename), filename, rest ); +} + +/// Named Construtor for Lexer + +auto lexerFromString(CharSeq)( CharSeq str, string filename="", int lineno=1, int column=1 ) +{ + return new LexerT!(PositionedReader!CharSeq)( + PositionedReader!CharSeq(str, filename, lineno, column) + ); } + +/// Standard Lexer Type (all users have to know is that this is a forward range of Tokens) -/// Named Construtor for Lexer +alias LexerT!(PositionedReader!string) Lexer; + +/// Lexer Implementation -Lexer lexerFromFile(T...)( string filename, T rest ) -{ - return lexerFromString( std.file.readText(filename), filename, rest ); -} - -/// Named Construtor for Lexer - -Lexer lexerFromString( string str, string filename="", int lineno=1, int column=1 ) -{ - return new Lexer(str, filename, lineno, column); -} - -/// Lexer is a forward range of Tokens - -class Lexer -{ +class LexerT(Reader) + if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) ) +{ /// Range primitive bool empty() /*@property*/ { return current is null; } - /// Range primitive + /// Range primitive Token front() /*@property*/ { return std.exception.enforce(current, "Lexer has already reached the end"); } - /// Range primitive + /// Range primitive void popFront() /*@property*/ { - std.exception.enforce(current, "Lexer has already reached the end"); + std.exception.enforce(current, "Lexer has already reached the end"); current = readNext(); } - /// Range primitive - Lexer save() /*@property*/ + /// Range primitive + typeof(this) save() /*@property*/ { - return new Lexer(this.tupleof); + return new typeof(this)(reader.save, current); } private: // implementation - - string buffer; - string filename; - int lineno; - int column; + + Reader reader; Token current; invariant() - { - assert( buffer.empty || !std.ctype.isspace(buffer[0]) ); - } - - this( string buffer, string filename, int lineno, int column, Token current=null ) { - this.buffer = buffer; - this.filename = filename; - this.lineno = lineno; - this.column = column; - skipws(); - this.current = (current is null ? readNext() : current); - } - - void skipws() - { - bool progress = false; - do - { - string ws = buffer.munch(" \t"); - column += ws.length; - progress = !ws.empty; - while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') ) - { - progress = true; - if( buffer[0] == '\n' ) - buffer = buffer[1..$]; - else // if( buffer.front == '\r' ) - { - buffer = buffer[1..$]; - if( !buffer.empty && buffer[0]=='\n' ) - buffer = buffer[1..$]; - } - lineno ++; - column = 1; - } - }while( progress ); - } - - char readChar() - { - scope(exit) { - buffer = buffer[1..$]; - column ++; + assert( reader.empty || !std.ctype.isspace(reader.front) ); + } + + this( Reader reader, Token current = null ) + { + this.reader = reader; + readWhile!isSpace(); + this.current = (current is null ? readNext() : current); + } + + public static { + bool isSpace (dchar c) { return std.ctype.isspace(c)!=0; } + bool isSymbol (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; } + bool isSSymbol (dchar c) { return !find("()[]{};", c).empty; } + bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c); } + bool isLetter (dchar c) { return !isSpace(c) && !isSymbol(c); } + } + + string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);} + string readQuoted(const LexPosition pos, ref char[] buf) + { + if( reader.empty ) + throw new LexException(pos, "EOF found while lexing a quoted-string"); + dchar c = reader.front; + reader.popFront; + if( c == '"' ) + return assumeUnique(buf); + if( c == '\\' && !reader.empty ) { + if( reader.front=='"' ) { + reader.popFront; + return readQuoted(pos,buf ~= '\"'); + } + if( reader.front=='\\' ) { + reader.popFront; + return readQuoted(pos,buf ~= '\\'); + } } - return buffer[0]; + return readQuoted(pos,buf ~= c); } - /// This is the main lexing routine - Token readNext() + string readWhile(alias fn)() { - if( buffer.empty ) - return null; - scope(exit) - skipws(); + char[] buf; + for(; !reader.empty && fn(reader.front); reader.popFront) + buf ~= reader.front; + return assumeUnique(buf); + } - if( isSymbol(buffer[0]) ) + Token readNext() + { + if( reader.empty ) + return null; + scope(success) + readWhile!isSpace(); + if( reader.front == '#' ) // comment + { + reader = find(reader, '\n'); + readWhile!isSpace(); + return readNext(); + } + else if( reader.front == '"' ) // quoted + { + auto pos = reader.currentPosition(); + reader.popFront; + return new Token(pos, readQuoted(pos), true); + } + else if( isSSymbol(reader.front) ) // paren { - if( buffer[0] == '#' ) - { - // skip comment - while( !buffer.empty && (buffer[0]!='\n' && buffer[0]!='\r') ) - readChar(); - skipws(); - return readNext(); - } - else if( buffer[0] == '"' ) - { - // string literal - auto pos = currentPosition(); - string lit; - readChar(); - while( !buffer.empty && buffer[0]!='"' ) - { - // read one char - char c = readChar(); - if( c == '\\' ) - { - if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') ) - lit ~= readChar(); - else - lit ~= c; - } - else if( c == '\n' ) - { - lit ~= c; - lineno++; - column = 1; - } - else if( c == '\r' ) - { - if( !buffer.empty && buffer[0]=='\n' ) - readChar(); - lit ~= '\n'; - lineno++; - column = 1; - } - else - lit ~= c; - } - if( !buffer.empty ) - readChar(); - return new Token(pos, lit, true); - } - else - { - // normal symbol - auto pos = currentPosition(); - auto str = ""~readChar(); - return new Token(pos, str, false); - } + auto pos = reader.currentPosition(); + string s; s~=reader.front; reader.popFront; + return new Token(pos, s, false); + } + else if( isMSymbol(reader.front) ) // symbol + { + auto pos = reader.currentPosition(); + return new Token(pos, readWhile!isMSymbol(), false); } else { - auto pos = currentPosition(); - int i = 0; - while( i \n conversion. + +private +struct PositionedReader(CharSeq) + if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) ) { -//!! be sure to run the unittest on the root of the source directory - auto lexf = lexerFromFile("polemy/lex.d"); - lexf = find!`a.str == "module"`(lexf); - assert_eq( lexf.front.str, "module" ); - assert_eq( lexf.front.pos.filename, "polemy/lex.d" ); - assert_eq( lexf.front.pos.lineno, 7 ); - assert_eq( lexf.front.pos.column, 1 ); - lexf.popFront; - assert_eq( lexf.front.str, "polemy" ); - assert_eq( lexf.front.pos.lineno, 7 ); - assert_eq( lexf.front.pos.column, 8 ); - lexf.popFront; - assert_eq( lexf.front.str, "." ); - lexf.popFront; - assert_eq( lexf.front.str, "lex" ); - lexf.popFront; - assert_eq( lexf.front.str, ";" ); - lexf.popFront; - assert_eq( lexf.front.str, "import" ); - assert_eq( lexf.front.pos.lineno, 8 ); - assert_eq( lexf.front.pos.column, 1 ); + CharSeq buffer; + string filename; + int lineno; + int column; + + /// Range primitive + bool empty() /*@property*/ + { + return buffer.empty; + } + + /// Range primitive + dchar front() /*@property*/ + { + dchar c = buffer.front; + return (c=='\r' ? '\n' : c); + } + + /// Range primitive + void popFront() /*@property*/ + { + dchar c = buffer.front; + buffer.popFront; + if( c=='\r' ) + { + if( !buffer.empty && buffer.front=='\n' ) + buffer.popFront; + c = '\n'; + } + if( c=='\n' ) + { + lineno ++; + column = 1; + } + else + column ++; + } + + /// Range primitive + typeof(this) save() /*@property*/ + { + return this; + } + + /// Get the current position + immutable(LexPosition) currentPosition() const + { + return new immutable(LexPosition)(filename, lineno, column); + } } unittest { - auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!! -be ignored. -hahaha"hihihi""hu\\\"huhu"#123 aa -123 aa "aaa`~"\r\n"~`bbb # 123`~"\r\n"~`eee" -zzz -`); - Token[] ts = std.array.array(lex); - assert_eq( ts[0].str, "my" ); - assert_eq( ts[0].pos.lineno, 1 ); - assert( !ts[0].quoted ); - assert_eq( ts[1].str, "be" ); - assert_eq( ts[1].pos.lineno, 3 ); - assert( !ts[1].quoted ); - assert_eq( ts[2].str, "ignored" ); - assert( !ts[2].quoted ); - assert_eq( ts[3].str, "." ); - assert( !ts[3].quoted ); - assert_eq( ts[4].str, "hahaha" ); - assert_eq( ts[4].pos.lineno, 4 ); - assert( !ts[4].quoted ); - assert_eq( ts[5].str, "hihihi" ); - assert_eq( ts[5].pos.lineno, 4 ); - assert( ts[5].quoted ); - assert_eq( ts[6].str, `hu\"huhu` ); - assert_eq( ts[6].pos.lineno, 4 ); - assert( ts[6].quoted ); - assert_eq( ts[7].str, "123" ); - assert_eq( ts[7].pos.lineno, 5 ); - assert_eq( ts[8].str, "aa" ); - assert_eq( ts[9].pos.lineno, 5 ); - assert_eq( ts[9].str, "aaa\nbbb # 123\neee" ); - assert( ts[9].quoted ); - assert_eq( ts[10].pos.lineno, 8 ); - assert( !ts[10].quoted ); - assert_eq( ts.length, 11 ); + assert( isForwardRange!(PositionedReader!string) ); + assert( is(ElementType!(PositionedReader!string) == dchar) ); }