Artifact Content
Not logged in

Artifact fb4085e84f38ed6aa249b86ab223d47783a538df


     1  /**
     2   * Authors: k.inaba
     3   * License: NYSL 0.9982 http://www.kmonos.net/nysl/
     4   *
     5   * Lexer for Polemy programming language.
     6   */
     7  module polemy.lex;
     8  import polemy._common;
     9  
    10  import std.file : readText;
    11  import std.string : munch;
    12  import std.ctype;
    13  
    14  /// Represents a position in a source code
    15  
    16  class LexPosition
    17  {
    18  	immutable string filename; /// name of the source file
    19  	immutable int    lineno;   /// line number, 1, 2, ...
    20  	immutable int    column;   /// column, 1, 2, ...
    21  
    22  	override string toString() const
    23  		{ return sprintf!"%s:%d:%d"(filename, lineno, column); }
    24  
    25  	mixin SimpleConstructor;
    26  	mixin SimpleCompare;
    27  }
    28  
    29  unittest
    30  {
    31  	auto p = new LexPosition("hello.cpp", 123, 45);
    32  	auto q = new LexPosition("hello.cpp", 123, 46);
    33  
    34  	assert( p.filename == "hello.cpp" );
    35  	assert( p.lineno == 123 );
    36  	assert( p.column == 45 );
    37  	assert( to!string(p) == "hello.cpp:123:45" );
    38  	assert( p < q );
    39  	assert( p != q );
    40  
    41  	assert( !__traits(compiles, new LexPosition) );
    42  	assert( !__traits(compiles, p.filename="foo") );
    43  	assert( !__traits(compiles, p.lineno  =789) );
    44  	assert( !__traits(compiles, p.column  =222) );
    45  }
    46  
    47  /// Represents a lexer token
    48  
    49  class Token
    50  {
    51  	enum Kind {identifier, stringLiteral, number};
    52  	immutable LexPosition pos;  /// position where the token occurred in the source
    53  	immutable string      str;  /// the token string itself
    54  	immutable Kind        kind; /// which kind of token?
    55  
    56  	mixin SimpleConstructor;
    57  	mixin SimpleCompare;
    58  }
    59  
    60  unittest
    61  {
    62  	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
    63  	auto t = new Token(p, "class", Token.Kind.identifier);
    64  
    65  	assert( t.pos == p );
    66  	assert( t.str == "class" );
    67  	assert( t == new Token(p, "class", Token.Kind.identifier) );
    68  	assert( t < new Token(p, "struct", Token.Kind.identifier) );
    69  
    70  	assert( !__traits(compiles, new Token) );
    71  	assert( !__traits(compiles, t.pos=p) );
    72  	assert( !__traits(compiles, t.str=789) );
    73  }
    74  
    75  /// Named Construtor for Lexer
    76  
    77  Lexer lexerFromFile(T...)( string filename, T rest )
    78  {
    79  	return lexerFromString( std.file.readText(filename), filename, rest );
    80  }
    81  	
    82  /// Named Construtor for Lexer
    83  
    84  Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 )
    85  {
    86  	return new Lexer(str, filename, lineno, column);
    87  }
    88  
    89  /// Lexer is a forward range of Tokens
    90  
    91  class Lexer
    92  {
    93  	/// Range primitive
    94  	bool empty() /*@property*/
    95  	{
    96  		return current is null;
    97  	}
    98  
    99  	/// Range primitive
   100  	Token front() /*@property*/
   101  	{
   102  		return std.exception.enforce(current, "Lexer has already reached the end");
   103  	}
   104  
   105  	/// Range primitive
   106  	void popFront() /*@property*/
   107  	{
   108  		std.exception.enforce(current, "Lexer has already reached the end");
   109  		current = readNext();
   110  	}
   111  
   112  	/// Range primitive
   113  	Lexer save() /*@property*/
   114  	{
   115  		return new Lexer(buffer, filename, lineno, column, current);
   116  	}
   117  
   118  private: // implementation
   119  
   120  	string buffer;
   121  	string filename;
   122  	int    lineno;
   123  	int    column;
   124  	Token  current;
   125  
   126  	invariant()
   127  	{
   128  		assert( buffer.empty || !std.ctype.isspace(buffer[0]) );
   129  	}
   130  
   131  	this( string buffer, string filename, int lineno, int column, Token current=null )
   132  	{
   133  		this.buffer   = buffer;
   134  		this.filename = filename;
   135  		this.lineno   = lineno;
   136  		this.column   = column;
   137  		skipws();
   138  		this.current  = (current is null ? readNext() : current);
   139  	}
   140  
   141  	void skipws()
   142  	{
   143  		bool progress = false;
   144  		do
   145  		{
   146  			string ws = buffer.munch(" \t");
   147  			column += ws.length;
   148  			progress = !ws.empty;
   149  			while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') )
   150  			{
   151  				progress = true;
   152  				if( buffer[0] == '\n' )
   153  					buffer = buffer[1..$];
   154  				else // if( buffer.front == '\r' )
   155  				{
   156  					buffer = buffer[1..$];
   157  					if( !buffer.empty && buffer[0]=='\n' )
   158  						buffer = buffer[1..$];
   159  				}
   160  				lineno ++;
   161  				column = 1;
   162  			}
   163  		}while( progress );
   164  	}
   165  
   166  	char readChar()
   167  	{
   168  		scope(exit) {
   169  			buffer = buffer[1..$];
   170  			column ++;
   171  		}
   172  		return buffer[0];
   173  	}
   174  
   175  	/// This is the main lexing routine
   176  	Token readNext()
   177  	{
   178  		if( buffer.empty )
   179  			return null;
   180  		scope(exit)
   181  			skipws();
   182  
   183  		if( isSymbol(buffer[0]) )
   184  		{
   185  			if( buffer[0] == '#' )
   186  			{
   187  				// skip comment
   188  				while( !buffer.empty && (buffer[0]!='\n' && buffer[0]!='\r') )
   189  					readChar();
   190  				skipws();
   191  				return readNext();
   192  			}
   193  			else if( buffer[0] == '"' )
   194  			{
   195  				// string literal
   196  				auto pos = currentPosition();
   197  				string lit;
   198  				readChar();
   199  				while( !buffer.empty && buffer[0]!='"' )
   200  				{
   201  					// read one char
   202  					char c = readChar();
   203  					if( c == '\\' )
   204  					{
   205  						if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') )
   206  							lit ~= readChar();
   207  						else
   208  							lit ~= c;
   209  					}
   210  					else if( c == '\n' )
   211  					{
   212  						lit ~= c;
   213  						lineno++;
   214  						column = 1;
   215  					}
   216  					else if( c == '\r' )
   217  					{
   218  						if( !buffer.empty && buffer[0]=='\n' )
   219  							readChar();
   220  						lit ~= '\n';
   221  						lineno++;
   222  						column = 1;
   223  					}
   224  					else
   225  						lit ~= c;
   226  				}
   227  				if( !buffer.empty )
   228  					readChar();
   229  				return new Token(pos, lit, Token.Kind.stringLiteral);
   230  			}
   231  			else
   232  			{
   233  				// normal symbol
   234  				auto pos = currentPosition();
   235  				auto str = ""~readChar();
   236  				return new Token(pos, str, Token.Kind.identifier);
   237  			}
   238  		}
   239  		else
   240  		{
   241  			auto pos = currentPosition();
   242  			int i = 0;
   243  			while( i<buffer.length && !std.ctype.isspace(buffer[i]) && !isSymbol(buffer[i]) )
   244  				++i;
   245  			auto str = buffer[0 .. i];
   246  			buffer   = buffer[i .. $];
   247  			column  += i;
   248  			bool isNumber = find!(`a<'0' || '9'<a`)(str).empty;
   249  			return new Token(pos, str, isNumber ? Token.Kind.number : Token.Kind.identifier);
   250  		}
   251  	}
   252  
   253  	bool isSymbol(char c)
   254  	{
   255  		return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_');
   256  	}
   257  
   258  	immutable(LexPosition) currentPosition()
   259  	{
   260  		return new immutable(LexPosition)(filename, lineno, column);
   261  	}
   262  }
   263  
   264  unittest
   265  {
   266  	assert( std.range.isForwardRange!(Lexer) );
   267  }
   268  
   269  unittest
   270  {
   271  	auto lex = lexerFromString("this	is a \t\n pen :-(   ");
   272  	Token[] ts = std.array.array(lex);
   273  
   274  	assert( ts[0].pos.lineno == 1 );
   275  	assert( ts[0].pos.column == 1 );
   276  	assert( ts[0].kind == Token.Kind.identifier );
   277  	assert( ts[0].str == "this" );
   278  
   279  	assert( ts[1].pos.lineno == 1 );
   280  	assert( ts[1].pos.column == 6 );
   281  	assert( ts[1].kind == Token.Kind.identifier );
   282  	assert( ts[1].str == "is" );
   283  
   284  	assert( ts[2].pos.lineno == 1 );
   285  	assert( ts[2].pos.column == 9 );
   286  	assert( ts[2].kind == Token.Kind.identifier );
   287  	assert( ts[2].str == "a" );
   288  
   289  	assert( ts[3].pos.lineno == 2 );
   290  	assert( ts[3].pos.column == 2 );
   291  	assert( ts[3].kind == Token.Kind.identifier );
   292  	assert( ts[3].str == "pen" );
   293  
   294  	// consecutive symbols are always separated
   295  	// hence, no "++" or "<<" or ...
   296  		
   297  	assert( ts[4].pos.lineno == 2 );
   298  	assert( ts[4].pos.column == 6 );
   299  	assert( ts[4].str == ":" );
   300  
   301  	assert( ts[5].pos.lineno == 2 );
   302  	assert( ts[5].pos.column == 7 );
   303  	assert( ts[5].str == "-" ); 
   304  
   305  	assert( ts[6].pos.lineno == 2 );
   306  	assert( ts[6].pos.column == 8 );
   307  	assert( ts[6].str == "(" );
   308  
   309  	assert( ts.length == 7 );
   310  }
   311  
   312  unittest
   313  {
   314  	auto lex2 = lexerFromString(" a12\n3a 5 ");
   315  	assert( lex2.front.str == "a12" );
   316  	assert( lex2.front.kind == Token.Kind.identifier );
   317  	lex2.popFront;
   318  	auto lex3 = lex2.save;
   319  	assert( lex2.front.str == "3a" );
   320  	assert( lex2.front.kind == Token.Kind.identifier );
   321  	lex2.popFront;
   322  	assert( lex3.front.str == "3a" );
   323  	assert( lex3.front.kind == Token.Kind.identifier );
   324  	assert( lex2.front.str == "5" );
   325  	assert( lex2.front.kind == Token.Kind.number );
   326  	lex2.popFront;
   327  	lex3.popFront;
   328  	assert( lex2.empty );
   329  	assert( !lex3.empty );
   330  	assert( lex3.front.str == "5" );
   331  	assert( lex3.front.kind == Token.Kind.number );
   332  }
   333  
   334  unittest
   335  {
   336  //!! be sure to run the unittest on the root of the source directory
   337  	auto lexf = lexerFromFile("polemy/lex.d");	
   338  	lexf = find!`a.str == "module"`(lexf);
   339  	assert( lexf.front.str == "module", lexf.front.str );
   340  	assert( lexf.front.pos.filename == "polemy/lex.d" );
   341  	assert( lexf.front.pos.lineno == 7 );
   342  	assert( lexf.front.pos.column == 1 );
   343  	lexf.popFront;
   344  	assert( lexf.front.str == "polemy" );
   345  	assert( lexf.front.pos.lineno == 7 );
   346  	assert( lexf.front.pos.column == 8 );
   347  	lexf.popFront;
   348  	assert( lexf.front.str == "." );
   349  	lexf.popFront;
   350  	assert( lexf.front.str == "lex" );
   351  	lexf.popFront;
   352  	assert( lexf.front.str == ";" );
   353  	lexf.popFront;
   354  	assert( lexf.front.str == "import" );
   355  	assert( lexf.front.pos.lineno == 8 );
   356  	assert( lexf.front.pos.column == 1 );
   357  }
   358  
   359  unittest
   360  {
   361  	auto lex = lexerFromString(`my # comment should
   362  # hey!!
   363  be ignored.
   364  hahaha"hihihi""hu\\\"huhu"#123 aa
   365  123 aa "aaa
   366  bbb # 123
   367  eee"
   368  zzz
   369  `);
   370  	Token[] ts = std.array.array(lex);
   371  	assert( ts[0].str == "my" );
   372  	assert( ts[0].pos.lineno == 1 );
   373  	assert( ts[1].str == "be" );
   374  	assert( ts[1].pos.lineno == 3 );
   375  	assert( ts[2].str == "ignored" );
   376  	assert( ts[3].str == "." );
   377  	assert( ts[4].str == "hahaha" );
   378  	assert( ts[4].pos.lineno == 4 );
   379  	assert( ts[4].kind == Token.Kind.identifier );
   380  	assert( ts[5].str == "hihihi" );
   381  	assert( ts[5].pos.lineno == 4 );
   382  	assert( ts[5].kind == Token.Kind.stringLiteral );
   383  	assert( ts[6].str == `hu\"huhu` );
   384  	assert( ts[6].kind == Token.Kind.stringLiteral );
   385  	assert( ts[6].pos.lineno == 4 );
   386  	assert( ts[7].str == "123" );
   387  	assert( ts[7].pos.lineno == 5 );
   388  	assert( ts[7].kind == Token.Kind.number );
   389  	assert( ts[8].str == "aa" );
   390  	assert( ts[9].pos.lineno == 5 );
   391  	assert( ts[9].str == "aaa\nbbb # 123\neee" );
   392  	assert( ts[9].kind == Token.Kind.stringLiteral );
   393  	assert( ts[10].pos.lineno == 8 );
   394  	assert( ts.length == 11 );
   395  }