Artifact Content
Not logged in

Artifact 783ee3b0fe58558e4c2462dcf472bf2834ce73dc


     1  /**
     2   * Authors: k.inaba
     3   * License: NYSL 0.9982 http://www.kmonos.net/nysl/
     4   *
     5   * Lexer for Polemy programming language.
     6   */
     7  module polemy.lex;
     8  import polemy._common;
     9  
    10  import std.file : readText;
    11  import std.string : munch;
    12  import std.ctype;
    13  
    14  /// Represents a position in a source code
    15  
    16  class LexPosition
    17  {
    18  	immutable string filename; /// name of the source file
    19  	immutable int    lineno;   /// line number, 1, 2, ...
    20  	immutable int    column;   /// column, 1, 2, ...
    21  
    22  	override string toString() const
    23  		{ return sprintf!"%s:%d:%d"(filename, lineno, column); }
    24  
    25  	mixin SimpleConstructor;
    26  	mixin SimpleCompare;
    27  }
    28  
    29  unittest
    30  {
    31  	auto p = new LexPosition("hello.cpp", 123, 45);
    32  	auto q = new LexPosition("hello.cpp", 123, 46);
    33  
    34  	assert_eq( p.filename, "hello.cpp" );
    35  	assert_eq( p.lineno, 123 );
    36  	assert_eq( p.column, 45 );
    37  	assert_eq( to!string(p), "hello.cpp:123:45" );
    38  	assert_lt( p, q );
    39  	assert_ne( p, q );
    40  
    41  	assert( !__traits(compiles, new LexPosition) );
    42  	assert( !__traits(compiles, p.filename="foo") );
    43  	assert( !__traits(compiles, p.lineno  =789) );
    44  	assert( !__traits(compiles, p.column  =222) );
    45  }
    46  
    47  /// Represents a lexer token
    48  
    49  class Token
    50  {
    51  	/// currently we have three kinds of token
    52  	enum Kind {
    53  		identifier, /// anything other than others
    54  		stringLiteral, /// "string literal"
    55  		number /// 42
    56  	};
    57  	immutable LexPosition pos;  /// position where the token occurred in the source
    58  	immutable string      str;  /// the token string itself
    59  	immutable Kind        kind; /// which kind of token?
    60  
    61  	mixin SimpleConstructor;
    62  	mixin SimpleCompare;
    63  }
    64  
    65  unittest
    66  {
    67  	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
    68  	auto t = new Token(p, "class", Token.Kind.identifier);
    69  
    70  	assert_eq( t.pos, p );
    71  	assert_eq( t.str, "class" );
    72  	assert_eq( t, new Token(p, "class", Token.Kind.identifier) );
    73  	assert_lt( t, new Token(p, "struct", Token.Kind.identifier) );
    74  
    75  	assert( !__traits(compiles, new Token) );
    76  	assert( !__traits(compiles, t.pos=p) );
    77  	assert( !__traits(compiles, t.str=789) );
    78  }
    79  
    80  /// Named Construtor for Lexer
    81  
    82  Lexer lexerFromFile(T...)( string filename, T rest )
    83  {
    84  	return lexerFromString( std.file.readText(filename), filename, rest );
    85  }
    86  	
    87  /// Named Construtor for Lexer
    88  
    89  Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 )
    90  {
    91  	return new Lexer(str, filename, lineno, column);
    92  }
    93  
    94  /// Lexer is a forward range of Tokens
    95  
    96  class Lexer
    97  {
    98  	/// Range primitive
    99  	bool empty() /*@property*/
   100  	{
   101  		return current is null;
   102  	}
   103  
   104  	/// Range primitive
   105  	Token front() /*@property*/
   106  	{
   107  		return std.exception.enforce(current, "Lexer has already reached the end");
   108  	}
   109  
   110  	/// Range primitive
   111  	void popFront() /*@property*/
   112  	{
   113  		std.exception.enforce(current, "Lexer has already reached the end");
   114  		current = readNext();
   115  	}
   116  
   117  	/// Range primitive
   118  	Lexer save() /*@property*/
   119  	{
   120  		return new Lexer(buffer, filename, lineno, column, current);
   121  	}
   122  
   123  private: // implementation
   124  
   125  	string buffer;
   126  	string filename;
   127  	int    lineno;
   128  	int    column;
   129  	Token  current;
   130  
   131  	invariant()
   132  	{
   133  		assert( buffer.empty || !std.ctype.isspace(buffer[0]) );
   134  	}
   135  
   136  	this( string buffer, string filename, int lineno, int column, Token current=null )
   137  	{
   138  		this.buffer   = buffer;
   139  		this.filename = filename;
   140  		this.lineno   = lineno;
   141  		this.column   = column;
   142  		skipws();
   143  		this.current  = (current is null ? readNext() : current);
   144  	}
   145  
   146  	void skipws()
   147  	{
   148  		bool progress = false;
   149  		do
   150  		{
   151  			string ws = buffer.munch(" \t");
   152  			column += ws.length;
   153  			progress = !ws.empty;
   154  			while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') )
   155  			{
   156  				progress = true;
   157  				if( buffer[0] == '\n' )
   158  					buffer = buffer[1..$];
   159  				else // if( buffer.front == '\r' )
   160  				{
   161  					buffer = buffer[1..$];
   162  					if( !buffer.empty && buffer[0]=='\n' )
   163  						buffer = buffer[1..$];
   164  				}
   165  				lineno ++;
   166  				column = 1;
   167  			}
   168  		}while( progress );
   169  	}
   170  
   171  	char readChar()
   172  	{
   173  		scope(exit) {
   174  			buffer = buffer[1..$];
   175  			column ++;
   176  		}
   177  		return buffer[0];
   178  	}
   179  
   180  	/// This is the main lexing routine
   181  	Token readNext()
   182  	{
   183  		if( buffer.empty )
   184  			return null;
   185  		scope(exit)
   186  			skipws();
   187  
   188  		if( isSymbol(buffer[0]) )
   189  		{
   190  			if( buffer[0] == '#' )
   191  			{
   192  				// skip comment
   193  				while( !buffer.empty && (buffer[0]!='\n' && buffer[0]!='\r') )
   194  					readChar();
   195  				skipws();
   196  				return readNext();
   197  			}
   198  			else if( buffer[0] == '"' )
   199  			{
   200  				// string literal
   201  				auto pos = currentPosition();
   202  				string lit;
   203  				readChar();
   204  				while( !buffer.empty && buffer[0]!='"' )
   205  				{
   206  					// read one char
   207  					char c = readChar();
   208  					if( c == '\\' )
   209  					{
   210  						if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') )
   211  							lit ~= readChar();
   212  						else
   213  							lit ~= c;
   214  					}
   215  					else if( c == '\n' )
   216  					{
   217  						lit ~= c;
   218  						lineno++;
   219  						column = 1;
   220  					}
   221  					else if( c == '\r' )
   222  					{
   223  						if( !buffer.empty && buffer[0]=='\n' )
   224  							readChar();
   225  						lit ~= '\n';
   226  						lineno++;
   227  						column = 1;
   228  					}
   229  					else
   230  						lit ~= c;
   231  				}
   232  				if( !buffer.empty )
   233  					readChar();
   234  				return new Token(pos, lit, Token.Kind.stringLiteral);
   235  			}
   236  			else
   237  			{
   238  				// normal symbol
   239  				auto pos = currentPosition();
   240  				auto str = ""~readChar();
   241  				return new Token(pos, str, Token.Kind.identifier);
   242  			}
   243  		}
   244  		else
   245  		{
   246  			auto pos = currentPosition();
   247  			int i = 0;
   248  			while( i<buffer.length && !std.ctype.isspace(buffer[i]) && !isSymbol(buffer[i]) )
   249  				++i;
   250  			auto str = buffer[0 .. i];
   251  			buffer   = buffer[i .. $];
   252  			column  += i;
   253  			bool isNumber = find!(`a<'0' || '9'<a`)(str).empty;
   254  			return new Token(pos, str, isNumber ? Token.Kind.number : Token.Kind.identifier);
   255  		}
   256  	}
   257  
   258  	bool isSymbol(char c)
   259  	{
   260  		return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_');
   261  	}
   262  
   263  	immutable(LexPosition) currentPosition()
   264  	{
   265  		return new immutable(LexPosition)(filename, lineno, column);
   266  	}
   267  }
   268  
   269  unittest
   270  {
   271  	assert( std.range.isForwardRange!(Lexer) );
   272  }
   273  
   274  unittest
   275  {
   276  	auto lex = lexerFromString("this	is a \t\n pen :-(   ");
   277  	Token[] ts = std.array.array(lex);
   278  
   279  	assert( ts[0].pos.lineno == 1 );
   280  	assert( ts[0].pos.column == 1 );
   281  	assert( ts[0].kind == Token.Kind.identifier );
   282  	assert( ts[0].str == "this" );
   283  
   284  	assert( ts[1].pos.lineno == 1 );
   285  	assert( ts[1].pos.column == 6 );
   286  	assert( ts[1].kind == Token.Kind.identifier );
   287  	assert( ts[1].str == "is" );
   288  
   289  	assert( ts[2].pos.lineno == 1 );
   290  	assert( ts[2].pos.column == 9 );
   291  	assert( ts[2].kind == Token.Kind.identifier );
   292  	assert( ts[2].str == "a" );
   293  
   294  	assert( ts[3].pos.lineno == 2 );
   295  	assert( ts[3].pos.column == 2 );
   296  	assert( ts[3].kind == Token.Kind.identifier );
   297  	assert( ts[3].str == "pen" );
   298  
   299  	// consecutive symbols are always separated
   300  	// hence, no "++" or "<<" or ...
   301  		
   302  	assert( ts[4].pos.lineno == 2 );
   303  	assert( ts[4].pos.column == 6 );
   304  	assert( ts[4].str == ":" );
   305  
   306  	assert( ts[5].pos.lineno == 2 );
   307  	assert( ts[5].pos.column == 7 );
   308  	assert( ts[5].str == "-" ); 
   309  
   310  	assert( ts[6].pos.lineno == 2 );
   311  	assert( ts[6].pos.column == 8 );
   312  	assert( ts[6].str == "(" );
   313  
   314  	assert( ts.length == 7 );
   315  }
   316  
   317  unittest
   318  {
   319  	auto lex2 = lexerFromString(" a12\n3a 5 ");
   320  	assert( lex2.front.str == "a12" );
   321  	assert( lex2.front.kind == Token.Kind.identifier );
   322  	lex2.popFront;
   323  	auto lex3 = lex2.save;
   324  	assert( lex2.front.str == "3a" );
   325  	assert( lex2.front.kind == Token.Kind.identifier );
   326  	lex2.popFront;
   327  	assert( lex3.front.str == "3a" );
   328  	assert( lex3.front.kind == Token.Kind.identifier );
   329  	assert( lex2.front.str == "5" );
   330  	assert( lex2.front.kind == Token.Kind.number );
   331  	lex2.popFront;
   332  	lex3.popFront;
   333  	assert( lex2.empty );
   334  	assert( !lex3.empty );
   335  	assert( lex3.front.str == "5" );
   336  	assert( lex3.front.kind == Token.Kind.number );
   337  }
   338  
   339  unittest
   340  {
   341  //!! be sure to run the unittest on the root of the source directory
   342  	auto lexf = lexerFromFile("polemy/lex.d");	
   343  	lexf = find!`a.str == "module"`(lexf);
   344  	assert( lexf.front.str == "module", lexf.front.str );
   345  	assert( lexf.front.pos.filename == "polemy/lex.d" );
   346  	assert( lexf.front.pos.lineno == 7 );
   347  	assert( lexf.front.pos.column == 1 );
   348  	lexf.popFront;
   349  	assert( lexf.front.str == "polemy" );
   350  	assert( lexf.front.pos.lineno == 7 );
   351  	assert( lexf.front.pos.column == 8 );
   352  	lexf.popFront;
   353  	assert( lexf.front.str == "." );
   354  	lexf.popFront;
   355  	assert( lexf.front.str == "lex" );
   356  	lexf.popFront;
   357  	assert( lexf.front.str == ";" );
   358  	lexf.popFront;
   359  	assert( lexf.front.str == "import" );
   360  	assert( lexf.front.pos.lineno == 8 );
   361  	assert( lexf.front.pos.column == 1 );
   362  }
   363  
   364  unittest
   365  {
   366  	auto lex = lexerFromString(`my # comment should
   367  # hey!!
   368  be ignored.
   369  hahaha"hihihi""hu\\\"huhu"#123 aa
   370  123 aa "aaa
   371  bbb # 123
   372  eee"
   373  zzz
   374  `);
   375  	Token[] ts = std.array.array(lex);
   376  	assert( ts[0].str == "my" );
   377  	assert( ts[0].pos.lineno == 1 );
   378  	assert( ts[1].str == "be" );
   379  	assert( ts[1].pos.lineno == 3 );
   380  	assert( ts[2].str == "ignored" );
   381  	assert( ts[3].str == "." );
   382  	assert( ts[4].str == "hahaha" );
   383  	assert( ts[4].pos.lineno == 4 );
   384  	assert( ts[4].kind == Token.Kind.identifier );
   385  	assert( ts[5].str == "hihihi" );
   386  	assert( ts[5].pos.lineno == 4 );
   387  	assert( ts[5].kind == Token.Kind.stringLiteral );
   388  	assert( ts[6].str == `hu\"huhu` );
   389  	assert( ts[6].kind == Token.Kind.stringLiteral );
   390  	assert( ts[6].pos.lineno == 4 );
   391  	assert( ts[7].str == "123" );
   392  	assert( ts[7].pos.lineno == 5 );
   393  	assert( ts[7].kind == Token.Kind.number );
   394  	assert( ts[8].str == "aa" );
   395  	assert( ts[9].pos.lineno == 5 );
   396  	assert( ts[9].str == "aaa\nbbb # 123\neee" );
   397  	assert( ts[9].kind == Token.Kind.stringLiteral );
   398  	assert( ts[10].pos.lineno == 8 );
   399  	assert( ts.length == 11 );
   400  }