Artifact Content
Not logged in

Artifact 0972f7a454ea8e4fd61a230f705e85f4c589362a


     1  /**
     2   * Authors: k.inaba
     3   * License: NYSL 0.9982 http://www.kmonos.net/nysl/
     4   *
     5   * Lexer for Polemy programming language.
     6   */
     7  module polemy.lex;
     8  import polemy._common;
     9  
    10  import std.file : readText;
    11  import std.string : munch;
    12  import std.ctype;
    13  
    14  /// Represents a position (file name, line, column) in a source code
    15  
    16  class LexPosition
    17  {
    18  	immutable string filename; /// name of the source file
    19  	immutable int    lineno;   /// line number, 1, 2, ...
    20  	immutable int    column;   /// column, 1, 2, ...
    21  	/// Formats the position as "filename:lineno:column"
    22  	override string toString() const
    23  		{ return sprintf!"%s:%d:%d"(filename, lineno, column); }
    24  
    25  	mixin SimpleConstructor; // not defined in this file; presumably generates the (filename, lineno, column) constructor used by the unittest - TODO confirm
    26  	mixin SimpleCompare;     // not defined in this file; presumably supplies the comparisons exercised by assert_lt/assert_ne - TODO confirm
    27  }
    28  
    29  unittest // LexPosition: construction, field access, formatting, ordering, immutability
    30  {
    31  	auto p = new LexPosition("hello.cpp", 123, 45);
    32  	auto q = new LexPosition("hello.cpp", 123, 46);
    33  	// p and q differ only in the column
    34  	assert_eq( p.filename, "hello.cpp" );
    35  	assert_eq( p.lineno, 123 );
    36  	assert_eq( p.column, 45 );
    37  	assert_eq( to!string(p), "hello.cpp:123:45" );
    38  	assert_lt( p, q );
    39  	assert_ne( p, q );
    40  	// default construction and assignment to the immutable fields must not compile
    41  	assert( !__traits(compiles, new LexPosition) );
    42  	assert( !__traits(compiles, p.filename="foo") );
    43  	assert( !__traits(compiles, p.lineno  =789) );
    44  	assert( !__traits(compiles, p.column  =222) );
    45  }
    46  
    47  /// Represents a lexer token: its text, where it occurred, and whether it was quoted
    48  
    49  class Token
    50  {
    51  	immutable LexPosition pos;    /// Position where the token occurred in the source
    52  	immutable string      str;    /// The token string itself
    53  	immutable bool        quoted; /// Was it a "quoted" token or unquoted?
    54  
    55  	mixin SimpleConstructor; // not defined in this file; presumably generates the (pos, str, quoted) constructor used by the unittest - TODO confirm
    56  	mixin SimpleCompare;     // not defined in this file; presumably supplies the comparisons exercised by assert_eq/assert_lt/assert_ne - TODO confirm
    57  }
    58  
    59  unittest // Token: construction, field access, comparison, immutability
    60  {
    61  	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
    62  	auto t = new Token(p, "class", false);
    63  	auto u = new Token(p, "class", true);
    64  	// t and u differ only in the quoted flag
    65  	assert_eq( t.pos, p );
    66  	assert_eq( t.str, "class" );
    67  	assert( !t.quoted );
    68  	assert_eq( t, new Token(p, "class", false) );
    69  	assert_lt( t, new Token(p, "struct", false) );
    70  	assert_ne( t, u );
    71  	assert( u.quoted );
    72  	// default construction and assignment to the immutable fields must not compile
    73  	assert( !__traits(compiles, new Token) );
    74  	assert( !__traits(compiles, t.pos=p) );
    75  	assert( !__traits(compiles, t.str=789) );
    76  	assert( !__traits(compiles, t.quoted=true) );
    77  }
    78  
    79  /// Named Constructor for Lexer: reads the whole file as text and lexes it.
    80  /// `rest` is forwarded to lexerFromString after the filename (its optional lineno and column).
    81  Lexer lexerFromFile(T...)( string filename, T rest )
    82  {
    83  	return lexerFromString( std.file.readText(filename), filename, rest );
    84  }
    85  	
    86  /// Named Constructor for Lexer: lexes `str`; filename, lineno and column
    87  /// give the position reported for the first character (defaults: "<unnamed>", 1, 1).
    88  Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 )
    89  {
    90  	return new Lexer(str, filename, lineno, column);
    91  }
    92  
    93  /// Lexer is a forward range of Tokens
    94  
    95  class Lexer
    96  {
    97  	/// Range primitive: true once every token has been consumed
    98  	bool empty() /*@property*/
    99  	{
   100  		return current is null;
   101  	}
   102  
   103  	/// Range primitive: the current token; enforce() throws if the end was reached
   104  	Token front() /*@property*/
   105  	{
   106  		return std.exception.enforce(current, "Lexer has already reached the end");
   107  	}
   108  
   109  	/// Range primitive: advances to the next token; enforce() throws if already at the end
   110  	void popFront() /*@property*/
   111  	{
   112  		std.exception.enforce(current, "Lexer has already reached the end");
   113  		current = readNext();
   114  	}
   115  
   116  	/// Range primitive: a new Lexer built from all fields (buffer, filename, lineno, column, current)
   117  	Lexer save() /*@property*/
   118  	{
   119  		return new Lexer(this.tupleof);
   120  	}
   121  
   122  private: // implementation
   123  
   124  	string buffer;   // input not yet consumed
   125  	string filename; // reported in every token position
   126  	int    lineno;   // line of buffer[0]
   127  	int    column;   // column of buffer[0]
   128  	Token  current;  // token returned by front(); null means end of input
   129  
   130  	invariant() // the unread input never begins with a whitespace character
   131  	{
   132  		assert( buffer.empty || !std.ctype.isspace(buffer[0]) );
   133  	}
   134  	/// Skips leading whitespace, then reads the first token unless `current` is supplied (as save() does)
   135  	this( string buffer, string filename, int lineno, int column, Token current=null )
   136  	{
   137  		this.buffer   = buffer;
   138  		this.filename = filename;
   139  		this.lineno   = lineno;
   140  		this.column   = column;
   141  		skipws();
   142  		this.current  = (current is null ? readNext() : current);
   143  	}
   144  	/// Consumes blanks and line breaks (LF, CR, CRLF) at the head of the buffer, updating lineno/column
   145  	void skipws()
   146  	{
   147  		bool progress = false;
   148  		do
   149  		{
   150  			string ws = buffer.munch(" \t\v\f"); // also \v and \f: isspace() accepts them, so leaving them would break the invariant and make readNext() yield empty tokens forever
   151  			column += ws.length;
   152  			progress = !ws.empty;
   153  			while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') )
   154  			{
   155  				progress = true;
   156  				if( buffer[0] == '\n' )
   157  					buffer = buffer[1..$];
   158  				else // if( buffer.front == '\r' )
   159  				{
   160  					buffer = buffer[1..$];
   161  					if( !buffer.empty && buffer[0]=='\n' ) // CRLF counts as one line break
   162  						buffer = buffer[1..$];
   163  				}
   164  				lineno ++;
   165  				column = 1;
   166  			}
   167  		}while( progress );
   168  	}
   169  	/// Returns buffer[0] and consumes it, advancing column by one; the buffer must not be empty
   170  	char readChar()
   171  	{
   172  		scope(exit) {
   173  			buffer = buffer[1..$];
   174  			column ++;
   175  		}
   176  		return buffer[0];
   177  	}
   178  
   179  	/// This is the main lexing routine; returns null at the end of input
   180  	Token readNext()
   181  	{
   182  		if( buffer.empty )
   183  			return null;
   184  		scope(exit) // leave the buffer positioned at the next token (or empty)
   185  			skipws();
   186  
   187  		if( isSymbol(buffer[0]) )
   188  		{
   189  			if( buffer[0] == '#' )
   190  			{
   191  				// skip comment: '#' up to (not including) the line break
   192  				while( !buffer.empty && (buffer[0]!='\n' && buffer[0]!='\r') )
   193  					readChar();
   194  				skipws();
   195  				return readNext();
   196  			}
   197  			else if( buffer[0] == '"' )
   198  			{
   199  				// string literal; an unterminated literal silently ends at the end of input
   200  				auto pos = currentPosition();
   201  				string lit;
   202  				readChar();
   203  				while( !buffer.empty && buffer[0]!='"' )
   204  				{
   205  					// read one char
   206  					char c = readChar();
   207  					if( c == '\\' )
   208  					{
   209  						if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') ) // \\ and \" are the only escapes
   210  							lit ~= readChar();
   211  						else // any other backslash is kept literally
   212  							lit ~= c;
   213  					}
   214  					else if( c == '\n' )
   215  					{
   216  						lit ~= c;
   217  						lineno++;
   218  						column = 1;
   219  					}
   220  					else if( c == '\r' ) // CR or CRLF inside a literal is stored as a single '\n'
   221  					{
   222  						if( !buffer.empty && buffer[0]=='\n' )
   223  							readChar();
   224  						lit ~= '\n';
   225  						lineno++;
   226  						column = 1;
   227  					}
   228  					else
   229  						lit ~= c;
   230  				}
   231  				if( !buffer.empty ) // consume the closing quote
   232  					readChar();
   233  				return new Token(pos, lit, true);
   234  			}
   235  			else
   236  			{
   237  				// normal symbol: always a single-character token
   238  				auto pos = currentPosition();
   239  				auto str = ""~readChar();
   240  				return new Token(pos, str, false);
   241  			}
   242  		}
   243  		else // word: a maximal run of non-space, non-symbol characters
   244  		{
   245  			auto pos = currentPosition();
   246  			int i = 0;
   247  			while( i<buffer.length && !std.ctype.isspace(buffer[i]) && !isSymbol(buffer[i]) )
   248  				++i;
   249  			auto str = buffer[0 .. i];
   250  			buffer   = buffer[i .. $];
   251  			column  += i;
   252  			return new Token(pos, str, false);
   253  		}
   254  	}
   255  	/// True for a char in 0x21..0x7f that is neither alphanumeric nor '_'
   256  	bool isSymbol(char c)
   257  	{
   258  		return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_');
   259  	}
   260  	/// Snapshot of the current (filename, lineno, column)
   261  	immutable(LexPosition) currentPosition()
   262  	{
   263  		return new immutable(LexPosition)(filename, lineno, column);
   264  	}
   265  }
   266  
   267  unittest // Lexer must satisfy the forward-range interface
   268  {
   269  	assert( std.range.isForwardRange!(Lexer) );
   270  }
   271  
   272  unittest // token text and positions across blanks, tabs and a CRLF line break
   273  {
   274  	auto lex = lexerFromString("this	is a \t\r\n pen :-(   ");
   275  	Token[] ts = std.array.array(lex);
   276  
   277  	assert_eq( ts[0].pos.lineno, 1 );
   278  	assert_eq( ts[0].pos.column, 1 );
   279  	assert(   !ts[0].quoted );
   280  	assert_eq( ts[0].str, "this" );
   281  
   282  	assert_eq( ts[1].pos.lineno, 1 );
   283  	assert_eq( ts[1].pos.column, 6 ); // a tab advances the column by one
   284  	assert(   !ts[1].quoted );
   285  	assert_eq( ts[1].str, "is" );
   286  
   287  	assert_eq( ts[2].pos.lineno, 1 );
   288  	assert_eq( ts[2].pos.column, 9 );
   289  	assert(   !ts[2].quoted );
   290  	assert_eq( ts[2].str, "a" );
   291  
   292  	assert_eq( ts[3].pos.lineno, 2 ); // "\r\n" counts as a single line break
   293  	assert_eq( ts[3].pos.column, 2 );
   294  	assert(   !ts[3].quoted );
   295  	assert_eq( ts[3].str, "pen" );
   296  
   297  	// consecutive symbols are always separated
   298  	// hence, no "++" or "<<" or ...
   299  	
   300  	assert_eq( ts[4].pos.lineno, 2 );
   301  	assert_eq( ts[4].pos.column, 6 );
   302  	assert_eq( ts[4].str, ":" );
   303  
   304  	assert_eq( ts[5].pos.lineno, 2 );
   305  	assert_eq( ts[5].pos.column, 7 );
   306  	assert_eq( ts[5].str, "-" ); 
   307  
   308  	assert_eq( ts[6].pos.lineno, 2 );
   309  	assert_eq( ts[6].pos.column, 8 );
   310  	assert_eq( ts[6].str, "(" );
   311  
   312  	assert_eq( ts.length, 7 ); // trailing blanks produce no token
   313  }
   314  
   315  unittest // save() yields an independent copy: advancing lex2 does not advance lex3
   316  {
   317  	auto lex2 = lexerFromString(" a12\n3a 5 ");
   318  	assert_eq( lex2.front.str, "a12" );
   319  	lex2.popFront;
   320  	auto lex3 = lex2.save; // both lexers are now at "3a"
   321  	assert_eq( lex2.front.str, "3a" );
   322  	lex2.popFront;
   323  	assert_eq( lex3.front.str, "3a" );
   324  	assert_eq( lex2.front.str, "5" );
   325  	lex2.popFront;
   326  	lex3.popFront;
   327  	assert( lex2.empty );
   328  	assert( !lex3.empty ); // lex3 is one token behind lex2
   329  	assert_eq( lex3.front.str, "5" );
   330  }
   331  
   332  unittest // lexes this very source file and checks the tokens of its module declaration
   333  {
   334  //!! be sure to run the unittest on the root of the source directory
   335  	auto lexf = lexerFromFile("polemy/lex.d");	
   336  	lexf = find!`a.str == "module"`(lexf); // skip the tokens of the header comment
   337  	assert_eq( lexf.front.str, "module" );
   338  	assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
   339  	assert_eq( lexf.front.pos.lineno, 7 );
   340  	assert_eq( lexf.front.pos.column, 1 );
   341  	lexf.popFront;
   342  	assert_eq( lexf.front.str, "polemy" );
   343  	assert_eq( lexf.front.pos.lineno, 7 );
   344  	assert_eq( lexf.front.pos.column, 8 );
   345  	lexf.popFront;
   346  	assert_eq( lexf.front.str, "." );
   347  	lexf.popFront;
   348  	assert_eq( lexf.front.str, "lex" );
   349  	lexf.popFront;
   350  	assert_eq( lexf.front.str, ";" );
   351  	lexf.popFront;
   352  	assert_eq( lexf.front.str, "import" );
   353  	assert_eq( lexf.front.pos.lineno, 8 );
   354  	assert_eq( lexf.front.pos.column, 1 );
   355  }
   356  
   357  unittest // '#' comments are skipped; quoted literals keep '#', unescape \\ and \", and normalize CRLF
   358  {
   359  	auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
   360  be ignored.
   361  hahaha"hihihi""hu\\\"huhu"#123 aa
   362  123 aa "aaa`~"\r\n"~`bbb # 123`~"\r\n"~`eee"
   363  zzz
   364  `);
   365  	Token[] ts = std.array.array(lex);
   366  	assert_eq( ts[0].str, "my" );
   367  	assert_eq( ts[0].pos.lineno, 1 );
   368  	assert(   !ts[0].quoted );
   369  	assert_eq( ts[1].str, "be" );
   370  	assert_eq( ts[1].pos.lineno, 3 ); // lines 1 and 2 end in comments
   371  	assert(   !ts[1].quoted );
   372  	assert_eq( ts[2].str, "ignored" );
   373  	assert(   !ts[2].quoted );
   374  	assert_eq( ts[3].str, "." );
   375  	assert(   !ts[3].quoted );
   376  	assert_eq( ts[4].str, "hahaha" );
   377  	assert_eq( ts[4].pos.lineno, 4 );
   378  	assert(   !ts[4].quoted );
   379  	assert_eq( ts[5].str, "hihihi" );
   380  	assert_eq( ts[5].pos.lineno, 4 );
   381  	assert(    ts[5].quoted );
   382  	assert_eq( ts[6].str, `hu\"huhu` ); // the escapes \\ and \" were unescaped
   383  	assert_eq( ts[6].pos.lineno, 4 );
   384  	assert(    ts[6].quoted );
   385  	assert_eq( ts[7].str, "123" );
   386  	assert_eq( ts[7].pos.lineno, 5 );
   387  	assert_eq( ts[8].str, "aa" );
   388  	assert_eq( ts[9].pos.lineno, 5 ); // a literal is positioned at its opening quote
   389  	assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
   390  	assert(    ts[9].quoted );
   391  	assert_eq( ts[10].pos.lineno, 8 ); // the trailing zzz token, after the three-line literal
   392  	assert(   !ts[10].quoted );
   393  	assert_eq( ts.length, 11 );
   394  }