Artifact Content
Not logged in

Artifact 4e484112cd314d8d6c0822b9d350d3142dedbc23


     1  /**
     2   * Authors: k.inaba
     3   * License: NYSL 0.9982 http://www.kmonos.net/nysl/
     4   *
     5   * Lexer for Polemy programming language.
     6   */
     7  module polemy.lex;
     8  import polemy._common;
     9  import std.file  : readText;
    10  import std.ctype : isspace, isalnum;
    11  
    12  /*mixin*/
    13  template ExceptionWithPosition()
    14  {
    15  	const LexPosition pos;
    16  	this( const LexPosition pos, string msg, string file=null, size_t line=0, Throwable next=null )
    17  		{ super(sprintf!"[%s] %s"(pos, msg), file, line, next); this.pos = pos; }
    18  }
    19  
    20  ///
    21  class UnexpectedEOF : Exception
    22  {
    23  	mixin ExceptionWithPosition;
    24  }
    25  
    26  ///
    27  class LexException : Exception
    28  {
    29  	mixin ExceptionWithPosition;
    30  };
    31  
    32  /// Represents a position in a source code
    33  
    34  class LexPosition
    35  {
    36  	immutable string filename; /// name of the source file
    37  	immutable int    lineno;   /// 1-origin
    38  	immutable int    column;   /// 1-origin
    39  
    40  	mixin SimpleClass;
    41  	override string toString() const
    42  		{ return sprintf!"%s:%d:%d"(filename, lineno, column); }
    43  
    44  	static immutable LexPosition dummy;
    45  	static this(){ dummy = new immutable(LexPosition)("<unnamed>",0,0); }
    46  }
    47  
    48  unittest
    49  {
    50  	auto p = new LexPosition("hello.cpp", 123, 45);
    51  
    52  	assert_eq( p.filename, "hello.cpp" );
    53  	assert_eq( p.lineno, 123 );
    54  	assert_eq( p.column, 45 );
    55  	assert_eq( to!string(p), "hello.cpp:123:45" );
    56  
    57  	assert( !__traits(compiles, new LexPosition) );
    58  	assert( !__traits(compiles, p.filename="foo") );
    59  	assert( !__traits(compiles, p.lineno  =789) );
    60  	assert( !__traits(compiles, p.column  =222) );
    61  
    62  	auto q = new LexPosition("hello.cpp", 123, 46);
    63  	assert_lt( p, q );
    64  	assert_ne( p, q );
    65  }
    66  
    67  /// Represents a lexer token
    68  
    69  class Token
    70  {
    71  	immutable LexPosition pos;    /// Position where the token occurred in the source
    72  	immutable string      str;    /// The token string itself
    73  	immutable bool        quoted; /// Was it a "quoted" token or unquoted?
    74  
    75  	mixin SimpleClass;
    76  }
    77  
    78  unittest
    79  {
    80  	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
    81  	auto t = new Token(p, "class", false);
    82  	auto u = new Token(p, "class", true);
    83  
    84  	assert_eq( t.pos, p );
    85  	assert_eq( t.str, "class" );
    86  	assert( !t.quoted );
    87  	assert_eq( t, new Token(p, "class", false) );
    88  	assert_lt( t, new Token(p, "struct", false) );
    89  	assert_ne( t, u );
    90  	assert( u.quoted );
    91  
    92  	assert( !__traits(compiles, new Token) );
    93  	assert( !__traits(compiles, t.pos=p) );
    94  	assert( !__traits(compiles, t.str=789) );
    95  	assert( !__traits(compiles, t.quoted=true) );
    96  }
    97  
    98  /// Named Construtors for Lexer
    99  
   100  Lexer lexerFromFile(T...)( string filename, T rest )
   101  {
   102  	return lexerFromString( std.file.readText(filename), filename, rest );
   103  }
   104  	
   105  /// Named Construtors for Lexer
   106  
   107  LexerT!(PositionedReader!CharSeq) /* ddoc doesn't recognize auto return... bugzilla:2581 */
   108  lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
   109  {
   110   	return new LexerT!(PositionedReader!CharSeq)(
   111  		PositionedReader!CharSeq(str, filename, lineno, column)
   112  	);
   113  }
   114  
/// Standard Lexer Type: a LexerT reading from a PositionedReader over a string
/// (all you have to know is that this is a forward range of Tokens)

alias LexerT!(PositionedReader!string) Lexer;
   118  
   119  /// Lexer Implementation
   120  
   121  class LexerT(Reader)
   122  	if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
   123  {
   124  	/// Range primitive
   125  	bool empty() /*@property*/
   126  	{
   127  		return current is null;
   128  	}
   129  
   130  	/// Range primitive
   131  	Token front() /*@property*/
   132  	{
   133  		return std.exception.enforce(current, "Lexer has already reached the end");
   134  	}
   135  
   136  	/// Range primitive
   137  	void popFront() /*@property*/
   138  	{
   139  		std.exception.enforce(current, "Lexer has already reached the end");
   140  		current = readNext();
   141  	}
   142  
   143  	/// Range primitive
   144  	typeof(this) save() /*@property*/
   145  	{
   146  		return new typeof(this)(reader.save, current);
   147  	}
   148  
   149  private: // implementation
   150  
   151  	Reader reader;
   152  	Token  current;
   153  
   154  	invariant()
   155  	{
   156  		assert( reader.empty || !std.ctype.isspace(reader.front) );
   157  	}
   158  
   159  	this( Reader reader, Token current = null )
   160  	{
   161  		this.reader = reader;
   162  		readWhile!isSpace();
   163  		this.current = (current is null ? readNext() : current);
   164  	}
   165  
   166  	public static {
   167  		bool isSpace   (dchar c) { return std.ctype.isspace(c)!=0; }
   168  		bool isSymbol  (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
   169  		bool isSSymbol (dchar c) { return "()[]{};@".canFind(c); }
   170  		bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
   171  		bool isLetter  (dchar c) { return !isSpace(c) && !isSymbol(c); }
   172  	}
   173  
   174  	string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}
   175  	string readQuoted(const LexPosition pos, ref char[] buf)
   176  	{
   177  		if( reader.empty )
   178  			throw genex!UnexpectedEOF(pos, "Quoted string not terminated");
   179  		dchar c = reader.front;
   180  		reader.popFront;
   181  		if( c == '"' )
   182  			return assumeUnique(buf);
   183  		if( c == '\\' && !reader.empty ) {
   184  			if( reader.front=='"' ) {
   185  				reader.popFront;
   186  				return readQuoted(pos,buf ~= '\"');
   187  			}
   188  			if( reader.front=='\\' ) {
   189  				reader.popFront;
   190  				return readQuoted(pos,buf ~= '\\');
   191  			}
   192  		}
   193  		return readQuoted(pos,buf ~= c);
   194  	}
   195  
   196  	string readWhile(alias fn)()
   197  	{
   198  		char[] buf;
   199  		for(; !reader.empty && fn(reader.front); reader.popFront)
   200  			buf ~= reader.front;
   201  		return assumeUnique(buf);
   202  	}
   203  
   204  	Token readNext()
   205  	{
   206  		if( reader.empty )
   207  			return null;
   208  		scope(success)
   209  			readWhile!isSpace();
   210  		if( reader.front == '#' ) // comment
   211  		{
   212  			reader = find(reader, '\n');
   213  			readWhile!isSpace();
   214  			return readNext();
   215  		}
   216  		else if( reader.front == '"' ) // quoted
   217  		{
   218  			auto pos = reader.currentPosition();
   219  			reader.popFront;
   220  			return new Token(pos, readQuoted(pos), true);
   221  		}
   222  		else if( isSSymbol(reader.front) ) // paren
   223  		{
   224  			auto pos = reader.currentPosition();
   225  			string s; s~=reader.front; reader.popFront;
   226  			return new Token(pos, s, false);
   227  		}
   228  		else if( isMSymbol(reader.front) ) // symbol
   229  		{
   230  			auto pos = reader.currentPosition();
   231  			return new Token(pos, readWhile!isMSymbol(), false);
   232  		}
   233  		else
   234  		{
   235  			auto pos = reader.currentPosition();
   236  			return new Token(pos, readWhile!isLetter(), false);
   237  		}
   238  	}
   239  }
   240  
   241  unittest
   242  {
   243  	assert( std.range.isForwardRange!(Lexer) );
   244  	assert( is(ElementType!(Lexer) == Token) );
   245  }
   246  
   247  unittest
   248  {
   249  	auto lex = lexerFromString("this	is a \t\r\n pen :-( @@;  ");
   250  	Token[] ts = std.array.array(lex);
   251  
   252  	assert_eq( ts[0].pos.lineno, 1 );
   253  	assert_eq( ts[0].pos.column, 1 );
   254  	assert(   !ts[0].quoted );
   255  	assert_eq( ts[0].str, "this" );
   256  
   257  	assert_eq( ts[1].pos.lineno, 1 );
   258  	assert_eq( ts[1].pos.column, 6 );
   259  	assert(   !ts[1].quoted );
   260  	assert_eq( ts[1].str, "is" );
   261  
   262  	assert_eq( ts[2].pos.lineno, 1 );
   263  	assert_eq( ts[2].pos.column, 9 );
   264  	assert(   !ts[2].quoted );
   265  	assert_eq( ts[2].str, "a" );
   266  
   267  	assert_eq( ts[3].pos.lineno, 2 );
   268  	assert_eq( ts[3].pos.column, 2 );
   269  	assert(   !ts[3].quoted );
   270  	assert_eq( ts[3].str, "pen" );
   271  
   272  	assert_eq( ts[4].pos.lineno, 2 );
   273  	assert_eq( ts[4].pos.column, 6 );
   274  	assert_eq( ts[4].str, ":-" );
   275  
   276  	assert_eq( ts[5].pos.lineno, 2 );
   277  	assert_eq( ts[5].pos.column, 8 );
   278  	assert_eq( ts[5].str, "(" );
   279  	assert_eq( ts[6].str, "@" );
   280  	assert_eq( ts[7].str, "@" );
   281  	assert_eq( ts[8].str, ";" ); // paren and simicolons, atmarks are split
   282  
   283  	assert_eq( ts.length, 9 );
   284  }
   285  
   286  unittest
   287  {
   288  	// !! be sure to run the unittest on the root of the source directory
   289  	auto lexf = lexerFromFile("polemy/lex.d");	
   290  	lexf = find!`a.str == "module"`(lexf);
   291  	assert_eq( lexf.front.str, "module" );
   292  	assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
   293  	assert_eq( lexf.front.pos.lineno, 7 );
   294  	assert_eq( lexf.front.pos.column, 1 );
   295  	lexf.popFront;
   296  	assert_eq( lexf.front.str, "polemy" );
   297  	assert_eq( lexf.front.pos.lineno, 7 );
   298  	assert_eq( lexf.front.pos.column, 8 );
   299  	lexf.popFront;
   300  	lexf.popFront;
   301  	lexf.popFront;
   302  	lexf.popFront;
   303  	assert_eq( lexf.front.str, "import" );
   304  	assert_eq( lexf.front.pos.lineno, 8 );
   305  	assert_eq( lexf.front.pos.column, 1 );
   306  }
   307  
   308  unittest
   309  {
   310  	assert_throw!UnexpectedEOF( lexerFromString(`"`) );
   311  }
   312  
   313  unittest
   314  {
   315  	auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
   316  be ignored.
   317  hahaha"hihihi""hu\\\"huhu"#123 aa
   318  123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
   319  zzz
   320  `);
   321  	Token[] ts = std.array.array(lex);
   322  	assert_eq( ts[0].str, "my" );
   323  	assert_eq( ts[0].pos.lineno, 1 );
   324  	assert(   !ts[0].quoted );
   325  	assert_eq( ts[1].str, "be" );
   326  	assert_eq( ts[1].pos.lineno, 3 );
   327  	assert(   !ts[1].quoted );
   328  	assert_eq( ts[2].str, "ignored" );
   329  	assert(   !ts[2].quoted );
   330  	assert_eq( ts[3].str, "." );
   331  	assert(   !ts[3].quoted );
   332  	assert_eq( ts[4].str, "hahaha" );
   333  	assert_eq( ts[4].pos.lineno, 4 );
   334  	assert(   !ts[4].quoted );
   335  	assert_eq( ts[5].str, "hihihi" );
   336  	assert_eq( ts[5].pos.lineno, 4 );
   337  	assert(    ts[5].quoted );
   338  	assert_eq( ts[6].str, `hu\"huhu` );
   339  	assert_eq( ts[6].pos.lineno, 4 );
   340  	assert(    ts[6].quoted );
   341  	assert_eq( ts[7].str, "123" );
   342  	assert_eq( ts[7].pos.lineno, 5 );
   343  	assert_eq( ts[8].str, "aa" );
   344  	assert_eq( ts[9].pos.lineno, 5 );
   345  	assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
   346  	assert(    ts[9].quoted );
   347  	assert_eq( ts[10].pos.lineno, 8 );
   348  	assert(   !ts[10].quoted );
   349  	assert_eq( ts.length, 11 );
   350  }
   351  
   352  unittest
   353  {
   354  	auto lex2 = lexerFromString(" a12\n3a 5 ");
   355  	assert_eq( lex2.front.str, "a12" );
   356  	lex2.popFront;
   357  	auto lex3 = lex2.save;
   358  	assert_eq( lex2.front.str, "3a" );
   359  	lex2.popFront;
   360  	assert_eq( lex3.front.str, "3a" );
   361  	assert_eq( lex2.front.str, "5" );
   362  	lex2.popFront;
   363  	lex3.popFront;
   364  	assert( lex2.empty );
   365  	assert( !lex3.empty );
   366  	assert_eq( lex3.front.str, "5" );
   367  }
   368  
   369  unittest
   370  {
   371  	auto lex = lexerFromString(`=""`);
   372  	assert_eq(lex.front.str, "="); lex.popFront;
   373  	assert_eq(lex.front.str, ""); lex.popFront;
   374  	assert( lex.empty );
   375  	assert_eq( lexerFromString(`-@`).front.str, "-" );
   376  }
   377  
   378  /// Forward range for reader character by character,
   379  /// keeping track of position information and caring \r\n -> \n conversion.
   380  
   381  private
   382  struct PositionedReader(CharSeq)
   383  	if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
   384  {
   385  	CharSeq buffer;
   386  	string  filename;
   387  	int     lineno;
   388  	int     column;
   389  
   390  	/// Range primitive
   391  	bool empty() /*@property*/
   392  	{
   393  		return buffer.empty;
   394  	}
   395  
   396  	/// Range primitive
   397  	dchar front() /*@property*/
   398  	{
   399  		dchar c = buffer.front;
   400  		return (c=='\r' ? '\n' : c);
   401  	}
   402  
   403  	/// Range primitive
   404  	void popFront() /*@property*/
   405  	{
   406  		dchar c = buffer.front;
   407  		buffer.popFront;
   408  		if( c=='\r' )
   409  		{
   410  			if( !buffer.empty && buffer.front=='\n' )
   411  				buffer.popFront;
   412  			c = '\n';
   413  		}
   414  		if( c=='\n' )
   415  		{
   416  			lineno ++;
   417  			column = 1;
   418  		}
   419  		else
   420  			column ++;
   421  	}
   422  
   423  	/// Range primitive
   424  	typeof(this) save() /*@property*/
   425  	{
   426  		return this;
   427  	}
   428  
   429  	/// Get the current position
   430  	immutable(LexPosition) currentPosition() const
   431  	{
   432  		return new immutable(LexPosition)(filename, lineno, column);
   433  	}
   434  }
   435  
   436  unittest
   437  {
   438  	assert( isForwardRange!(PositionedReader!string) );
   439  	assert( is(ElementType!(PositionedReader!string) == dchar) );
   440  }