Artifact Content
Not logged in

Artifact 514757ca55e253b4facc0d360b4722a0261908a1


     1  /**
     2   * Authors: k.inaba
     3   * License: NYSL 0.9982 http://www.kmonos.net/nysl/
     4   *
     5   * Lexer for Polemy programming language.
     6   */
     7  module polemy.lex;
     8  import polemy._common;
     9  import std.file  : readText;
    10  import std.ctype : isspace, isalnum;
    11  
    12  /*mixin*/
    13  template ExceptionWithPosition()
    14  {
    15  	const LexPosition pos;
    16  	this( const LexPosition pos, string msg, string file=null, size_t line=0, Throwable next=null )
    17  	{
    18  		if(pos is null)
    19  			super(sprintf!"[??] %s"(msg), file, line, next);
    20  		else
    21  			super(sprintf!"[%s] %s"(pos, msg), file, line, next);
    22  		this.pos = pos;
    23  	}
    24  }
    25  
    26  /// Thrown when encountered an EOF in the middle of a lexical token
    27  
    28  class UnexpectedEOF : Exception
    29  {
    30  	mixin ExceptionWithPosition;
    31  }
    32  
    33  /// Thrown when encountered a lexical error
    34  
    35  class LexException : Exception
    36  {
    37  	mixin ExceptionWithPosition;
    38  };
    39  
    40  /// Represents a position in source codes
    41  
    42  class LexPosition
    43  {
    44  	immutable string filename; /// name of the source file
    45  	immutable int    lineno;   /// 1-origin
    46  	immutable int    column;   /// 1-origin
    47  
    48  	mixin SimpleClass;
    49  	override string toString() const
    50  		{ return sprintf!"%s:%d:%d"(filename, lineno, column); }
    51  
    52  	static immutable LexPosition dummy;
    53  	static this(){ dummy = new immutable(LexPosition)("<unnamed>",0,0); }
    54  }
    55  
    56  unittest
    57  {
    58  	auto p = new LexPosition("hello.cpp", 123, 45);
    59  
    60  	assert_eq( p.filename, "hello.cpp" );
    61  	assert_eq( p.lineno, 123 );
    62  	assert_eq( p.column, 45 );
    63  	assert_eq( text(p), "hello.cpp:123:45" );
    64  
    65  	assert( !__traits(compiles, new LexPosition) );
    66  	assert( !__traits(compiles, p.filename="foo") );
    67  	assert( !__traits(compiles, p.lineno  =789) );
    68  	assert( !__traits(compiles, p.column  =222) );
    69  
    70  	auto q = new LexPosition("hello.cpp", 123, 46);
    71  	assert_lt( p, q );
    72  	assert_ne( p, q );
    73  }
    74  
    75  /// Represents a lexer token
    76  
    77  class Token
    78  {
    79  	immutable LexPosition pos;    /// Position where the token occurred in the source
    80  	immutable string      str;    /// The token string itself
    81  	immutable bool        quoted; /// Was it a "quoted" token or unquoted?
    82  
    83  	mixin SimpleClass;
    84  }
    85  
    86  unittest
    87  {
    88  	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
    89  	auto t = new Token(p, "class", false);
    90  	auto u = new Token(p, "class", true);
    91  
    92  	assert_eq( t.pos, p );
    93  	assert_eq( t.str, "class" );
    94  	assert( !t.quoted );
    95  	assert_eq( t, new Token(p, "class", false) );
    96  	assert_lt( t, new Token(p, "struct", false) );
    97  	assert_ne( t, u );
    98  	assert( u.quoted );
    99  
   100  	assert( !__traits(compiles, new Token) );
   101  	assert( !__traits(compiles, t.pos=p) );
   102  	assert( !__traits(compiles, t.str=789) );
   103  	assert( !__traits(compiles, t.quoted=true) );
   104  }
   105  
   106  /// Named Construtors for Lexer
   107  
   108  Lexer lexerFromFile(T...)( string filename, T ln_cn )
   109  {
   110  	return lexerFromString( std.file.readText(filename), filename, ln_cn );
   111  }
   112  	
   113  /// Named Construtor for Lexer
   114  
   115  LexerT!(PositionedReader!CharSeq) /* ddoc doesn't recognize auto return... bugzilla:2581 */
   116  lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
   117  {
   118   	return new LexerT!(PositionedReader!CharSeq)(
   119  		PositionedReader!CharSeq(str, filename, lineno, column)
   120  	);
   121  }
   122  
   123  /// Standard Lexer Type (all you have to know is that this is a forward range of Tokens!)
   124  
   125  alias LexerT!(PositionedReader!string) Lexer;
   126  
   127  /// Lexer Implementation
   128  
   129  class LexerT(Reader)
   130  	if( isForwardRange!(Reader) && is(ElementType!(Reader)==dchar) )
   131  {
   132  	/// Range primitive
   133  	bool empty() /*@property*/
   134  	{
   135  		return current is null;
   136  	}
   137  
   138  	/// Range primitive
   139  	Token front() /*@property*/
   140  	{
   141  		return std.exception.enforce(current, "Lexer has already reached the end");
   142  	}
   143  
   144  	/// Range primitive
   145  	void popFront() /*@property*/
   146  	{
   147  		std.exception.enforce(current, "Lexer has already reached the end");
   148  		current = readNext();
   149  	}
   150  
   151  	/// Range primitive
   152  	typeof(this) save() /*@property*/
   153  	{
   154  		return new typeof(this)(reader.save, current);
   155  	}
   156  
   157  private: // implementation
   158  
   159  	Reader reader;
   160  	Token  current;
   161  
   162  	invariant()
   163  	{
   164  		assert( reader.empty || !isSpace(reader.front) );
   165  	}
   166  
   167  	this( Reader reader, Token current = null )
   168  	{
   169  		this.reader = reader;
   170  		readWhile!isSpace();
   171  		this.current = (current is null ? readNext() : current);
   172  	}
   173  
   174  	public static
   175  	{
   176  		bool isSpace   (dchar c) { return std.ctype.isspace(c)!=0; }
   177  		bool isSymbol  (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
   178  		bool isSSymbol (dchar c) { return "()[]{};@".canFind(c); }
   179  		bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
   180  		bool isLetter  (dchar c) { return !isSpace(c) && !isSymbol(c); }
   181  	}
   182  
   183  	string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}
   184  	string readQuoted(const LexPosition pos, ref char[] buf)
   185  	{
   186  		if( reader.empty )
   187  			throw genex!UnexpectedEOF(pos, "Quoted string not terminated");
   188  		dchar c = reader.front;
   189  		reader.popFront;
   190  		if( c == '"' )
   191  			return assumeUnique(buf);
   192  		if( c == '\\' && !reader.empty ) {
   193  			if( reader.front=='"' ) {
   194  				reader.popFront;
   195  				return readQuoted(pos,buf ~= '\"');
   196  			}
   197  			if( reader.front=='\\' ) {
   198  				reader.popFront;
   199  				return readQuoted(pos,buf ~= '\\');
   200  			}
   201  		}
   202  		return readQuoted(pos,buf ~= c);
   203  	}
   204  
   205  	string readWhile(alias fn)()
   206  	{
   207  		char[] buf;
   208  		for(; !reader.empty && fn(reader.front); reader.popFront)
   209  			buf ~= reader.front;
   210  		return assumeUnique(buf);
   211  	}
   212  
   213  	Token readNext()
   214  	{
   215  		if( reader.empty )
   216  			return null;
   217  		scope(success)
   218  			readWhile!isSpace();
   219  		if( reader.front == '#' ) // comment
   220  		{
   221  			reader = find(reader, '\n');
   222  			readWhile!isSpace();
   223  			return readNext();
   224  		}
   225  		else if( reader.front == '"' ) // quoted
   226  		{
   227  			auto pos = reader.currentPosition();
   228  			reader.popFront;
   229  			return new Token(pos, readQuoted(pos), true);
   230  		}
   231  		else if( isSSymbol(reader.front) ) // paren
   232  		{
   233  			auto pos = reader.currentPosition();
   234  			string s; s~=reader.front; reader.popFront;
   235  			return new Token(pos, s, false);
   236  		}
   237  		else if( isMSymbol(reader.front) ) // symbol
   238  		{
   239  			auto pos = reader.currentPosition();
   240  			return new Token(pos, readWhile!isMSymbol(), false);
   241  		}
   242  		else
   243  		{
   244  			auto pos = reader.currentPosition();
   245  			return new Token(pos, readWhile!isLetter(), false);
   246  		}
   247  	}
   248  }
   249  
   250  unittest
   251  {
   252  	assert( std.range.isForwardRange!(Lexer) );
   253  	assert( is(ElementType!(Lexer) == Token) );
   254  }
   255  
   256  unittest
   257  {
   258  	auto lex = lexerFromString("this	is a \t\r\n pen :-( @@;  ");
   259  	Token[] ts = std.array.array(lex);
   260  
   261  	assert_eq( ts[0].pos.lineno, 1 );
   262  	assert_eq( ts[0].pos.column, 1 );
   263  	assert(   !ts[0].quoted );
   264  	assert_eq( ts[0].str, "this" );
   265  
   266  	assert_eq( ts[1].pos.lineno, 1 );
   267  	assert_eq( ts[1].pos.column, 6 );
   268  	assert(   !ts[1].quoted );
   269  	assert_eq( ts[1].str, "is" );
   270  
   271  	assert_eq( ts[2].pos.lineno, 1 );
   272  	assert_eq( ts[2].pos.column, 9 );
   273  	assert(   !ts[2].quoted );
   274  	assert_eq( ts[2].str, "a" );
   275  
   276  	assert_eq( ts[3].pos.lineno, 2 );
   277  	assert_eq( ts[3].pos.column, 2 );
   278  	assert(   !ts[3].quoted );
   279  	assert_eq( ts[3].str, "pen" );
   280  
   281  	assert_eq( ts[4].pos.lineno, 2 );
   282  	assert_eq( ts[4].pos.column, 6 );
   283  	assert_eq( ts[4].str, ":-" );
   284  
   285  	assert_eq( ts[5].pos.lineno, 2 );
   286  	assert_eq( ts[5].pos.column, 8 );
   287  	assert_eq( ts[5].str, "(" );
   288  	assert_eq( ts[6].str, "@" );
   289  	assert_eq( ts[7].str, "@" );
   290  	assert_eq( ts[8].str, ";" ); // paren and simicolons, atmarks are split
   291  
   292  	assert_eq( ts.length, 9 );
   293  }
   294  
   295  unittest
   296  {
   297  	// !! be sure to run the unittest on the root of the source directory
   298  	auto lexf = lexerFromFile("polemy/lex.d");	
   299  	lexf = find!`a.str == "module"`(lexf);
   300  	assert_eq( lexf.front.str, "module" );
   301  	assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
   302  	assert_eq( lexf.front.pos.lineno, 7 );
   303  	assert_eq( lexf.front.pos.column, 1 );
   304  	lexf.popFront;
   305  	assert_eq( lexf.front.str, "polemy" );
   306  	assert_eq( lexf.front.pos.lineno, 7 );
   307  	assert_eq( lexf.front.pos.column, 8 );
   308  	lexf.popFront;
   309  	lexf.popFront;
   310  	lexf.popFront;
   311  	lexf.popFront;
   312  	assert_eq( lexf.front.str, "import" );
   313  	assert_eq( lexf.front.pos.lineno, 8 );
   314  	assert_eq( lexf.front.pos.column, 1 );
   315  }
   316  
   317  unittest
   318  {
   319  	assert_throw!UnexpectedEOF( lexerFromString(`"`) );
   320  }
   321  
   322  unittest
   323  {
   324  	auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
   325  be ignored.
   326  hahaha"hihihi""hu\\\"huhu"#123 aa
   327  123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
   328  zzz
   329  `);
   330  	Token[] ts = std.array.array(lex);
   331  	assert_eq( ts[0].str, "my" );
   332  	assert_eq( ts[0].pos.lineno, 1 );
   333  	assert(   !ts[0].quoted );
   334  	assert_eq( ts[1].str, "be" );
   335  	assert_eq( ts[1].pos.lineno, 3 );
   336  	assert(   !ts[1].quoted );
   337  	assert_eq( ts[2].str, "ignored" );
   338  	assert(   !ts[2].quoted );
   339  	assert_eq( ts[3].str, "." );
   340  	assert(   !ts[3].quoted );
   341  	assert_eq( ts[4].str, "hahaha" );
   342  	assert_eq( ts[4].pos.lineno, 4 );
   343  	assert(   !ts[4].quoted );
   344  	assert_eq( ts[5].str, "hihihi" );
   345  	assert_eq( ts[5].pos.lineno, 4 );
   346  	assert(    ts[5].quoted );
   347  	assert_eq( ts[6].str, `hu\"huhu` );
   348  	assert_eq( ts[6].pos.lineno, 4 );
   349  	assert(    ts[6].quoted );
   350  	assert_eq( ts[7].str, "123" );
   351  	assert_eq( ts[7].pos.lineno, 5 );
   352  	assert_eq( ts[8].str, "aa" );
   353  	assert_eq( ts[9].pos.lineno, 5 );
   354  	assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
   355  	assert(    ts[9].quoted );
   356  	assert_eq( ts[10].pos.lineno, 8 );
   357  	assert(   !ts[10].quoted );
   358  	assert_eq( ts.length, 11 );
   359  }
   360  
   361  unittest
   362  {
   363  	auto lex2 = lexerFromString(" a12\n3a 5 ");
   364  	assert_eq( lex2.front.str, "a12" );
   365  	lex2.popFront;
   366  	auto lex3 = lex2.save;
   367  	assert_eq( lex2.front.str, "3a" );
   368  	lex2.popFront;
   369  	assert_eq( lex3.front.str, "3a" );
   370  	assert_eq( lex2.front.str, "5" );
   371  	lex2.popFront;
   372  	lex3.popFront;
   373  	assert( lex2.empty );
   374  	assert( !lex3.empty );
   375  	assert_eq( lex3.front.str, "5" );
   376  }
   377  
   378  unittest
   379  {
   380  	auto lex = lexerFromString(`=""`);
   381  	assert_eq(lex.front.str, "="); lex.popFront;
   382  	assert_eq(lex.front.str, ""); lex.popFront;
   383  	assert( lex.empty );
   384  	assert_eq( lexerFromString(`-@`).front.str, "-" );
   385  }
   386  
   387  /// Forward range for reader character by character,
   388  /// keeping track of position information and caring \r\n -> \n conversion.
   389  
   390  struct PositionedReader(CharSeq)
   391  	if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq)==dchar) )
   392  {
   393  	CharSeq buffer;
   394  	string  filename;
   395  	int     lineno;
   396  	int     column;
   397  
   398  	/// Range primitive
   399  	bool empty() /*@property*/
   400  	{
   401  		return buffer.empty;
   402  	}
   403  
   404  	/// Range primitive
   405  	dchar front() /*@property*/
   406  	{
   407  		dchar c = buffer.front;
   408  		return (c=='\r' ? '\n' : c);
   409  	}
   410  
   411  	/// Range primitive
   412  	void popFront() /*@property*/
   413  	{
   414  		dchar c = buffer.front;
   415  		buffer.popFront;
   416  		if( c=='\r' )
   417  		{
   418  			if( !buffer.empty && buffer.front=='\n' )
   419  				buffer.popFront;
   420  			c = '\n';
   421  		}
   422  		if( c=='\n' )
   423  		{
   424  			lineno ++;	
   425  			column = 1;
   426  		}
   427  		else
   428  			column ++;
   429  	}
   430  
   431  	/// Range primitive
   432  	typeof(this) save() /*@property*/
   433  	{
   434  		return this;
   435  	}
   436  
   437  	/// Get the current position
   438  	immutable(LexPosition) currentPosition() const
   439  	{
   440  		return new immutable(LexPosition)(filename, lineno, column);
   441  	}
   442  }
   443  
   444  unittest
   445  {
   446  	assert( isForwardRange!(PositionedReader!string) );
   447  	assert( is(ElementType!(PositionedReader!string) == dchar) );
   448  }