Artifact Content
Not logged in

Artifact a4ea515a0f014c9c7d8c84d891b34a92b3b79730


     1  /**
     2   * Authors: k.inaba
     3   * License: NYSL 0.9982 http://www.kmonos.net/nysl/
     4   *
     5   * Lexer for Polemy programming language.
     6   */
     7  module polemy.lex;
     8  import polemy._common;
     9  import std.file  : readText;
    10  import std.ctype : isspace, isalnum;
    11  
    12  /*mixin*/
    13  template ExceptionWithPosition()
    14  {
    15  	const LexPosition pos;
    16  	this( const LexPosition pos, string msg, string file=null, size_t line=0, Throwable next=null )
    17  		{ super(sprintf!"[%s] %s"(pos, msg), file, line, next); this.pos = pos; }
    18  }
    19  
    20  /// Thrown when encountered an EOF in the middle of a lexical token
    21  
    22  class UnexpectedEOF : Exception
    23  {
    24  	mixin ExceptionWithPosition;
    25  }
    26  
    27  /// Thrown when encountered a lexical error
    28  
    29  class LexException : Exception
    30  {
    31  	mixin ExceptionWithPosition;
    32  };
    33  
    34  /// Represents a position in source codes
    35  
    36  class LexPosition
    37  {
    38  	immutable string filename; /// name of the source file
    39  	immutable int    lineno;   /// 1-origin
    40  	immutable int    column;   /// 1-origin
    41  
    42  	mixin SimpleClass;
    43  	override string toString() const
    44  		{ return sprintf!"%s:%d:%d"(filename, lineno, column); }
    45  
    46  	static immutable LexPosition dummy;
    47  	static this(){ dummy = new immutable(LexPosition)("<unnamed>",0,0); }
    48  }
    49  
    50  unittest
    51  {
    52  	auto p = new LexPosition("hello.cpp", 123, 45);
    53  
    54  	assert_eq( p.filename, "hello.cpp" );
    55  	assert_eq( p.lineno, 123 );
    56  	assert_eq( p.column, 45 );
    57  	assert_eq( to!string(p), "hello.cpp:123:45" );
    58  
    59  	assert( !__traits(compiles, new LexPosition) );
    60  	assert( !__traits(compiles, p.filename="foo") );
    61  	assert( !__traits(compiles, p.lineno  =789) );
    62  	assert( !__traits(compiles, p.column  =222) );
    63  
    64  	auto q = new LexPosition("hello.cpp", 123, 46);
    65  	assert_lt( p, q );
    66  	assert_ne( p, q );
    67  }
    68  
    69  /// Represents a lexer token
    70  
    71  class Token
    72  {
    73  	immutable LexPosition pos;    /// Position where the token occurred in the source
    74  	immutable string      str;    /// The token string itself
    75  	immutable bool        quoted; /// Was it a "quoted" token or unquoted?
    76  
    77  	mixin SimpleClass;
    78  }
    79  
    80  unittest
    81  {
    82  	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
    83  	auto t = new Token(p, "class", false);
    84  	auto u = new Token(p, "class", true);
    85  
    86  	assert_eq( t.pos, p );
    87  	assert_eq( t.str, "class" );
    88  	assert( !t.quoted );
    89  	assert_eq( t, new Token(p, "class", false) );
    90  	assert_lt( t, new Token(p, "struct", false) );
    91  	assert_ne( t, u );
    92  	assert( u.quoted );
    93  
    94  	assert( !__traits(compiles, new Token) );
    95  	assert( !__traits(compiles, t.pos=p) );
    96  	assert( !__traits(compiles, t.str=789) );
    97  	assert( !__traits(compiles, t.quoted=true) );
    98  }
    99  
   100  /// Named Construtors for Lexer
   101  
   102  Lexer lexerFromFile(T...)( string filename, T ln_cn )
   103  {
   104  	return lexerFromString( std.file.readText(filename), filename, ln_cn );
   105  }
   106  	
   107  /// Named Construtor for Lexer
   108  
   109  LexerT!(PositionedReader!CharSeq) /* ddoc doesn't recognize auto return... bugzilla:2581 */
   110  lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
   111  {
   112   	return new LexerT!(PositionedReader!CharSeq)(
   113  		PositionedReader!CharSeq(str, filename, lineno, column)
   114  	);
   115  }
   116  
   117  /// Standard Lexer Type (all you have to know is that this is a forward range of Tokens!)
   118  
   119  alias LexerT!(PositionedReader!string) Lexer;
   120  
   121  /// Lexer Implementation
   122  
   123  class LexerT(Reader)
   124  	if( isForwardRange!(Reader) && is(ElementType!(Reader)==dchar) )
   125  {
   126  	/// Range primitive
   127  	bool empty() /*@property*/
   128  	{
   129  		return current is null;
   130  	}
   131  
   132  	/// Range primitive
   133  	Token front() /*@property*/
   134  	{
   135  		return std.exception.enforce(current, "Lexer has already reached the end");
   136  	}
   137  
   138  	/// Range primitive
   139  	void popFront() /*@property*/
   140  	{
   141  		std.exception.enforce(current, "Lexer has already reached the end");
   142  		current = readNext();
   143  	}
   144  
   145  	/// Range primitive
   146  	typeof(this) save() /*@property*/
   147  	{
   148  		return new typeof(this)(reader.save, current);
   149  	}
   150  
   151  private: // implementation
   152  
   153  	Reader reader;
   154  	Token  current;
   155  
   156  	invariant()
   157  	{
   158  		assert( reader.empty || !isSpace(reader.front) );
   159  	}
   160  
   161  	this( Reader reader, Token current = null )
   162  	{
   163  		this.reader = reader;
   164  		readWhile!isSpace();
   165  		this.current = (current is null ? readNext() : current);
   166  	}
   167  
   168  	public static
   169  	{
   170  		bool isSpace   (dchar c) { return std.ctype.isspace(c)!=0; }
   171  		bool isSymbol  (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
   172  		bool isSSymbol (dchar c) { return "()[]{};@".canFind(c); }
   173  		bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
   174  		bool isLetter  (dchar c) { return !isSpace(c) && !isSymbol(c); }
   175  	}
   176  
   177  	string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}
   178  	string readQuoted(const LexPosition pos, ref char[] buf)
   179  	{
   180  		if( reader.empty )
   181  			throw genex!UnexpectedEOF(pos, "Quoted string not terminated");
   182  		dchar c = reader.front;
   183  		reader.popFront;
   184  		if( c == '"' )
   185  			return assumeUnique(buf);
   186  		if( c == '\\' && !reader.empty ) {
   187  			if( reader.front=='"' ) {
   188  				reader.popFront;
   189  				return readQuoted(pos,buf ~= '\"');
   190  			}
   191  			if( reader.front=='\\' ) {
   192  				reader.popFront;
   193  				return readQuoted(pos,buf ~= '\\');
   194  			}
   195  		}
   196  		return readQuoted(pos,buf ~= c);
   197  	}
   198  
   199  	string readWhile(alias fn)()
   200  	{
   201  		char[] buf;
   202  		for(; !reader.empty && fn(reader.front); reader.popFront)
   203  			buf ~= reader.front;
   204  		return assumeUnique(buf);
   205  	}
   206  
   207  	Token readNext()
   208  	{
   209  		if( reader.empty )
   210  			return null;
   211  		scope(success)
   212  			readWhile!isSpace();
   213  		if( reader.front == '#' ) // comment
   214  		{
   215  			reader = find(reader, '\n');
   216  			readWhile!isSpace();
   217  			return readNext();
   218  		}
   219  		else if( reader.front == '"' ) // quoted
   220  		{
   221  			auto pos = reader.currentPosition();
   222  			reader.popFront;
   223  			return new Token(pos, readQuoted(pos), true);
   224  		}
   225  		else if( isSSymbol(reader.front) ) // paren
   226  		{
   227  			auto pos = reader.currentPosition();
   228  			string s; s~=reader.front; reader.popFront;
   229  			return new Token(pos, s, false);
   230  		}
   231  		else if( isMSymbol(reader.front) ) // symbol
   232  		{
   233  			auto pos = reader.currentPosition();
   234  			return new Token(pos, readWhile!isMSymbol(), false);
   235  		}
   236  		else
   237  		{
   238  			auto pos = reader.currentPosition();
   239  			return new Token(pos, readWhile!isLetter(), false);
   240  		}
   241  	}
   242  }
   243  
   244  unittest
   245  {
   246  	assert( std.range.isForwardRange!(Lexer) );
   247  	assert( is(ElementType!(Lexer) == Token) );
   248  }
   249  
   250  unittest
   251  {
   252  	auto lex = lexerFromString("this	is a \t\r\n pen :-( @@;  ");
   253  	Token[] ts = std.array.array(lex);
   254  
   255  	assert_eq( ts[0].pos.lineno, 1 );
   256  	assert_eq( ts[0].pos.column, 1 );
   257  	assert(   !ts[0].quoted );
   258  	assert_eq( ts[0].str, "this" );
   259  
   260  	assert_eq( ts[1].pos.lineno, 1 );
   261  	assert_eq( ts[1].pos.column, 6 );
   262  	assert(   !ts[1].quoted );
   263  	assert_eq( ts[1].str, "is" );
   264  
   265  	assert_eq( ts[2].pos.lineno, 1 );
   266  	assert_eq( ts[2].pos.column, 9 );
   267  	assert(   !ts[2].quoted );
   268  	assert_eq( ts[2].str, "a" );
   269  
   270  	assert_eq( ts[3].pos.lineno, 2 );
   271  	assert_eq( ts[3].pos.column, 2 );
   272  	assert(   !ts[3].quoted );
   273  	assert_eq( ts[3].str, "pen" );
   274  
   275  	assert_eq( ts[4].pos.lineno, 2 );
   276  	assert_eq( ts[4].pos.column, 6 );
   277  	assert_eq( ts[4].str, ":-" );
   278  
   279  	assert_eq( ts[5].pos.lineno, 2 );
   280  	assert_eq( ts[5].pos.column, 8 );
   281  	assert_eq( ts[5].str, "(" );
   282  	assert_eq( ts[6].str, "@" );
   283  	assert_eq( ts[7].str, "@" );
   284  	assert_eq( ts[8].str, ";" ); // paren and simicolons, atmarks are split
   285  
   286  	assert_eq( ts.length, 9 );
   287  }
   288  
   289  unittest
   290  {
   291  	// !! be sure to run the unittest on the root of the source directory
   292  	auto lexf = lexerFromFile("polemy/lex.d");	
   293  	lexf = find!`a.str == "module"`(lexf);
   294  	assert_eq( lexf.front.str, "module" );
   295  	assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
   296  	assert_eq( lexf.front.pos.lineno, 7 );
   297  	assert_eq( lexf.front.pos.column, 1 );
   298  	lexf.popFront;
   299  	assert_eq( lexf.front.str, "polemy" );
   300  	assert_eq( lexf.front.pos.lineno, 7 );
   301  	assert_eq( lexf.front.pos.column, 8 );
   302  	lexf.popFront;
   303  	lexf.popFront;
   304  	lexf.popFront;
   305  	lexf.popFront;
   306  	assert_eq( lexf.front.str, "import" );
   307  	assert_eq( lexf.front.pos.lineno, 8 );
   308  	assert_eq( lexf.front.pos.column, 1 );
   309  }
   310  
   311  unittest
   312  {
   313  	assert_throw!UnexpectedEOF( lexerFromString(`"`) );
   314  }
   315  
   316  unittest
   317  {
   318  	auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
   319  be ignored.
   320  hahaha"hihihi""hu\\\"huhu"#123 aa
   321  123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
   322  zzz
   323  `);
   324  	Token[] ts = std.array.array(lex);
   325  	assert_eq( ts[0].str, "my" );
   326  	assert_eq( ts[0].pos.lineno, 1 );
   327  	assert(   !ts[0].quoted );
   328  	assert_eq( ts[1].str, "be" );
   329  	assert_eq( ts[1].pos.lineno, 3 );
   330  	assert(   !ts[1].quoted );
   331  	assert_eq( ts[2].str, "ignored" );
   332  	assert(   !ts[2].quoted );
   333  	assert_eq( ts[3].str, "." );
   334  	assert(   !ts[3].quoted );
   335  	assert_eq( ts[4].str, "hahaha" );
   336  	assert_eq( ts[4].pos.lineno, 4 );
   337  	assert(   !ts[4].quoted );
   338  	assert_eq( ts[5].str, "hihihi" );
   339  	assert_eq( ts[5].pos.lineno, 4 );
   340  	assert(    ts[5].quoted );
   341  	assert_eq( ts[6].str, `hu\"huhu` );
   342  	assert_eq( ts[6].pos.lineno, 4 );
   343  	assert(    ts[6].quoted );
   344  	assert_eq( ts[7].str, "123" );
   345  	assert_eq( ts[7].pos.lineno, 5 );
   346  	assert_eq( ts[8].str, "aa" );
   347  	assert_eq( ts[9].pos.lineno, 5 );
   348  	assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
   349  	assert(    ts[9].quoted );
   350  	assert_eq( ts[10].pos.lineno, 8 );
   351  	assert(   !ts[10].quoted );
   352  	assert_eq( ts.length, 11 );
   353  }
   354  
   355  unittest
   356  {
   357  	auto lex2 = lexerFromString(" a12\n3a 5 ");
   358  	assert_eq( lex2.front.str, "a12" );
   359  	lex2.popFront;
   360  	auto lex3 = lex2.save;
   361  	assert_eq( lex2.front.str, "3a" );
   362  	lex2.popFront;
   363  	assert_eq( lex3.front.str, "3a" );
   364  	assert_eq( lex2.front.str, "5" );
   365  	lex2.popFront;
   366  	lex3.popFront;
   367  	assert( lex2.empty );
   368  	assert( !lex3.empty );
   369  	assert_eq( lex3.front.str, "5" );
   370  }
   371  
   372  unittest
   373  {
   374  	auto lex = lexerFromString(`=""`);
   375  	assert_eq(lex.front.str, "="); lex.popFront;
   376  	assert_eq(lex.front.str, ""); lex.popFront;
   377  	assert( lex.empty );
   378  	assert_eq( lexerFromString(`-@`).front.str, "-" );
   379  }
   380  
   381  /// Forward range for reader character by character,
   382  /// keeping track of position information and caring \r\n -> \n conversion.
   383  
   384  struct PositionedReader(CharSeq)
   385  	if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq)==dchar) )
   386  {
   387  	CharSeq buffer;
   388  	string  filename;
   389  	int     lineno;
   390  	int     column;
   391  
   392  	/// Range primitive
   393  	bool empty() /*@property*/
   394  	{
   395  		return buffer.empty;
   396  	}
   397  
   398  	/// Range primitive
   399  	dchar front() /*@property*/
   400  	{
   401  		dchar c = buffer.front;
   402  		return (c=='\r' ? '\n' : c);
   403  	}
   404  
   405  	/// Range primitive
   406  	void popFront() /*@property*/
   407  	{
   408  		dchar c = buffer.front;
   409  		buffer.popFront;
   410  		if( c=='\r' )
   411  		{
   412  			if( !buffer.empty && buffer.front=='\n' )
   413  				buffer.popFront;
   414  			c = '\n';
   415  		}
   416  		if( c=='\n' )
   417  		{
   418  			lineno ++;
   419  			column = 1;
   420  		}
   421  		else
   422  			column ++;
   423  	}
   424  
   425  	/// Range primitive
   426  	typeof(this) save() /*@property*/
   427  	{
   428  		return this;
   429  	}
   430  
   431  	/// Get the current position
   432  	immutable(LexPosition) currentPosition() const
   433  	{
   434  		return new immutable(LexPosition)(filename, lineno, column);
   435  	}
   436  }
   437  
   438  unittest
   439  {
   440  	assert( isForwardRange!(PositionedReader!string) );
   441  	assert( is(ElementType!(PositionedReader!string) == dchar) );
   442  }