Artifact Content
Not logged in

Artifact df057f5d35e6bc8ab722a25dc86010063c65dc8e


     1  /**
     2   * Authors: k.inaba
     3   * License: NYSL 0.9982 http://www.kmonos.net/nysl/
     4   *
     5   * Lexer for Polemy programming language.
     6   */
     7  module polemy.lex;
     8  import polemy._common;
     9  import std.file  : readText;
    10  import std.ctype : isspace, isalnum;
    11  
    12  /// Exception from this module
    13  
    14  class LexException : Exception
    15  {
    16  	const LexPosition pos;
    17  
    18  	this( const LexPosition pos, string msg, string file="", int line=0 )
    19  		{ super(sprintf!"[%s] %s"(pos, msg), file, line); this.pos = pos; }
    20  };
    21  
    22  /// Represents a position in a source code
    23  
    24  class LexPosition
    25  {
    26  	immutable string filename; /// name of the source file
    27  	immutable int    lineno;   /// 1-origin
    28  	immutable int    column;   /// 1-origin
    29  
    30  	mixin SimpleClass;
    31  	override string toString() const
    32  		{ return sprintf!"%s:%d:%d"(filename, lineno, column); }
    33  
    34  	static immutable LexPosition dummy;
    35  	static this(){ dummy = new immutable(LexPosition)("<unnamed>",0,0); }
    36  }
    37  
    38  unittest
    39  {
    40  	auto p = new LexPosition("hello.cpp", 123, 45);
    41  	auto q = new LexPosition("hello.cpp", 123, 46);
    42  
    43  	assert_eq( p.filename, "hello.cpp" );
    44  	assert_eq( p.lineno, 123 );
    45  	assert_eq( p.column, 45 );
    46  	assert_eq( to!string(p), "hello.cpp:123:45" );
    47  	assert_lt( p, q );
    48  	assert_ne( p, q );
    49  
    50  	assert( !__traits(compiles, new LexPosition) );
    51  	assert( !__traits(compiles, p.filename="foo") );
    52  	assert( !__traits(compiles, p.lineno  =789) );
    53  	assert( !__traits(compiles, p.column  =222) );
    54  }
    55  
    56  /// Represents a lexer token
    57  
    58  class Token
    59  {
    60  	immutable LexPosition pos;    /// Position where the token occurred in the source
    61  	immutable string      str;    /// The token string itself
    62  	immutable bool        quoted; /// Was it a "quoted" token or unquoted?
    63  
    64  	mixin SimpleClass;
    65  }
    66  
    67  unittest
    68  {
    69  	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
    70  	auto t = new Token(p, "class", false);
    71  	auto u = new Token(p, "class", true);
    72  
    73  	assert_eq( t.pos, p );
    74  	assert_eq( t.str, "class" );
    75  	assert( !t.quoted );
    76  	assert_eq( t, new Token(p, "class", false) );
    77  	assert_lt( t, new Token(p, "struct", false) );
    78  	assert_ne( t, u );
    79  	assert( u.quoted );
    80  
    81  	assert( !__traits(compiles, new Token) );
    82  	assert( !__traits(compiles, t.pos=p) );
    83  	assert( !__traits(compiles, t.str=789) );
    84  	assert( !__traits(compiles, t.quoted=true) );
    85  }
    86  
    87  /// Named Construtors for Lexer
    88  
    89  auto lexerFromFile(T...)( string filename, T rest )
    90  {
    91  	return lexerFromString( std.file.readText(filename), filename, rest );
    92  }
    93  	
    94  auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
    95  {
    96   	return new LexerT!(PositionedReader!CharSeq)(
    97  		PositionedReader!CharSeq(str, filename, lineno, column)
    98  	);
    99  }
   100  
   101  /// Standard Lexer Type (all you have to know is that this is a forward range of Tokens)
   102  
   103  alias LexerT!(PositionedReader!string) Lexer;
   104  
   105  /// Lexer Implementation
   106  
   107  class LexerT(Reader)
   108  	if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
   109  {
   110  	/// Range primitive
   111  	bool empty() /*@property*/
   112  	{
   113  		return current is null;
   114  	}
   115  
   116  	/// Range primitive
   117  	Token front() /*@property*/
   118  	{
   119  		return std.exception.enforce(current, "Lexer has already reached the end");
   120  	}
   121  
   122  	/// Range primitive
   123  	void popFront() /*@property*/
   124  	{
   125  		std.exception.enforce(current, "Lexer has already reached the end");
   126  		current = readNext();
   127  	}
   128  
   129  	/// Range primitive
   130  	typeof(this) save() /*@property*/
   131  	{
   132  		return new typeof(this)(reader.save, current);
   133  	}
   134  
   135  private: // implementation
   136  
   137  	Reader reader;
   138  	Token  current;
   139  
   140  	invariant()
   141  	{
   142  		assert( reader.empty || !std.ctype.isspace(reader.front) );
   143  	}
   144  
   145  	this( Reader reader, Token current = null )
   146  	{
   147  		this.reader = reader;
   148  		readWhile!isSpace();
   149  		this.current = (current is null ? readNext() : current);
   150  	}
   151  
   152  	public static {
   153  		bool isSpace   (dchar c) { return std.ctype.isspace(c)!=0; }
   154  		bool isSymbol  (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
   155  		bool isSSymbol (dchar c) { return !find("()[]{};", c).empty; }
   156  		bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
   157  		bool isLetter  (dchar c) { return !isSpace(c) && !isSymbol(c); }
   158  	}
   159  
   160  	string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}
   161  	string readQuoted(const LexPosition pos, ref char[] buf)
   162  	{
   163  		if( reader.empty )
   164  			throw new LexException(pos, "EOF found while lexing a quoted-string");
   165  		dchar c = reader.front;
   166  		reader.popFront;
   167  		if( c == '"' )
   168  			return assumeUnique(buf);
   169  		if( c == '\\' && !reader.empty ) {
   170  			if( reader.front=='"' ) {
   171  				reader.popFront;
   172  				return readQuoted(pos,buf ~= '\"');
   173  			}
   174  			if( reader.front=='\\' ) {
   175  				reader.popFront;
   176  				return readQuoted(pos,buf ~= '\\');
   177  			}
   178  		}
   179  		return readQuoted(pos,buf ~= c);
   180  	}
   181  
   182  	string readWhile(alias fn)()
   183  	{
   184  		char[] buf;
   185  		for(; !reader.empty && fn(reader.front); reader.popFront)
   186  			buf ~= reader.front;
   187  		return assumeUnique(buf);
   188  	}
   189  
   190  	Token readNext()
   191  	{
   192  		if( reader.empty )
   193  			return null;
   194  		scope(success)
   195  			readWhile!isSpace();
   196  		if( reader.front == '#' ) // comment
   197  		{
   198  			reader = find(reader, '\n');
   199  			readWhile!isSpace();
   200  			return readNext();
   201  		}
   202  		else if( reader.front == '"' ) // quoted
   203  		{
   204  			auto pos = reader.currentPosition();
   205  			reader.popFront;
   206  			return new Token(pos, readQuoted(pos), true);
   207  		}
   208  		else if( isSSymbol(reader.front) ) // paren
   209  		{
   210  			auto pos = reader.currentPosition();
   211  			string s; s~=reader.front; reader.popFront;
   212  			return new Token(pos, s, false);
   213  		}
   214  		else if( isMSymbol(reader.front) ) // symbol
   215  		{
   216  			auto pos = reader.currentPosition();
   217  			return new Token(pos, readWhile!isMSymbol(), false);
   218  		}
   219  		else
   220  		{
   221  			auto pos = reader.currentPosition();
   222  			return new Token(pos, readWhile!isLetter(), false);
   223  		}
   224  	}
   225  }
   226  
   227  unittest
   228  {
   229  	assert( std.range.isForwardRange!(Lexer) );
   230  }
   231  
   232  unittest
   233  {
   234  	auto lex = lexerFromString("this	is a \t\r\n pen :-( @@;  ");
   235  	Token[] ts = std.array.array(lex);
   236  
   237  	assert_eq( ts[0].pos.lineno, 1 );
   238  	assert_eq( ts[0].pos.column, 1 );
   239  	assert(   !ts[0].quoted );
   240  	assert_eq( ts[0].str, "this" );
   241  
   242  	assert_eq( ts[1].pos.lineno, 1 );
   243  	assert_eq( ts[1].pos.column, 6 );
   244  	assert(   !ts[1].quoted );
   245  	assert_eq( ts[1].str, "is" );
   246  
   247  	assert_eq( ts[2].pos.lineno, 1 );
   248  	assert_eq( ts[2].pos.column, 9 );
   249  	assert(   !ts[2].quoted );
   250  	assert_eq( ts[2].str, "a" );
   251  
   252  	assert_eq( ts[3].pos.lineno, 2 );
   253  	assert_eq( ts[3].pos.column, 2 );
   254  	assert(   !ts[3].quoted );
   255  	assert_eq( ts[3].str, "pen" );
   256  
   257  	assert_eq( ts[4].pos.lineno, 2 );
   258  	assert_eq( ts[4].pos.column, 6 );
   259  	assert_eq( ts[4].str, ":-" );
   260  
   261  	assert_eq( ts[5].pos.lineno, 2 );
   262  	assert_eq( ts[5].pos.column, 8 );
   263  	assert_eq( ts[5].str, "(" );
   264  	assert_eq( ts[6].str, "@@" );
   265  	assert_eq( ts[7].str, ";" ); // paren and simicolons are split
   266  
   267  	assert_eq( ts.length, 8 );
   268  }
   269  
   270  unittest
   271  {
   272  	// !! be sure to run the unittest on the root of the source directory
   273  	auto lexf = lexerFromFile("polemy/lex.d");	
   274  	lexf = find!`a.str == "module"`(lexf);
   275  	assert_eq( lexf.front.str, "module" );
   276  	assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
   277  	assert_eq( lexf.front.pos.lineno, 7 );
   278  	assert_eq( lexf.front.pos.column, 1 );
   279  	lexf.popFront;
   280  	assert_eq( lexf.front.str, "polemy" );
   281  	assert_eq( lexf.front.pos.lineno, 7 );
   282  	assert_eq( lexf.front.pos.column, 8 );
   283  	lexf.popFront;
   284  	lexf.popFront;
   285  	lexf.popFront;
   286  	lexf.popFront;
   287  	assert_eq( lexf.front.str, "import" );
   288  	assert_eq( lexf.front.pos.lineno, 8 );
   289  	assert_eq( lexf.front.pos.column, 1 );
   290  }
   291  
   292  unittest
   293  {
   294  	assert_throw!LexException( lexerFromString(`"`) );
   295  }
   296  
   297  unittest
   298  {
   299  	auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
   300  be ignored.
   301  hahaha"hihihi""hu\\\"huhu"#123 aa
   302  123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
   303  zzz
   304  `);
   305  	Token[] ts = std.array.array(lex);
   306  	assert_eq( ts[0].str, "my" );
   307  	assert_eq( ts[0].pos.lineno, 1 );
   308  	assert(   !ts[0].quoted );
   309  	assert_eq( ts[1].str, "be" );
   310  	assert_eq( ts[1].pos.lineno, 3 );
   311  	assert(   !ts[1].quoted );
   312  	assert_eq( ts[2].str, "ignored" );
   313  	assert(   !ts[2].quoted );
   314  	assert_eq( ts[3].str, "." );
   315  	assert(   !ts[3].quoted );
   316  	assert_eq( ts[4].str, "hahaha" );
   317  	assert_eq( ts[4].pos.lineno, 4 );
   318  	assert(   !ts[4].quoted );
   319  	assert_eq( ts[5].str, "hihihi" );
   320  	assert_eq( ts[5].pos.lineno, 4 );
   321  	assert(    ts[5].quoted );
   322  	assert_eq( ts[6].str, `hu\"huhu` );
   323  	assert_eq( ts[6].pos.lineno, 4 );
   324  	assert(    ts[6].quoted );
   325  	assert_eq( ts[7].str, "123" );
   326  	assert_eq( ts[7].pos.lineno, 5 );
   327  	assert_eq( ts[8].str, "aa" );
   328  	assert_eq( ts[9].pos.lineno, 5 );
   329  	assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
   330  	assert(    ts[9].quoted );
   331  	assert_eq( ts[10].pos.lineno, 8 );
   332  	assert(   !ts[10].quoted );
   333  	assert_eq( ts.length, 11 );
   334  }
   335  
   336  unittest
   337  {
   338  	auto lex2 = lexerFromString(" a12\n3a 5 ");
   339  	assert_eq( lex2.front.str, "a12" );
   340  	lex2.popFront;
   341  	auto lex3 = lex2.save;
   342  	assert_eq( lex2.front.str, "3a" );
   343  	lex2.popFront;
   344  	assert_eq( lex3.front.str, "3a" );
   345  	assert_eq( lex2.front.str, "5" );
   346  	lex2.popFront;
   347  	lex3.popFront;
   348  	assert( lex2.empty );
   349  	assert( !lex3.empty );
   350  	assert_eq( lex3.front.str, "5" );
   351  }
   352  
   353  unittest
   354  {
   355  	auto lex = lexerFromString(`=""`);
   356  	assert_eq(lex.front.str, "="); lex.popFront;
   357  	assert_eq(lex.front.str, ""); lex.popFront;
   358  	assert( lex.empty );
   359  }
   360  
   361  /// Forward range for reader character by character,
   362  /// keeping track of position information and caring \r\n -> \n conversion.
   363  
   364  private
   365  struct PositionedReader(CharSeq)
   366  	if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
   367  {
   368  	CharSeq buffer;
   369  	string  filename;
   370  	int     lineno;
   371  	int     column;
   372  
   373  	/// Range primitive
   374  	bool empty() /*@property*/
   375  	{
   376  		return buffer.empty;
   377  	}
   378  
   379  	/// Range primitive
   380  	dchar front() /*@property*/
   381  	{
   382  		dchar c = buffer.front;
   383  		return (c=='\r' ? '\n' : c);
   384  	}
   385  
   386  	/// Range primitive
   387  	void popFront() /*@property*/
   388  	{
   389  		dchar c = buffer.front;
   390  		buffer.popFront;
   391  		if( c=='\r' )
   392  		{
   393  			if( !buffer.empty && buffer.front=='\n' )
   394  				buffer.popFront;
   395  			c = '\n';
   396  		}
   397  		if( c=='\n' )
   398  		{
   399  			lineno ++;
   400  			column = 1;
   401  		}
   402  		else
   403  			column ++;
   404  	}
   405  
   406  	/// Range primitive
   407  	typeof(this) save() /*@property*/
   408  	{
   409  		return this;
   410  	}
   411  
   412  	/// Get the current position
   413  	immutable(LexPosition) currentPosition() const
   414  	{
   415  		return new immutable(LexPosition)(filename, lineno, column);
   416  	}
   417  }
   418  
   419  unittest
   420  {
   421  	assert( isForwardRange!(PositionedReader!string) );
   422  	assert( is(ElementType!(PositionedReader!string) == dchar) );
   423  }