Artifact Content
Not logged in

Artifact bee9af8d0f8f73488395b86b4cf2a9554859cec3


     1  /**
     2   * Authors: k.inaba
     3   * License: NYSL 0.9982 http://www.kmonos.net/nysl/
     4   *
     5   * Lexer for Polemy programming language.
     6   */
     7  module polemy.lex;
     8  import polemy._common;
     9  import std.file  : readText;
    10  import std.ctype : isspace, isalnum;
    11  
    /// Exception thrown by this module; remembers where in the source it happened.
    class LexException : Exception
    {
        /// Source position associated with the error.
        const LexPosition pos;

        /// Builds the message from `msg` followed by `pos` in square brackets
        /// (formatted through the project's sprintf helper) and stores `pos`.
        this( const LexPosition pos, string msg )
        {
            super( sprintf!"%s [%s]"(msg, pos) );
            this.pos = pos;
        }
    }
    20  
    /// Represents a position in a source code.
    ///
    /// All three fields are immutable; instances are built through the
    /// constructor supplied by the SimpleConstructor mixin and compared
    /// through the operators supplied by the SimpleCompare mixin.
    class LexPosition
    {
        immutable string filename; /// name of the source file
        immutable int    lineno;   /// line number, 1, 2, ...
        immutable int    column;   /// column, 1, 2, ...

        /// Renders the position in the form filename:lineno:column.
        override string toString() const
        {
            return sprintf!"%s:%d:%d"(filename, lineno, column);
        }

        mixin SimpleConstructor;
        mixin SimpleCompare;

        /// Placeholder position ("<unnamed>", line 0, column 0).
        static immutable LexPosition dummy;

        // An immutable static is a single program-wide object, not a
        // thread-local one, so it must be initialised exactly once.  A plain
        // `static this()` runs again for every new thread and would re-assign
        // the immutable reference; `shared static this()` runs once per process.
        shared static this()
        {
            dummy = new immutable(LexPosition)("<unnamed>", 0, 0);
        }
    }
    38  
    unittest
    {
        // Two positions in the same file and line, differing only by column.
        auto here  = new LexPosition("hello.cpp", 123, 45);
        auto after = new LexPosition("hello.cpp", 123, 46);

        // Field access and string rendering.
        assert_eq( here.filename, "hello.cpp" );
        assert_eq( here.lineno, 123 );
        assert_eq( here.column, 45 );
        assert_eq( to!string(here), "hello.cpp:123:45" );

        // Ordering and inequality.
        assert_ne( here, after );
        assert_lt( here, after );

        // No default construction, and the fields cannot be reassigned.
        assert( !__traits(compiles, new LexPosition) );
        assert( !__traits(compiles, here.filename="foo") );
        assert( !__traits(compiles, here.lineno  =789) );
        assert( !__traits(compiles, here.column  =222) );
    }
    56  
    /// Represents a lexer token.
    ///
    /// A token is an immutable triple of source position, token text and a
    /// flag telling whether the text came from a "quoted" literal.
    class Token
    {
        immutable LexPosition pos;    /// Position where the token occurred in the source
        immutable string      str;    /// The token string itself
        immutable bool        quoted; /// Was it a "quoted" token or unquoted?

        // NOTE(review): the mixins below are defined elsewhere in the project.
        // Presumably they generate the (pos, str, quoted) constructor, the
        // comparison operators and toString from the fields in declaration
        // order - confirm before reordering the fields above.
        mixin SimpleConstructor;
        mixin SimpleCompare;
        mixin SimpleToString;
    }
    69  
    unittest
    {
        auto where     = new immutable(LexPosition)("hello.cpp", 123, 45);
        auto bare      = new Token(where, "class", false);
        auto inQuotes  = new Token(where, "class", true);

        // Field access.
        assert_eq( bare.pos, where );
        assert_eq( bare.str, "class" );
        assert( !bare.quoted );
        assert( inQuotes.quoted );

        // Comparison: equal when all fields agree, ordered by text, and the
        // quoted flag alone is enough to make two tokens differ.
        assert_eq( bare, new Token(where, "class", false) );
        assert_lt( bare, new Token(where, "struct", false) );
        assert_ne( bare, inQuotes );

        // No default construction, and the fields cannot be reassigned.
        assert( !__traits(compiles, new Token) );
        assert( !__traits(compiles, bare.pos=where) );
        assert( !__traits(compiles, bare.str=789) );
        assert( !__traits(compiles, bare.quoted=true) );
    }
    89  
    /// Named Constructor for Lexer: lexes the contents of the file `filename`.
    ///
    /// The file is read as text and handed to lexerFromString together with
    /// `filename` and any extra arguments in `rest`.
    auto lexerFromFile(T...)( string filename, T rest )
    {
        auto sourceText = std.file.readText(filename);
        return lexerFromString( sourceText, filename, rest );
    }
    96  	
    /// Named Constructor for Lexer: lexes the character sequence `str`.
    ///
    /// `filename`, `lineno` and `column` give the position reported for the
    /// first character of `str`.
    auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
    {
        alias PositionedReader!CharSeq SourceReader;

        auto source = SourceReader(str, filename, lineno, column);
        return new LexerT!(SourceReader)( source );
    }
   105  
   /// Standard Lexer Type (all users have to know is that this is a forward range of Tokens).
   /// It is the lexer instantiated over a PositionedReader of string.
   alias LexerT!(PositionedReader!string) Lexer;
   109  
   /// Lexer Implementation
   ///
   /// A forward range of Tokens layered over `Reader`, a forward range of dchar
   /// that also provides currentPosition().  One token of lookahead is kept in
   /// `current`; a null `current` means the input is exhausted.
   class LexerT(Reader)
       if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
   {
       /// Range primitive: true once every token has been consumed.
       bool empty() /*@property*/
       {
           return current is null;
       }

       /// Range primitive: the current token.
       /// Fails through enforce when the lexer is already empty.
       Token front() /*@property*/
       {
           return std.exception.enforce(current, "Lexer has already reached the end");
       }

       /// Range primitive: advances to the next token.
       /// Fails through enforce when the lexer is already empty.
       void popFront() /*@property*/
       {
           std.exception.enforce(current, "Lexer has already reached the end");
           current = readNext();
       }

       /// Range primitive: a new lexer over a saved copy of the reader,
       /// starting at the same current token.
       typeof(this) save() /*@property*/
       {
           return new typeof(this)(reader.save, current);
       }

   private: // implementation

       Reader reader;   // remaining input
       Token  current;  // lookahead token, null at end of input

       // Between public calls the reader never rests on a whitespace character.
       invariant()
       {
           assert( reader.empty || !std.ctype.isspace(reader.front) );
       }

       // Skips leading whitespace, then either reads the first token or, when
       // `current` is supplied (as save() does), adopts it as the lookahead.
       this( Reader reader, Token current = null )
       {
           this.reader = reader;
           readWhile!isSpace();
           this.current = (current is null ? readNext() : current);
       }

       public static {
           /// Whitespace according to std.ctype.isspace.
           bool isSpace   (dchar c) { return std.ctype.isspace(c)!=0; }
           /// Code point in 0x21..0x7f that is not alphanumeric, not '_' and not '\''.
           bool isSymbol  (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
           /// "Single" symbol: one of ( ) [ ] { } ; - each forms a token on its own.
           bool isSSymbol (dchar c) { return !find("()[]{};", c).empty; }
           /// "Multi" symbol: any symbol that is not a single symbol; runs of these form one token.
           bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c); }
           /// Anything that is neither whitespace nor a symbol.
           bool isLetter  (dchar c) { return !isSpace(c) && !isSymbol(c); }
       }

       // Reads the body of a quoted string whose opening '"' has already been
       // consumed, up to and including the closing '"', and returns the
       // unescaped contents.  The escapes \" and \\ yield '"' and '\'; a
       // backslash before anything else is kept literally.
       // Throws LexException (reported at `pos`) when the input ends first.
       //
       // Written as a loop: the literal's length must not translate into call
       // depth, since tail calls are not guaranteed to be eliminated.
       string readQuoted(const LexPosition pos)
       {
           char[] buf;
           for(;;)
           {
               if( reader.empty )
                   throw new LexException(pos, "EOF found while lexing a quoted-string");

               dchar c = reader.front;
               reader.popFront();

               if( c == '"' )
                   return assumeUnique(buf);

               if( c == '\\' && !reader.empty
                && (reader.front == '"' || reader.front == '\\') )
               {
                   // Recognised escape: drop the backslash, keep the escaped character.
                   c = reader.front;
                   reader.popFront();
               }
               buf ~= c;
           }
       }

       // Consumes the longest prefix of the input whose characters all
       // satisfy `fn` and returns it.
       string readWhile(alias fn)()
       {
           char[] buf;
           for(; !reader.empty && fn(reader.front); reader.popFront())
               buf ~= reader.front;
           return assumeUnique(buf);
       }

       // Reads one token, or returns null at end of input.  On success the
       // whitespace following the token is consumed as well, so that the
       // class invariant holds again.
       Token readNext()
       {
           // Skip any number of '#' comments (each runs to the end of its
           // line) together with the whitespace after them.  Done in a loop
           // so that a long run of comment lines does not deepen the stack.
           while( !reader.empty && reader.front == '#' )
           {
               reader = find(reader, '\n');
               readWhile!isSpace();
           }

           if( reader.empty )
               return null;

           scope(success)
               readWhile!isSpace();

           auto pos = reader.currentPosition();

           if( reader.front == '"' ) // quoted
           {
               reader.popFront();
               return new Token(pos, readQuoted(pos), true);
           }
           else if( isSSymbol(reader.front) ) // paren
           {
               string s; s ~= reader.front; reader.popFront();
               return new Token(pos, s, false);
           }
           else if( isMSymbol(reader.front) ) // symbol
           {
               return new Token(pos, readWhile!isMSymbol(), false);
           }
           else
           {
               return new Token(pos, readWhile!isLetter(), false);
           }
       }
   }
   231  
   unittest
   {
       // The standard Lexer type must qualify as a forward range.
       enum bool lexerIsForwardRange = std.range.isForwardRange!(Lexer);
       assert( lexerIsForwardRange );
   }
   236  
   unittest
   {
       // Checks the position and the text of a single token.
       static void expectAt(Token tok, int line, int col, string text)
       {
           assert_eq( tok.pos.lineno, line );
           assert_eq( tok.pos.column, col );
           assert_eq( tok.str, text );
       }

       Token[] toks = std.array.array( lexerFromString("this	is a \t\r\n pen :-( @@;  ") );

       // Words on the first line, then on the second line after the \r\n.
       expectAt( toks[0], 1, 1, "this" );
       expectAt( toks[1], 1, 6, "is" );
       expectAt( toks[2], 1, 9, "a" );
       expectAt( toks[3], 2, 2, "pen" );
       foreach( i; 0 .. 4 )
           assert( !toks[i].quoted );

       // A run of symbols is one token, but a paren stands alone.
       expectAt( toks[4], 2, 6, ":-" );
       expectAt( toks[5], 2, 8, "(" );
       assert_eq( toks[6].str, "@@" );
       assert_eq( toks[7].str, ";" ); // parens and semicolons are split

       assert_eq( toks.length, 8 );
   }
   274  
   unittest
   {
       // !! be sure to run the unittest on the root of the source directory
       auto lexf = find!`a.str == "module"`( lexerFromFile("polemy/lex.d") );

       // The `module` keyword of this very file.
       assert_eq( lexf.front.str, "module" );
       assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
       assert_eq( lexf.front.pos.lineno, 7 );
       assert_eq( lexf.front.pos.column, 1 );

       // The first component of the module name follows on the same line.
       lexf.popFront();
       assert_eq( lexf.front.str, "polemy" );
       assert_eq( lexf.front.pos.lineno, 7 );
       assert_eq( lexf.front.pos.column, 8 );

       // Four tokens further on, the first `import` on the next line.
       foreach( step; 0 .. 4 )
           lexf.popFront();
       assert_eq( lexf.front.str, "import" );
       assert_eq( lexf.front.pos.lineno, 8 );
       assert_eq( lexf.front.pos.column, 1 );
   }
   296  
   unittest
   {
       // A lone opening quote never finds its closing quote, so building the
       // lexer (which reads the first token) must raise LexException.
       assert_throw!LexException( lexerFromString(`"`) );
   }
   301  
   unittest
   {
       // Mixes '#' comments, quoted strings with escapes and a quoted string
       // spanning several lines (one of them ending in \r\n).
       auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
zzz
`);
       Token[] toks = std.array.array(lex);
       assert_eq( toks.length, 11 );

       // Line 1: everything after '#' is dropped, and so is all of line 2.
       assert_eq( toks[0].str, "my" );
       assert_eq( toks[0].pos.lineno, 1 );
       assert(   !toks[0].quoted );

       // Line 3.
       assert_eq( toks[1].str, "be" );
       assert_eq( toks[1].pos.lineno, 3 );
       assert(   !toks[1].quoted );
       assert_eq( toks[2].str, "ignored" );
       assert(   !toks[2].quoted );
       assert_eq( toks[3].str, "." );
       assert(   !toks[3].quoted );

       // Line 4: a word, two adjacent quoted strings, then a comment.
       assert_eq( toks[4].str, "hahaha" );
       assert_eq( toks[4].pos.lineno, 4 );
       assert(   !toks[4].quoted );
       assert_eq( toks[5].str, "hihihi" );
       assert_eq( toks[5].pos.lineno, 4 );
       assert(    toks[5].quoted );
       assert_eq( toks[6].str, `hu\"huhu` );
       assert_eq( toks[6].pos.lineno, 4 );
       assert(    toks[6].quoted );

       // Line 5: '#' inside a quoted string is not a comment, and \r\n in it reads as \n.
       assert_eq( toks[7].str, "123" );
       assert_eq( toks[7].pos.lineno, 5 );
       assert_eq( toks[8].str, "aa" );
       assert_eq( toks[9].str, "aaa\nbbb # 123\neee" );
       assert_eq( toks[9].pos.lineno, 5 );
       assert(    toks[9].quoted );

       // The token after the multi-line string sits on line 8.
       assert_eq( toks[10].pos.lineno, 8 );
       assert(   !toks[10].quoted );
   }
   340  
   unittest
   {
       auto original = lexerFromString(" a12\n3a 5 ");
       assert_eq( original.front.str, "a12" );
       original.popFront();

       // Fork the lexer; from here on the two must advance independently.
       auto forked = original.save;
       assert_eq( original.front.str, "3a" );

       original.popFront();
       assert_eq( forked.front.str, "3a" );
       assert_eq( original.front.str, "5" );

       original.popFront();
       forked.popFront();
       assert( original.empty );
       assert( !forked.empty );
       assert_eq( forked.front.str, "5" );
   }
   357  
   /// Forward range that reads a character sequence one character at a time,
   /// keeping track of position information and taking care of the
   /// \r\n -> \n conversion (a lone \r is reported as \n as well).
   private
   struct PositionedReader(CharSeq)
       if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
   {
       CharSeq buffer;    /// remaining input
       string  filename;  /// file name reported in positions
       int     lineno;    /// line of the character at the front
       int     column;    /// column of the character at the front

       /// Range primitive: true when no input remains.
       bool empty() /*@property*/
       {
           return buffer.empty;
       }

       /// Range primitive: the character at the front, with '\r' reported as '\n'.
       dchar front() /*@property*/
       {
           dchar c = buffer.front;
           return (c=='\r' ? '\n' : c);
       }

       /// Range primitive: consumes one character and updates the position.
       /// "\r\n" is consumed as a single newline.
       void popFront() /*@property*/
       {
           dchar c = buffer.front;
           buffer.popFront();
           if( c=='\r' )
           {
               // Swallow the '\n' of a "\r\n" pair so it counts as one line break.
               if( !buffer.empty && buffer.front=='\n' )
                   buffer.popFront();
               c = '\n';
           }
           if( c=='\n' )
           {
               lineno ++;
               column = 1;
           }
           else
               column ++;
       }

       /// Range primitive: an independent copy of this reader.
       ///
       /// The underlying buffer is saved too.  A plain struct copy would share
       /// the buffer whenever CharSeq is a reference-type forward range, so
       /// advancing the copy would also advance the original.
       typeof(this) save() /*@property*/
       {
           return typeof(this)(buffer.save, filename, lineno, column);
       }

       /// Get the current position
       immutable(LexPosition) currentPosition() const
       {
           return new immutable(LexPosition)(filename, lineno, column);
       }
   }
   415  
   unittest
   {
       // The reader over string must itself be a forward range of dchar.
       alias PositionedReader!string StringReader;

       assert( isForwardRange!(StringReader) );
       assert( is(ElementType!(StringReader) == dchar) );
   }
   420  }