Artifact Content
Not logged in

Artifact 5f52873e3ff7ae3098df334f269bfdc6f927e225


     1  /**
     2   * Authors: k.inaba
     3   * License: NYSL 0.9982 http://www.kmonos.net/nysl/
     4   *
     5   * Lexer for Polemy programming language.
     6   */
     7  module polemy.lex;
     8  import polemy._common;
     9  import std.file  : readText;
    10  import std.ctype : isspace, isalnum;
    11  
    12  /// Exception from this module
    13  
    14  class LexException : Exception
    15  {
    16  	this( const LexPosition pos, string msg )
    17  		{ super(sprintf!"%s [%s]"(msg, pos)); this.pos = pos; }
    18  	const LexPosition pos;
    19  };
    20  
    21  /// Represents a position in a source code
    22  
    23  class LexPosition
    24  {
    25  	immutable string filename; /// name of the source file
    26  	immutable int    lineno;   /// line number, 1, 2, ...
    27  	immutable int    column;   /// column, 1, 2, ...
    28  
    29  	override string toString() const
    30  		{ return sprintf!"%s:%d:%d"(filename, lineno, column); }
    31  
    32  	mixin SimpleConstructor;
    33  	mixin SimpleCompare;
    34  }
    35  
    36  unittest
    37  {
    38  	auto p = new LexPosition("hello.cpp", 123, 45);
    39  	auto q = new LexPosition("hello.cpp", 123, 46);
    40  
    41  	assert_eq( p.filename, "hello.cpp" );
    42  	assert_eq( p.lineno, 123 );
    43  	assert_eq( p.column, 45 );
    44  	assert_eq( to!string(p), "hello.cpp:123:45" );
    45  	assert_lt( p, q );
    46  	assert_ne( p, q );
    47  
    48  	assert( !__traits(compiles, new LexPosition) );
    49  	assert( !__traits(compiles, p.filename="foo") );
    50  	assert( !__traits(compiles, p.lineno  =789) );
    51  	assert( !__traits(compiles, p.column  =222) );
    52  }
    53  
/// Represents a lexer token: where it occurred, its text, and whether it was quoted.
/// NOTE(review): construction and comparison come from the SimpleConstructor and
/// SimpleCompare mixins, which are defined outside this file; the unit tests construct
/// tokens as Token(pos, str, quoted), so the field order below presumably matters — confirm
/// before reordering.

class Token
{
	immutable LexPosition pos;    /// Position where the token occurred in the source
	immutable string      str;    /// The token string itself
	immutable bool        quoted; /// Was it a "quoted" token or unquoted?

	mixin SimpleConstructor;
	mixin SimpleCompare;
}
    65  
    66  unittest
    67  {
    68  	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
    69  	auto t = new Token(p, "class", false);
    70  	auto u = new Token(p, "class", true);
    71  
    72  	assert_eq( t.pos, p );
    73  	assert_eq( t.str, "class" );
    74  	assert( !t.quoted );
    75  	assert_eq( t, new Token(p, "class", false) );
    76  	assert_lt( t, new Token(p, "struct", false) );
    77  	assert_ne( t, u );
    78  	assert( u.quoted );
    79  
    80  	assert( !__traits(compiles, new Token) );
    81  	assert( !__traits(compiles, t.pos=p) );
    82  	assert( !__traits(compiles, t.str=789) );
    83  	assert( !__traits(compiles, t.quoted=true) );
    84  }
    85  
    86  /// Named Construtor for Lexer
    87  
    88  auto lexerFromFile(T...)( string filename, T rest )
    89  {
    90  	return lexerFromString( std.file.readText(filename), filename, rest );
    91  }
    92  	
    93  /// Named Construtor for Lexer
    94  
    95  auto lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
    96  {
    97   	return new LexerT!(PositionedReader!CharSeq)(
    98  		PositionedReader!CharSeq(str, filename, lineno, column)
    99  	);
   100  }
   101  
/// Standard Lexer type: LexerT instantiated over a string-backed PositionedReader.
/// All users have to know is that this is a forward range of Tokens.

alias LexerT!(PositionedReader!string) Lexer;
   105  
   106  /// Lexer Implementation
   107  
   108  class LexerT(Reader)
   109  	if( isForwardRange!(Reader) && is(ElementType!(Reader) == dchar) )
   110  {
   111  	/// Range primitive
   112  	bool empty() /*@property*/
   113  	{
   114  		return current is null;
   115  	}
   116  
   117  	/// Range primitive
   118  	Token front() /*@property*/
   119  	{
   120  		return std.exception.enforce(current, "Lexer has already reached the end");
   121  	}
   122  
   123  	/// Range primitive
   124  	void popFront() /*@property*/
   125  	{
   126  		std.exception.enforce(current, "Lexer has already reached the end");
   127  		current = readNext();
   128  	}
   129  
   130  	/// Range primitive
   131  	typeof(this) save() /*@property*/
   132  	{
   133  		return new typeof(this)(reader.save, current);
   134  	}
   135  
   136  private: // implementation
   137  
   138  	Reader reader;
   139  	Token  current;
   140  
   141  	invariant()
   142  	{
   143  		assert( reader.empty || !std.ctype.isspace(reader.front) );
   144  	}
   145  
   146  	this( Reader reader, Token current = null )
   147  	{
   148  		this.reader = reader;
   149  		readWhile!isSpace();
   150  		this.current = (current is null ? readNext() : current);
   151  	}
   152  
   153  	public static {
   154  		bool isSpace   (dchar c) { return std.ctype.isspace(c)!=0; }
   155  		bool isSymbol  (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
   156  		bool isSSymbol (dchar c) { return !find("()[]{};", c).empty; }
   157  		bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c); }
   158  		bool isLetter  (dchar c) { return !isSpace(c) && !isSymbol(c); }
   159  	}
   160  
   161  	string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}
   162  	string readQuoted(const LexPosition pos, ref char[] buf)
   163  	{
   164  		if( reader.empty )
   165  			throw new LexException(pos, "EOF found while lexing a quoted-string");
   166  		dchar c = reader.front;
   167  		reader.popFront;
   168  		if( c == '"' )
   169  			return assumeUnique(buf);
   170  		if( c == '\\' && !reader.empty ) {
   171  			if( reader.front=='"' ) {
   172  				reader.popFront;
   173  				return readQuoted(pos,buf ~= '\"');
   174  			}
   175  			if( reader.front=='\\' ) {
   176  				reader.popFront;
   177  				return readQuoted(pos,buf ~= '\\');
   178  			}
   179  		}
   180  		return readQuoted(pos,buf ~= c);
   181  	}
   182  
   183  	string readWhile(alias fn)()
   184  	{
   185  		char[] buf;
   186  		for(; !reader.empty && fn(reader.front); reader.popFront)
   187  			buf ~= reader.front;
   188  		return assumeUnique(buf);
   189  	}
   190  
   191  	Token readNext()
   192  	{
   193  		if( reader.empty )
   194  			return null;
   195  		scope(success)
   196  			readWhile!isSpace();
   197  		if( reader.front == '#' ) // comment
   198  		{
   199  			reader = find(reader, '\n');
   200  			readWhile!isSpace();
   201  			return readNext();
   202  		}
   203  		else if( reader.front == '"' ) // quoted
   204  		{
   205  			auto pos = reader.currentPosition();
   206  			reader.popFront;
   207  			return new Token(pos, readQuoted(pos), true);
   208  		}
   209  		else if( isSSymbol(reader.front) ) // paren
   210  		{
   211  			auto pos = reader.currentPosition();
   212  			string s; s~=reader.front; reader.popFront;
   213  			return new Token(pos, s, false);
   214  		}
   215  		else if( isMSymbol(reader.front) ) // symbol
   216  		{
   217  			auto pos = reader.currentPosition();
   218  			return new Token(pos, readWhile!isMSymbol(), false);
   219  		}
   220  		else
   221  		{
   222  			auto pos = reader.currentPosition();
   223  			return new Token(pos, readWhile!isLetter(), false);
   224  		}
   225  	}
   226  }
   227  
   228  unittest
   229  {
   230  	assert( std.range.isForwardRange!(Lexer) );
   231  }
   232  
   233  unittest
   234  {
   235  	auto lex = lexerFromString("this	is a \t\r\n pen :-( @@;  ");
   236  	Token[] ts = std.array.array(lex);
   237  
   238  	assert_eq( ts[0].pos.lineno, 1 );
   239  	assert_eq( ts[0].pos.column, 1 );
   240  	assert(   !ts[0].quoted );
   241  	assert_eq( ts[0].str, "this" );
   242  
   243  	assert_eq( ts[1].pos.lineno, 1 );
   244  	assert_eq( ts[1].pos.column, 6 );
   245  	assert(   !ts[1].quoted );
   246  	assert_eq( ts[1].str, "is" );
   247  
   248  	assert_eq( ts[2].pos.lineno, 1 );
   249  	assert_eq( ts[2].pos.column, 9 );
   250  	assert(   !ts[2].quoted );
   251  	assert_eq( ts[2].str, "a" );
   252  
   253  	assert_eq( ts[3].pos.lineno, 2 );
   254  	assert_eq( ts[3].pos.column, 2 );
   255  	assert(   !ts[3].quoted );
   256  	assert_eq( ts[3].str, "pen" );
   257  
   258  	assert_eq( ts[4].pos.lineno, 2 );
   259  	assert_eq( ts[4].pos.column, 6 );
   260  	assert_eq( ts[4].str, ":-" );
   261  
   262  	assert_eq( ts[5].pos.lineno, 2 );
   263  	assert_eq( ts[5].pos.column, 8 );
   264  	assert_eq( ts[5].str, "(" );
   265  	assert_eq( ts[6].str, "@@" );
   266  	assert_eq( ts[7].str, ";" ); // paren and simicolons are split
   267  
   268  	assert_eq( ts.length, 8 );
   269  }
   270  
   271  unittest
   272  {
   273  	// !! be sure to run the unittest on the root of the source directory
   274  	auto lexf = lexerFromFile("polemy/lex.d");	
   275  	lexf = find!`a.str == "module"`(lexf);
   276  	assert_eq( lexf.front.str, "module" );
   277  	assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
   278  	assert_eq( lexf.front.pos.lineno, 7 );
   279  	assert_eq( lexf.front.pos.column, 1 );
   280  	lexf.popFront;
   281  	assert_eq( lexf.front.str, "polemy" );
   282  	assert_eq( lexf.front.pos.lineno, 7 );
   283  	assert_eq( lexf.front.pos.column, 8 );
   284  	lexf.popFront;
   285  	lexf.popFront;
   286  	lexf.popFront;
   287  	lexf.popFront;
   288  	assert_eq( lexf.front.str, "import" );
   289  	assert_eq( lexf.front.pos.lineno, 8 );
   290  	assert_eq( lexf.front.pos.column, 1 );
   291  }
   292  
   293  unittest
   294  {
   295  	assert_throw!LexException( lexerFromString(`"`) );
   296  }
   297  
   298  unittest
   299  {
   300  	auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
   301  be ignored.
   302  hahaha"hihihi""hu\\\"huhu"#123 aa
   303  123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
   304  zzz
   305  `);
   306  	Token[] ts = std.array.array(lex);
   307  	assert_eq( ts[0].str, "my" );
   308  	assert_eq( ts[0].pos.lineno, 1 );
   309  	assert(   !ts[0].quoted );
   310  	assert_eq( ts[1].str, "be" );
   311  	assert_eq( ts[1].pos.lineno, 3 );
   312  	assert(   !ts[1].quoted );
   313  	assert_eq( ts[2].str, "ignored" );
   314  	assert(   !ts[2].quoted );
   315  	assert_eq( ts[3].str, "." );
   316  	assert(   !ts[3].quoted );
   317  	assert_eq( ts[4].str, "hahaha" );
   318  	assert_eq( ts[4].pos.lineno, 4 );
   319  	assert(   !ts[4].quoted );
   320  	assert_eq( ts[5].str, "hihihi" );
   321  	assert_eq( ts[5].pos.lineno, 4 );
   322  	assert(    ts[5].quoted );
   323  	assert_eq( ts[6].str, `hu\"huhu` );
   324  	assert_eq( ts[6].pos.lineno, 4 );
   325  	assert(    ts[6].quoted );
   326  	assert_eq( ts[7].str, "123" );
   327  	assert_eq( ts[7].pos.lineno, 5 );
   328  	assert_eq( ts[8].str, "aa" );
   329  	assert_eq( ts[9].pos.lineno, 5 );
   330  	assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
   331  	assert(    ts[9].quoted );
   332  	assert_eq( ts[10].pos.lineno, 8 );
   333  	assert(   !ts[10].quoted );
   334  	assert_eq( ts.length, 11 );
   335  }
   336  
   337  unittest
   338  {
   339  	auto lex2 = lexerFromString(" a12\n3a 5 ");
   340  	assert_eq( lex2.front.str, "a12" );
   341  	lex2.popFront;
   342  	auto lex3 = lex2.save;
   343  	assert_eq( lex2.front.str, "3a" );
   344  	lex2.popFront;
   345  	assert_eq( lex3.front.str, "3a" );
   346  	assert_eq( lex2.front.str, "5" );
   347  	lex2.popFront;
   348  	lex3.popFront;
   349  	assert( lex2.empty );
   350  	assert( !lex3.empty );
   351  	assert_eq( lex3.front.str, "5" );
   352  }
   353  
   354  /// Forward range for reader character by character,
   355  /// keeping track of position information and caring \r\n -> \n conversion.
   356  
   357  private
   358  struct PositionedReader(CharSeq)
   359  	if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq) == dchar) )
   360  {
   361  	CharSeq buffer;
   362  	string  filename;
   363  	int     lineno;
   364  	int     column;
   365  
   366  	/// Range primitive
   367  	bool empty() /*@property*/
   368  	{
   369  		return buffer.empty;
   370  	}
   371  
   372  	/// Range primitive
   373  	dchar front() /*@property*/
   374  	{
   375  		dchar c = buffer.front;
   376  		return (c=='\r' ? '\n' : c);
   377  	}
   378  
   379  	/// Range primitive
   380  	void popFront() /*@property*/
   381  	{
   382  		dchar c = buffer.front;
   383  		buffer.popFront;
   384  		if( c=='\r' )
   385  		{
   386  			if( !buffer.empty && buffer.front=='\n' )
   387  				buffer.popFront;
   388  			c = '\n';
   389  		}
   390  		if( c=='\n' )
   391  		{
   392  			lineno ++;
   393  			column = 1;
   394  		}
   395  		else
   396  			column ++;
   397  	}
   398  
   399  	/// Range primitive
   400  	typeof(this) save() /*@property*/
   401  	{
   402  		return this;
   403  	}
   404  
   405  	/// Get the current position
   406  	immutable(LexPosition) currentPosition() const
   407  	{
   408  		return new immutable(LexPosition)(filename, lineno, column);
   409  	}
   410  }
   411  
   412  unittest
   413  {
   414  	assert( isForwardRange!(PositionedReader!string) );
   415  	assert( is(ElementType!(PositionedReader!string) == dchar) );
   416  }