Artifact Content
Not logged in

Artifact b98d4e6a5ceb6fb9e105b14bbd4efd40df1ae099


     1  /**
     2   * Authors: k.inaba
     3   * License: NYSL 0.9982 http://www.kmonos.net/nysl/
     4   *
     5   * Lexer for Polemy programming language.
     6   */
     7  module polemy.lex;
     8  import polemy._common;
     9  import polemy.failure;
    10  import std.file  : readText;
    11  import std.ctype : isspace, isalnum;
    12  
    13  /// Represents a lexer token
    14  
    15  class Token
    16  {
    17  	immutable LexPosition pos;    /// Position where the token occurred in the source
    18  	immutable string      str;    /// The token string itself
    19  	immutable bool        quoted; /// Was it a "quoted" token or unquoted?
    20  
    21  	mixin SimpleClass;
    22  }
    23  
    24  unittest
    25  {
    26  	auto p = new LexPosition("hello.cpp", 123, 45);
    27  	auto t = new Token(p, "class", false);
    28  	auto u = new Token(p, "class", true);
    29  
    30  	assert_eq( t.pos, p );
    31  	assert_eq( t.str, "class" );
    32  	assert( !t.quoted );
    33  	assert_eq( t, new Token(p, "class", false) );
    34  	assert_lt( t, new Token(p, "struct", false) );
    35  	assert_ne( t, u );
    36  	assert( u.quoted );
    37  
    38  	assert( !__traits(compiles, new Token) );
    39  	assert( !__traits(compiles, t.pos=p) );
    40  	assert( !__traits(compiles, t.str=789) );
    41  	assert( !__traits(compiles, t.quoted=true) );
    42  }
    43  
    44  /// Named Construtors for Lexer
    45  
    46  Lexer lexerFromFile(T...)( string filename, T ln_cn )
    47  {
    48  	return lexerFromString( std.file.readText(filename), filename, ln_cn );
    49  }
    50  	
    51  /// Named Construtor for Lexer
    52  
    53  LexerT!(PositionedReader!CharSeq) /* ddoc doesn't recognize auto return... bugzilla:2581 */
    54  lexerFromString(CharSeq)( CharSeq str, string filename="<unnamed>", int lineno=1, int column=1 )
    55  {
    56   	return new LexerT!(PositionedReader!CharSeq)(
    57  		PositionedReader!CharSeq(str, filename, lineno, column)
    58  	);
    59  }
    60  
    61  /// Standard Lexer Type (all you have to know is that this is a forward range of Tokens!)
    62  
    63  alias LexerT!(PositionedReader!string) Lexer;
    64  
    65  /// Lexer Implementation
    66  
    67  class LexerT(Reader)
    68  	if( isForwardRange!(Reader) && is(ElementType!(Reader)==dchar) )
    69  {
    70  	/// Range primitive
    71  	bool empty() /*@property*/
    72  	{
    73  		return current is null;
    74  	}
    75  
    76  	/// Range primitive
    77  	Token front() /*@property*/
    78  	{
    79  		return std.exception.enforce(current, "Lexer has already reached the end");
    80  	}
    81  
    82  	/// Range primitive
    83  	void popFront() /*@property*/
    84  	{
    85  		std.exception.enforce(current, "Lexer has already reached the end");
    86  		current = readNext();
    87  	}
    88  
    89  	/// Range primitive
    90  	typeof(this) save() /*@property*/
    91  	{
    92  		return new typeof(this)(reader.save, current);
    93  	}
    94  
    95  private: // implementation
    96  
    97  	Reader reader;
    98  	Token  current;
    99  
   100  	invariant()
   101  	{
   102  		assert( reader.empty || !isSpace(reader.front) );
   103  	}
   104  
   105  	this( Reader reader, Token current = null )
   106  	{
   107  		this.reader = reader;
   108  		readWhile!isSpace();
   109  		this.current = (current is null ? readNext() : current);
   110  	}
   111  
   112  	public static
   113  	{
   114  		bool isSpace   (dchar c) { return std.ctype.isspace(c)!=0; }
   115  		bool isSymbol  (dchar c) { return 0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_' && c!='\''; }
   116  		bool isSSymbol (dchar c) { return "()[]{};,@".canFind(c); }
   117  		bool isMSymbol (dchar c) { return isSymbol(c) && !isSSymbol(c) && c!='"' && c!='#'; }
   118  		bool isLetter  (dchar c) { return !isSpace(c) && !isSymbol(c); }
   119  	}
   120  
   121  	string readQuoted(const LexPosition pos){char[] buf; return readQuoted(pos,buf);}
   122  	string readQuoted(const LexPosition pos, ref char[] buf)
   123  	{
   124  		if( reader.empty )
   125  			throw genex!UnexpectedEOF(pos, "Quoted string not terminated");
   126  		dchar c = reader.front;
   127  		reader.popFront;
   128  		if( c == '"' )
   129  			return assumeUnique(buf);
   130  		if( c == '\\' && !reader.empty ) {
   131  			if( reader.front=='"' ) {
   132  				reader.popFront;
   133  				return readQuoted(pos,buf ~= '\"');
   134  			}
   135  			if( reader.front=='\\' ) {
   136  				reader.popFront;
   137  				return readQuoted(pos,buf ~= '\\');
   138  			}
   139  		}
   140  		return readQuoted(pos,buf ~= c);
   141  	}
   142  
   143  	string readWhile(alias fn)()
   144  	{
   145  		char[] buf;
   146  		for(; !reader.empty && fn(reader.front); reader.popFront)
   147  			buf ~= reader.front;
   148  		return assumeUnique(buf);
   149  	}
   150  
   151  	Token readNext()
   152  	{
   153  		if( reader.empty )
   154  			return null;
   155  		scope(success)
   156  			readWhile!isSpace();
   157  		if( reader.front == '#' ) // comment
   158  		{
   159  			reader = find(reader, '\n');
   160  			readWhile!isSpace();
   161  			return readNext();
   162  		}
   163  		else if( reader.front == '"' ) // quoted
   164  		{
   165  			auto pos = reader.currentPosition();
   166  			reader.popFront;
   167  			return new Token(pos, readQuoted(pos), true);
   168  		}
   169  		else if( isSSymbol(reader.front) ) // paren
   170  		{
   171  			auto pos = reader.currentPosition();
   172  			string s; s~=reader.front; reader.popFront;
   173  			return new Token(pos, s, false);
   174  		}
   175  		else if( isMSymbol(reader.front) ) // symbol
   176  		{
   177  			auto pos = reader.currentPosition();
   178  			return new Token(pos, readWhile!isMSymbol(), false);
   179  		}
   180  		else
   181  		{
   182  			auto pos = reader.currentPosition();
   183  			return new Token(pos, readWhile!isLetter(), false);
   184  		}
   185  	}
   186  }
   187  
   188  unittest
   189  {
   190  	assert( std.range.isForwardRange!(Lexer) );
   191  	assert( is(ElementType!(Lexer) == Token) );
   192  }
   193  
   194  unittest
   195  {
   196  	auto lex = lexerFromString("this	is a \t\r\n pen :-( @@;  ");
   197  	Token[] ts = std.array.array(lex);
   198  
   199  	assert_eq( ts[0].pos.lineno, 1 );
   200  	assert_eq( ts[0].pos.column, 1 );
   201  	assert(   !ts[0].quoted );
   202  	assert_eq( ts[0].str, "this" );
   203  
   204  	assert_eq( ts[1].pos.lineno, 1 );
   205  	assert_eq( ts[1].pos.column, 6 );
   206  	assert(   !ts[1].quoted );
   207  	assert_eq( ts[1].str, "is" );
   208  
   209  	assert_eq( ts[2].pos.lineno, 1 );
   210  	assert_eq( ts[2].pos.column, 9 );
   211  	assert(   !ts[2].quoted );
   212  	assert_eq( ts[2].str, "a" );
   213  
   214  	assert_eq( ts[3].pos.lineno, 2 );
   215  	assert_eq( ts[3].pos.column, 2 );
   216  	assert(   !ts[3].quoted );
   217  	assert_eq( ts[3].str, "pen" );
   218  
   219  	assert_eq( ts[4].pos.lineno, 2 );
   220  	assert_eq( ts[4].pos.column, 6 );
   221  	assert_eq( ts[4].str, ":-" );
   222  
   223  	assert_eq( ts[5].pos.lineno, 2 );
   224  	assert_eq( ts[5].pos.column, 8 );
   225  	assert_eq( ts[5].str, "(" );
   226  	assert_eq( ts[6].str, "@" );
   227  	assert_eq( ts[7].str, "@" );
   228  	assert_eq( ts[8].str, ";" ); // paren and simicolons, atmarks are split
   229  
   230  	assert_eq( ts.length, 9 );
   231  }
   232  
   233  unittest
   234  {
   235  	// !! be sure to run the unittest on the root of the source directory
   236  	auto lexf = lexerFromFile("polemy/lex.d");	
   237  	lexf = find!`a.str == "module"`(lexf);
   238  	assert_eq( lexf.front.str, "module" );
   239  	assert_eq( lexf.front.pos.filename, "polemy/lex.d" );
   240  	assert_eq( lexf.front.pos.lineno, 7 );
   241  	assert_eq( lexf.front.pos.column, 1 );
   242  	lexf.popFront;
   243  	assert_eq( lexf.front.str, "polemy" );
   244  	assert_eq( lexf.front.pos.lineno, 7 );
   245  	assert_eq( lexf.front.pos.column, 8 );
   246  	lexf.popFront;
   247  	lexf.popFront;
   248  	lexf.popFront;
   249  	lexf.popFront;
   250  	assert_eq( lexf.front.str, "import" );
   251  	assert_eq( lexf.front.pos.lineno, 8 );
   252  	assert_eq( lexf.front.pos.column, 1 );
   253  }
   254  
   255  unittest
   256  {
   257  	assert_throw!UnexpectedEOF( lexerFromString(`"`) );
   258  }
   259  
   260  unittest
   261  {
   262  	auto lex = lexerFromString(`my # comment should`~"\r\n"~`# hey!!
   263  be ignored.
   264  hahaha"hihihi""hu\\\"huhu"#123 aa
   265  123 aa "aaa`~"\n"~`bbb # 123`~"\r\n"~`eee"
   266  zzz
   267  `);
   268  	Token[] ts = std.array.array(lex);
   269  	assert_eq( ts[0].str, "my" );
   270  	assert_eq( ts[0].pos.lineno, 1 );
   271  	assert(   !ts[0].quoted );
   272  	assert_eq( ts[1].str, "be" );
   273  	assert_eq( ts[1].pos.lineno, 3 );
   274  	assert(   !ts[1].quoted );
   275  	assert_eq( ts[2].str, "ignored" );
   276  	assert(   !ts[2].quoted );
   277  	assert_eq( ts[3].str, "." );
   278  	assert(   !ts[3].quoted );
   279  	assert_eq( ts[4].str, "hahaha" );
   280  	assert_eq( ts[4].pos.lineno, 4 );
   281  	assert(   !ts[4].quoted );
   282  	assert_eq( ts[5].str, "hihihi" );
   283  	assert_eq( ts[5].pos.lineno, 4 );
   284  	assert(    ts[5].quoted );
   285  	assert_eq( ts[6].str, `hu\"huhu` );
   286  	assert_eq( ts[6].pos.lineno, 4 );
   287  	assert(    ts[6].quoted );
   288  	assert_eq( ts[7].str, "123" );
   289  	assert_eq( ts[7].pos.lineno, 5 );
   290  	assert_eq( ts[8].str, "aa" );
   291  	assert_eq( ts[9].pos.lineno, 5 );
   292  	assert_eq( ts[9].str, "aaa\nbbb # 123\neee" );
   293  	assert(    ts[9].quoted );
   294  	assert_eq( ts[10].pos.lineno, 8 );
   295  	assert(   !ts[10].quoted );
   296  	assert_eq( ts.length, 11 );
   297  }
   298  
   299  unittest
   300  {
   301  	auto lex2 = lexerFromString(" a12\n3a 5 ");
   302  	assert_eq( lex2.front.str, "a12" );
   303  	lex2.popFront;
   304  	auto lex3 = lex2.save;
   305  	assert_eq( lex2.front.str, "3a" );
   306  	lex2.popFront;
   307  	assert_eq( lex3.front.str, "3a" );
   308  	assert_eq( lex2.front.str, "5" );
   309  	lex2.popFront;
   310  	lex3.popFront;
   311  	assert( lex2.empty );
   312  	assert( !lex3.empty );
   313  	assert_eq( lex3.front.str, "5" );
   314  }
   315  
   316  unittest
   317  {
   318  	auto lex = lexerFromString(`=""`);
   319  	assert_eq(lex.front.str, "="); lex.popFront;
   320  	assert_eq(lex.front.str, ""); lex.popFront;
   321  	assert( lex.empty );
   322  	assert_eq( lexerFromString(`-@`).front.str, "-" );
   323  }
   324  
   325  /// Forward range for reader character by character,
   326  /// keeping track of position information and caring \r\n -> \n conversion.
   327  
   328  struct PositionedReader(CharSeq)
   329  	if( isForwardRange!(CharSeq) && is(ElementType!(CharSeq)==dchar) )
   330  {
   331  	CharSeq buffer;
   332  	string  filename;
   333  	int     lineno;
   334  	int     column;
   335  
   336  	/// Range primitive
   337  	bool empty() /*@property*/
   338  	{
   339  		return buffer.empty;
   340  	}
   341  
   342  	/// Range primitive
   343  	dchar front() /*@property*/
   344  	{
   345  		dchar c = buffer.front;
   346  		return (c=='\r' ? '\n' : c);
   347  	}
   348  
   349  	/// Range primitive
   350  	void popFront() /*@property*/
   351  	{
   352  		dchar c = buffer.front;
   353  		buffer.popFront;
   354  		if( c=='\r' )
   355  		{
   356  			if( !buffer.empty && buffer.front=='\n' )
   357  				buffer.popFront;
   358  			c = '\n';
   359  		}
   360  		if( c=='\n' )
   361  		{
   362  			lineno ++;	
   363  			column = 1;
   364  		}
   365  		else
   366  			column ++;
   367  	}
   368  
   369  	/// Range primitive
   370  	typeof(this) save() /*@property*/
   371  	{
   372  		return this;
   373  	}
   374  
   375  	/// Get the current position
   376  	LexPosition currentPosition() const
   377  	{
   378  		return new LexPosition(filename, lineno, column);
   379  	}
   380  }
   381  
   382  unittest
   383  {
   384  	assert( isForwardRange!(PositionedReader!string) );
   385  	assert( is(ElementType!(PositionedReader!string) == dchar) );
   386  	{
   387  		auto pr = PositionedReader!string("abc","",1,1);
   388  		assert_eq(pr.currentPosition().column, 1); pr.popFront;
   389  		assert_eq(pr.currentPosition().column, 2); pr.popFront;
   390  		assert_eq(pr.currentPosition().column, 3); pr.popFront;
   391  	}
   392  	{
   393  		auto pr = PositionedReader!string("\n\r\n\n","",1,1);
   394  		assert_eq(pr.currentPosition().lineno, 1); pr.popFront;
   395  		assert_eq(pr.currentPosition().lineno, 2); pr.popFront;
   396  		assert_eq(pr.currentPosition().lineno, 3); pr.popFront;
   397  	}
   398  }