Artifact Content
Not logged in

Artifact fb4085e84f38ed6aa249b86ab223d47783a538df


/**
 * Authors: k.inaba
 * License: NYSL 0.9982 http://www.kmonos.net/nysl/
 *
 * Lexer for Polemy programming language.
 */
module polemy.lex;
import polemy._common;

import std.file : readText;
import std.string : munch;
import std.ctype;

/// Represents a position in the source code

class LexPosition
{
	immutable string filename; /// name of the source file
	immutable int    lineno;   /// line number, 1, 2, ...
	immutable int    column;   /// column, 1, 2, ...

	/// Renders the position as "filename:lineno:column"
	override string toString() const
		{ return sprintf!"%s:%d:%d"(filename, lineno, column); }

	// NOTE(review): both mixins are defined outside this file (presumably in
	// polemy._common). The unittests in this module construct the class from
	// its three fields in declaration order and compare instances with
	// < and !=, so keep the field declarations as they are.
	mixin SimpleConstructor;
	mixin SimpleCompare;
}

unittest
{
	// Two positions in the same file and line, one column apart
	auto first  = new LexPosition("hello.cpp", 123, 45);
	auto second = new LexPosition("hello.cpp", 123, 46);

	// The fields hold the constructor arguments
	assert( first.filename == "hello.cpp" );
	assert( first.lineno == 123 );
	assert( first.column == 45 );
	assert( to!string(first) == "hello.cpp:123:45" );

	// Ordering and inequality
	assert( first < second );
	assert( first != second );

	// There is no default constructor and no field can be reassigned
	assert( !__traits(compiles, new LexPosition) );
	assert( !__traits(compiles, first.filename="foo") );
	assert( !__traits(compiles, first.lineno  =789) );
	assert( !__traits(compiles, first.column  =222) );
}

/// Represents a lexer token

class Token
{
	/// Classification assigned by the lexer
	enum Kind {identifier, stringLiteral, number};
	immutable LexPosition pos;  /// position where the token occurred in the source
	immutable string      str;  /// the token string itself
	immutable Kind        kind; /// which kind of token?

	// NOTE(review): both mixins are defined outside this file (presumably in
	// polemy._common). The unittests in this module construct a Token as
	// (pos, str, kind) and compare tokens with == and <, so keep the field
	// declarations as they are.
	mixin SimpleConstructor;
	mixin SimpleCompare;
}

unittest
{
	auto p = new immutable(LexPosition)("hello.cpp", 123, 45);
	auto t = new Token(p, "class", Token.Kind.identifier);

	// The fields hold the constructor arguments
	assert( t.pos == p );
	assert( t.str == "class" );
	assert( t.kind == Token.Kind.identifier );

	// Equality and ordering
	assert( t == new Token(p, "class", Token.Kind.identifier) );
	assert( t < new Token(p, "struct", Token.Kind.identifier) );

	// There is no default constructor and no field can be reassigned.
	// Each right-hand side has the field's own type, so a compile failure
	// can only come from immutability, never from a type mismatch.
	assert( !__traits(compiles, new Token) );
	assert( !__traits(compiles, t.pos=p) );
	assert( !__traits(compiles, t.str="struct") );
	assert( !__traits(compiles, t.kind=Token.Kind.number) );
}

/// Named Constructor for Lexer

/// Creates a Lexer over the whole text of the file `filename`.
/// Params:
///   filename = path of the file to read; also used as the file name in token positions
///   rest     = optional trailing arguments forwarded to lexerFromString (lineno, column)
Lexer lexerFromFile(T...)( string filename, T rest )
{
	// Use the name bound by `import std.file : readText;`. A selective import
	// does not bring the fully qualified name std.file.readText into scope.
	return lexerFromString( readText(filename), filename, rest );
}
	
/// Named Constructor for Lexer

Lexer lexerFromString( string str, string filename="<unnamed>", int lineno=1, int column=1 )
{
	// Token positions are counted starting from (lineno, column) in `filename`.
	auto lexer = new Lexer(str, filename, lineno, column);
	return lexer;
}

/// Lexer is a forward range of Tokens

class Lexer
{
	/// Range primitive: true when no token is left
	bool empty() /*@property*/
	{
		return current is null;
	}

	/// Range primitive: the current token.
	/// Throws (through enforce) when the lexer is already exhausted.
	Token front() /*@property*/
	{
		return std.exception.enforce(current, "Lexer has already reached the end");
	}

	/// Range primitive: advances to the next token.
	/// Throws (through enforce) when the lexer is already exhausted.
	void popFront() /*@property*/
	{
		std.exception.enforce(current, "Lexer has already reached the end");
		current = readNext();
	}

	/// Range primitive: returns a new Lexer carrying the same state
	Lexer save() /*@property*/
	{
		return new Lexer(buffer, filename, lineno, column, current);
	}

private: // implementation

	string buffer;   // input that has not been lexed yet (the text after `current`)
	string filename; // file name stored in token positions
	int    lineno;   // line number of buffer[0]
	int    column;   // column of buffer[0]
	Token  current;  // token returned by front; null once the input is exhausted

	invariant()
	{
		// skipws() runs after every token, so the buffer never starts with whitespace
		assert( buffer.empty || !std.ctype.isspace(buffer[0]) );
	}

	// When `current` is given (as save() does) it is reused; otherwise the
	// first token is read from `buffer`.
	this( string buffer, string filename, int lineno, int column, Token current=null )
	{
		this.buffer   = buffer;
		this.filename = filename;
		this.lineno   = lineno;
		this.column   = column;
		skipws();
		this.current  = (current is null ? readNext() : current);
	}

	// Drops leading whitespace from the buffer, keeping lineno/column in sync.
	// "\n", "\r\n" and a lone "\r" each count as one line break.
	void skipws()
	{
		bool progress = false;
		do
		{
			// '\v' and '\f' must be skipped as well: std.ctype.isspace accepts
			// them, and both the invariant and the word scanner in readNext()
			// rely on isspace. Leaving them in the buffer would break the
			// invariant and make readNext() return an empty token forever.
			string ws = buffer.munch(" \t\v\f");
			column += ws.length;
			progress = !ws.empty;
			while( !buffer.empty && (buffer[0]=='\r' || buffer[0]=='\n') )
			{
				progress = true;
				if( buffer[0] == '\n' )
					buffer = buffer[1..$];
				else // if( buffer.front == '\r' )
				{
					buffer = buffer[1..$];
					if( !buffer.empty && buffer[0]=='\n' )
						buffer = buffer[1..$];
				}
				lineno ++;
				column = 1;
			}
		}while( progress );
	}

	// Consumes and returns one character, advancing the column.
	// The caller must make sure the buffer is not empty.
	char readChar()
	{
		scope(exit) {
			buffer = buffer[1..$];
			column ++;
		}
		return buffer[0];
	}

	/// This is the main lexing routine
	///
	/// Returns the next token, or null when the input is exhausted.
	/// Whitespace following the token is skipped before returning.
	Token readNext()
	{
		if( buffer.empty )
			return null;
		scope(exit)
			skipws();

		if( isSymbol(buffer[0]) )
		{
			if( buffer[0] == '#' )
			{
				// skip comment: everything up to (not including) the line break
				while( !buffer.empty && (buffer[0]!='\n' && buffer[0]!='\r') )
					readChar();
				skipws();
				return readNext();
			}
			else if( buffer[0] == '"' )
			{
				// string literal; its position is that of the opening quote
				auto pos = currentPosition();
				string lit;
				readChar();
				while( !buffer.empty && buffer[0]!='"' )
				{
					// read one char
					char c = readChar();
					if( c == '\\' )
					{
						// only \\ and \" are escapes; any other backslash is kept as is
						if( !buffer.empty && (buffer[0]=='\\' || buffer[0]=='"') )
							lit ~= readChar();
						else
							lit ~= c;
					}
					else if( c == '\n' )
					{
						lit ~= c;
						lineno++;
						column = 1;
					}
					else if( c == '\r' )
					{
						// "\r\n" and a lone "\r" are both stored as "\n"
						if( !buffer.empty && buffer[0]=='\n' )
							readChar();
						lit ~= '\n';
						lineno++;
						column = 1;
					}
					else
						lit ~= c;
				}
				// consume the closing quote; a literal cut off by the end of
				// input is returned as it stands
				if( !buffer.empty )
					readChar();
				return new Token(pos, lit, Token.Kind.stringLiteral);
			}
			else
			{
				// normal symbol: always a single character
				auto pos = currentPosition();
				auto str = ""~readChar();
				return new Token(pos, str, Token.Kind.identifier);
			}
		}
		else
		{
			// word: the longest run of characters that are neither whitespace nor symbols
			auto pos = currentPosition();
			int i = 0;
			while( i<buffer.length && !std.ctype.isspace(buffer[i]) && !isSymbol(buffer[i]) )
				++i;
			auto str = buffer[0 .. i];
			buffer   = buffer[i .. $];
			column  += i;
			// a number is a word with no character outside '0'..'9'
			bool isNumber = find!(`a<'0' || '9'<a`)(str).empty;
			return new Token(pos, str, isNumber ? Token.Kind.number : Token.Kind.identifier);
		}
	}

	// True for characters in 0x21..0x7f that are neither alphanumeric nor '_'
	bool isSymbol(char c)
	{
		return (0x21<=c && c<=0x7f && !std.ctype.isalnum(c) && c!='_');
	}

	// Position of the next unread character
	immutable(LexPosition) currentPosition()
	{
		return new immutable(LexPosition)(filename, lineno, column);
	}
}

unittest
{
	// Lexer has to qualify as a forward range
	enum bool lexerIsForwardRange = std.range.isForwardRange!(Lexer);
	assert( lexerIsForwardRange );
}

unittest
{
	Token[] toks = std.array.array( lexerFromString("this	is a \t\n pen :-(   ") );

	// Checks the position and the text of the idx-th token
	void expect(size_t idx, int line, int col, string text)
	{
		assert( toks[idx].pos.lineno == line );
		assert( toks[idx].pos.column == col );
		assert( toks[idx].str == text );
	}

	// words on the first line, then one on the second
	expect(0, 1, 1, "this");
	expect(1, 1, 6, "is");
	expect(2, 1, 9, "a");
	expect(3, 2, 2, "pen");
	foreach( word; toks[0 .. 4] )
		assert( word.kind == Token.Kind.identifier );

	// consecutive symbols are always separated
	// hence, no "++" or "<<" or ...
	expect(4, 2, 6, ":");
	expect(5, 2, 7, "-");
	expect(6, 2, 8, "(");

	// trailing whitespace produces no further token
	assert( toks.length == 7 );
}

unittest
{
	auto lexer = lexerFromString(" a12\n3a 5 ");

	// a word starting with a letter is an identifier
	assert( lexer.front.str == "a12" );
	assert( lexer.front.kind == Token.Kind.identifier );
	lexer.popFront();

	// take a snapshot positioned at "3a"
	auto snapshot = lexer.save();
	assert( lexer.front.str == "3a" );
	assert( lexer.front.kind == Token.Kind.identifier );
	lexer.popFront();

	// advancing the original leaves the snapshot where it was
	assert( snapshot.front.str == "3a" );
	assert( snapshot.front.kind == Token.Kind.identifier );
	assert( lexer.front.str == "5" );
	assert( lexer.front.kind == Token.Kind.number );

	// the two lexers finish independently
	lexer.popFront();
	snapshot.popFront();
	assert( lexer.empty );
	assert( !snapshot.empty );
	assert( snapshot.front.str == "5" );
	assert( snapshot.front.kind == Token.Kind.number );
}

unittest
{
//!! this test reads polemy/lex.d, so run it from the root of the source directory
	auto src = find!`a.str == "module"`( lexerFromFile("polemy/lex.d") );

	// "module" keyword of the module declaration
	assert( src.front.str == "module", src.front.str );
	assert( src.front.pos.filename == "polemy/lex.d" );
	assert( src.front.pos.lineno == 7 );
	assert( src.front.pos.column == 1 );
	src.popFront();

	// the dotted module name is split into words and symbols
	assert( src.front.str == "polemy" );
	assert( src.front.pos.lineno == 7 );
	assert( src.front.pos.column == 8 );
	src.popFront();
	assert( src.front.str == "." );
	src.popFront();
	assert( src.front.str == "lex" );
	src.popFront();
	assert( src.front.str == ";" );
	src.popFront();

	// first import statement on the following line
	assert( src.front.str == "import" );
	assert( src.front.pos.lineno == 8 );
	assert( src.front.pos.column == 1 );
}

unittest
{
	Token[] toks = std.array.array( lexerFromString(`my # comment should
# hey!!
be ignored.
hahaha"hihihi""hu\\\"huhu"#123 aa
123 aa "aaa
bbb # 123
eee"
zzz
`) );

	// comments run to the end of the line and yield no token
	assert( toks[0].str == "my" );
	assert( toks[0].pos.lineno == 1 );
	assert( toks[1].str == "be" );
	assert( toks[1].pos.lineno == 3 );
	assert( toks[2].str == "ignored" );
	assert( toks[3].str == "." );

	// a word directly followed by two string literals
	assert( toks[4].str == "hahaha" );
	assert( toks[4].pos.lineno == 4 );
	assert( toks[4].kind == Token.Kind.identifier );
	assert( toks[5].str == "hihihi" );
	assert( toks[5].pos.lineno == 4 );
	assert( toks[5].kind == Token.Kind.stringLiteral );

	// \\ and \" are unescaped inside a literal
	assert( toks[6].str == `hu\"huhu` );
	assert( toks[6].kind == Token.Kind.stringLiteral );
	assert( toks[6].pos.lineno == 4 );

	// the trailing comment on line 4 is dropped
	assert( toks[7].str == "123" );
	assert( toks[7].pos.lineno == 5 );
	assert( toks[7].kind == Token.Kind.number );
	assert( toks[8].str == "aa" );

	// a literal may span lines; '#' inside it does not start a comment
	assert( toks[9].pos.lineno == 5 );
	assert( toks[9].str == "aaa\nbbb # 123\neee" );
	assert( toks[9].kind == Token.Kind.stringLiteral );

	// line counting continues correctly after the multi-line literal
	assert( toks[10].pos.lineno == 8 );
	assert( toks.length == 11 );
}