from typing import Optional

from ppp_tokens import (
    EofToken,
    IdentifierToken,
    Keyword,
    KeywordToken,
    NumberToken,
    StringToken,
    Symbol,
    SymbolToken,
    Token,
    TokenContents,
    Location,
)


class Lexer:
    """Hand-written lexer that turns source text into ``Token`` objects.

    Tokens are produced on demand by :meth:`next_token`. A single token of
    lookahead is kept in ``_peeked_token`` and exposed through
    :meth:`peek_token` and the ``check_*`` / ``take_*`` / ``assert_*`` helpers.

    Positions are tracked as a 1-based line and a 0-based column and are
    attached to every token as a ``Location``.
    """

    # Two-character operators. They are tried before single-character symbols
    # so that e.g. ``==`` is not lexed as two ``=`` tokens.
    _TWO_CHAR_SYMBOLS = {
        '||': Symbol.Dpipe,
        '&&': Symbol.Dampersand,
        '**': Symbol.Dasterisk,
        '->': Symbol.Arrow,
        '>=': Symbol.GreaterEqual,
        '<=': Symbol.LesserEqual,
        '==': Symbol.Dequal,
        '=>': Symbol.EqualArrow,
        '!=': Symbol.NotEqual,
    }

    def __init__(self, source: str, filename: str) -> None:
        """Create a lexer over ``source``; ``filename`` is used only for locations."""
        self._source = source
        self._location = 0  # index of the next unread character in _source
        self._line = 1
        self._col = 0
        self._filename = filename
        self._peeked_token: Optional[Token] = None

    def _loc(self) -> Location:
        """Return the ``Location`` of the next unread character."""
        return Location(self._filename, self._line, self._col)

    def _token(self, loc: Location, value: str, contents: TokenContents) -> Token:
        """Build a ``Token`` from its location, source text and contents."""
        return Token(loc, value, contents)

    @classmethod
    def from_file(cls, path: str) -> 'Lexer':
        """Alternate constructor: read ``path`` and lex its contents."""
        with open(path) as f:
            return cls(f.read(), path)

    def _advance(self) -> None:
        """Consume one character, keeping the line/column counters in sync."""
        assert self._location < len(self._source)
        if self._source[self._location] == '\n':
            self._line += 1
            self._col = 0
        else:
            self._col += 1
        self._location += 1

    def _skip_whitespace(self) -> None:
        """Consume any run of spaces, tabs and newlines."""
        while self._location < len(self._source) and self._source[self._location] in ' \t\n':
            self._advance()

    def _lex_number(self) -> Token:
        """Lex a run of decimal digits into a ``NumberToken``."""
        loc = self._loc()
        start = self._location
        # isdecimal() (not isdigit()) so that every accepted character is also
        # accepted by int(); isdigit() admits characters such as superscripts
        # that make int() raise ValueError.
        while self._location < len(self._source) and self._source[self._location].isdecimal():
            self._advance()
        text = self._source[start:self._location]
        return self._token(loc, text, NumberToken(int(text)))

    def _lex_word(self) -> Token:
        """Lex a word and classify it as keyword, word-symbol or identifier.

        NOTE: only alphabetic characters and ``_`` are consumed, so digits are
        never part of a word.
        """
        loc = self._loc()
        start = self._location
        while self._location < len(self._source) and (
            self._source[self._location].isalpha() or self._source[self._location] == '_'
        ):
            self._advance()
        word = self._source[start:self._location]

        # Keywords take precedence over symbols, which take precedence over
        # plain identifiers.
        try:
            keyword = Keyword(word)
        except ValueError:
            pass
        else:
            return self._token(loc, word, KeywordToken(keyword))

        try:
            symbol = Symbol(word)
        except ValueError:
            return self._token(loc, word, IdentifierToken(word))
        return self._token(loc, word, SymbolToken(symbol))

    def _lex_string(self) -> Token:
        """Lex a double-quoted string literal into a ``StringToken``.

        The token's text includes both quotes; its contents hold the string
        with backslash escapes decoded.

        Raises:
            SyntaxError: if the literal is not closed before end of input, or
                contains a malformed escape sequence.
        """
        # TODO: Proper escaping
        loc = self._loc()
        self._advance()  # opening quote
        start = self._location
        escaping = False
        while self._location < len(self._source) and (
            self._source[self._location] != '"' or escaping
        ):
            # A backslash escapes exactly the next character, so `\\"` ends
            # the string while `\"` does not.
            escaping = not escaping and self._source[self._location] == '\\'
            self._advance()

        if self._location >= len(self._source):
            raise SyntaxError(f"{loc}: Unterminated string literal")

        raw = self._source[start:self._location]
        try:
            # unicode_escape interprets its input bytes as Latin-1, so going
            # through UTF-8 would mangle every non-ASCII character. Latin-1
            # with backslashreplace round-trips them instead: code points up
            # to U+00FF map to a single byte, anything else becomes a \u/\U
            # escape that the decoder turns back into the same character.
            # NOTE: a backslash placed directly before a character above
            # U+00FF is not a valid escape and is not handled specially.
            string = raw.encode('latin-1', 'backslashreplace').decode('unicode_escape')
        except UnicodeDecodeError as err:
            raise SyntaxError(
                f"{loc}: Invalid escape sequence in string literal: {err.reason}"
            ) from err

        self._advance()  # closing quote
        return self._token(loc, self._source[start - 1:self._location], StringToken(string))

    def next_token(self) -> Token:
        """Consume and return the next token.

        A token previously buffered by :meth:`peek_token` is returned first.
        At end of input an ``EofToken`` is returned (repeatedly, if called
        again).

        Raises:
            SyntaxError: on a character that starts no known token, or on a
                malformed string literal.
        """
        if self._peeked_token is not None:
            token, self._peeked_token = self._peeked_token, None
            return token

        self._skip_whitespace()
        if self._location >= len(self._source):
            return self._token(self._loc(), '\0', EofToken())

        c = self._source[self._location]
        if c.isdecimal():
            return self._lex_number()
        if c.isalpha() or c == '_':
            return self._lex_word()
        if c == '"':
            return self._lex_string()

        loc = self._loc()
        # At the last character this slice has length 1 and simply misses.
        pair = self._source[self._location:self._location + 2]
        if pair in self._TWO_CHAR_SYMBOLS:
            self._advance()
            self._advance()
            return self._token(loc, pair, SymbolToken(self._TWO_CHAR_SYMBOLS[pair]))

        if c in Symbol._value2member_map_:
            self._advance()
            return self._token(loc, c, SymbolToken(Symbol(c)))

        raise SyntaxError(f"{loc}: Unknown character: '{c}'")

    def peek_token(self) -> Token:
        """Return the next token without consuming it."""
        if self._peeked_token is None:
            self._peeked_token = self.next_token()
        return self._peeked_token

    def assert_tokenkind(self, kind: type) -> Token:
        """Consume the next token and require its contents to be a ``kind``.

        Raises:
            SyntaxError: if the token's contents are not an instance of ``kind``.
        """
        token = self.next_token()
        if not isinstance(token.contents, kind):
            raise SyntaxError(f"{token.loc}: Expected {kind} but got {token.contents}!")
        return token

    def assert_token(self, expected: TokenContents) -> Token:
        """Consume the next token and require its contents to equal ``expected``.

        Raises:
            SyntaxError: if the token's contents differ from ``expected``.
        """
        token = self.next_token()
        if token.contents != expected:
            raise SyntaxError(f"{token.loc}: Expected {expected} but got {token.contents}!")
        return token

    def check_token(self, expected: TokenContents) -> bool:
        """Return whether the next token's contents equal ``expected`` (no consume)."""
        return self.peek_token().contents == expected

    def check_tokens(self, *expected: TokenContents) -> bool:
        """Return whether the next token matches any of ``expected`` (no consume)."""
        return any(self.check_token(candidate) for candidate in expected)

    def check_tokenkind(self, kind: type) -> bool:
        """Return whether the next token's contents are a ``kind`` (no consume)."""
        return isinstance(self.peek_token().contents, kind)

    def take_tokenkind(self, kind: type) -> Optional[Token]:
        """Consume and return the next token if its contents are a ``kind``, else ``None``."""
        if self.check_tokenkind(kind):
            return self.next_token()
        return None

    def take_token(self, token: TokenContents) -> Optional[Token]:
        """Consume and return the next token if its contents equal ``token``, else ``None``."""
        if self.check_token(token):
            return self.next_token()
        return None

    def take_tokens(self, *tokens: TokenContents) -> Optional[Token]:
        """Consume and return the next token if it matches any of ``tokens``, else ``None``."""
        if self.check_tokens(*tokens):
            return self.next_token()
        return None