Fix token location reporting

This commit is contained in:
germax26 2024-10-01 11:28:10 +10:00
parent b02ca87760
commit 18b22cd5d1
Signed by: germax26
SSH Key Fingerprint: SHA256:N3w+8798IMWBt7SYH8G1C0iJlIa2HIIcRCXwILT5FvM
2 changed files with 69 additions and 47 deletions

View File

@@ -1,31 +1,32 @@
from typing import Optional from typing import Optional
from ppp_tokens import EofToken, IdentifierToken, Keyword, KeywordToken, NumberToken, StringToken, Symbol, SymbolToken, Token, TokenContents from ppp_tokens import EofToken, IdentifierToken, Keyword, KeywordToken, NumberToken, StringToken, Symbol, SymbolToken, Token, TokenContents, Location
class Lexer: class Lexer:
def __init__(self, source: str) -> None: def __init__(self, source: str, filename: str) -> None:
self._source = source self._source = source
self._location = 0 self._location = 0
self._line = 1 self._line = 1
self._col = 0 self._col = 0
self._filename = filename
self._peeked_token: Optional[Token] = None self._peeked_token: Optional[Token] = None
self._current: str = ""
def _loc(self) -> Location:
return Location(self._filename, self._line, self._col)
def _token(self, loc: Location, value: str, contents: TokenContents) -> Token:
return Token(loc, value, contents)
@classmethod @classmethod
def from_file(cls, path: str) -> 'Lexer': def from_file(cls, path: str) -> 'Lexer':
with open(path) as f: with open(path) as f:
return cls(f.read()) return cls(f.read(), path)
def _advance(self) -> str: def _advance(self):
assert self._location < len(self._source) assert self._location < len(self._source)
self._line, self._col = (self._line + 1, 0) if self._current == '\n' else (self._line, self._col + 1) self._line, self._col = (self._line + 1, 0) if self._source[self._location] == '\n' else (self._line, self._col + 1)
self._location += 1 self._location += 1
self._current = self._source[self._location] if self._location < len(self._source) else ''
return self._current
# def _peek(self) -> str:
# assert self._location < len(self._source)-1
def next_token(self) -> Token: def next_token(self) -> Token:
if self._peeked_token is not None: if self._peeked_token is not None:
@@ -34,71 +35,84 @@ class Lexer:
while self._location < len(self._source) and self._source[self._location] in ' \t\n': self._advance() while self._location < len(self._source) and self._source[self._location] in ' \t\n': self._advance()
if self._location >= len(self._source): return Token(self._line, self._col, '\0', EofToken()) if self._location >= len(self._source): return self._token(self._loc(), '\0', EofToken())
match self._source[self._location]: match self._source[self._location]:
case c if c.isdigit(): case c if c.isdigit():
start_location = self._location start_location = self._location
while self._location < len(self._source) and self._source[self._location].isdigit(): self._location += 1 loc = self._loc()
while self._location < len(self._source) and self._source[self._location].isdigit(): self._advance()
number = int(self._source[start_location:self._location]) number = int(self._source[start_location:self._location])
return Token(self._line, self._col, self._source[start_location:self._location], NumberToken(number)) return self._token(loc, self._source[start_location:self._location], NumberToken(number))
case c if c.isalpha() or c == "_": case c if c.isalpha() or c == "_":
start_location = self._location start_location = self._location
while self._location < len(self._source) and (self._source[self._location].isalpha() or self._source[self._location] in '_'): self._location += 1 loc = self._loc()
while self._location < len(self._source) and (self._source[self._location].isalpha() or self._source[self._location] in '_'): self._advance()
word = self._source[start_location:self._location] word = self._source[start_location:self._location]
try: try:
keyword = Keyword(word) keyword = Keyword(word)
return Token(self._line, self._col, word, KeywordToken(keyword)) return self._token(loc, word, KeywordToken(keyword))
except ValueError: except ValueError:
try: try:
symbol = Symbol(word) symbol = Symbol(word)
return Token(self._line, self._col, word, SymbolToken(symbol)) return self._token(loc, word, SymbolToken(symbol))
except ValueError: except ValueError:
return Token(self._line, self._col, word, IdentifierToken(word)) return self._token(loc, word, IdentifierToken(word))
case '"': case '"':
# TODO: Escaping # TODO: Proper escaping
self._location += 1 self._advance()
start_location = self._location start_location = self._location
loc = self._loc()
escaping = False escaping = False
while self._location < len(self._source) and (self._source[self._location] != '"' or escaping): while self._location < len(self._source) and (self._source[self._location] != '"' or escaping):
escaping = self._source[self._location] == '\\' if not escaping else False escaping = self._source[self._location] == '\\' if not escaping else False
self._location += 1 self._advance()
string = self._source[start_location:self._location].encode('utf-8').decode('unicode_escape') string = self._source[start_location:self._location].encode('utf-8').decode('unicode_escape')
self._location += 1 self._advance()
return Token(self._line, self._col, self._source[start_location-1:self._location], StringToken(string)) return self._token(loc, self._source[start_location-1:self._location], StringToken(string))
# TODO: Make a proper Trie for this. # TODO: Make a proper Trie for this.
case '|' if self._location < len(self._source)-1 and self._source[self._location+1] == '|': case '|' if self._location < len(self._source)-1 and self._source[self._location+1] == '|':
self._location += 2 loc = self._loc()
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.Dpipe)) self._advance(); self._advance()
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.Dpipe))
case '&' if self._location < len(self._source)-1 and self._source[self._location+1] == '&': case '&' if self._location < len(self._source)-1 and self._source[self._location+1] == '&':
self._location += 2 loc = self._loc()
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.Dampersand)) self._advance(); self._advance()
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.Dampersand))
case '*' if self._location < len(self._source)-1 and self._source[self._location+1] == '*': case '*' if self._location < len(self._source)-1 and self._source[self._location+1] == '*':
self._location += 2 loc = self._loc()
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.Dasterisk)) self._advance(); self._advance()
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.Dasterisk))
case '-' if self._location < len(self._source)-1 and self._source[self._location+1] == '>': case '-' if self._location < len(self._source)-1 and self._source[self._location+1] == '>':
self._location += 2 loc = self._loc()
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.Arrow)) self._advance(); self._advance()
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.Arrow))
case '>' if self._location < len(self._source)-1 and self._source[self._location+1] == '=': case '>' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
self._location += 2 loc = self._loc()
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.GreaterEqual)) self._advance(); self._advance()
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.GreaterEqual))
case '<' if self._location < len(self._source)-1 and self._source[self._location+1] == '=': case '<' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
self._location += 2 loc = self._loc()
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.LesserEqual)) self._advance(); self._advance()
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.LesserEqual))
case '=' if self._location < len(self._source)-1 and self._source[self._location+1] == '=': case '=' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
self._location += 2 loc = self._loc()
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.Dequal)) self._advance(); self._advance()
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.Dequal))
case '=' if self._location < len(self._source)-1 and self._source[self._location+1] == '>': case '=' if self._location < len(self._source)-1 and self._source[self._location+1] == '>':
self._location += 2 loc = self._loc()
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.EqualArrow)) self._advance(); self._advance()
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.EqualArrow))
case '!' if self._location < len(self._source)-1 and self._source[self._location+1] == '=': case '!' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
self._location += 2 loc = self._loc()
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.NotEqual)) self._advance(); self._advance()
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.NotEqual))
case c if c in Symbol._value2member_map_: case c if c in Symbol._value2member_map_:
self._location += 1 loc = self._loc()
return Token(self._line, self._col, self._source[self._location-1], SymbolToken(Symbol(c))) self._advance()
return self._token(loc, self._source[self._location-1], SymbolToken(Symbol(c)))
case _: case _:
assert False, ("Unimplemented", c, self._location) raise SyntaxError(f"{self._loc()}: Unknown character: '{c}'")
assert False, "Unreachable" assert False, "Unreachable"
def peek_token(self) -> Token: def peek_token(self) -> Token:
@@ -108,12 +122,12 @@ class Lexer:
def assert_tokenkind(self, kind: type) -> Token: def assert_tokenkind(self, kind: type) -> Token:
token = self.next_token() token = self.next_token()
assert isinstance(token.contents, kind), (f"Expected {kind} but got {token.contents}!", self.next_token(), self.next_token(), self.next_token()) if not isinstance(token.contents, kind): raise SyntaxError(f"{token.loc}: Expected {kind} but got {token.contents}!")
return token return token
def assert_token(self, expected: TokenContents) -> Token: def assert_token(self, expected: TokenContents) -> Token:
token = self.next_token() token = self.next_token()
assert token.contents == expected, (f"Expected {expected} but got {token.contents}!", self.next_token(), self.next_token()) if token.contents != expected: raise SyntaxError(f"{token.loc}: Expected {expected} but got {token.contents}!")
return token return token
def check_token(self, expected: TokenContents) -> bool: def check_token(self, expected: TokenContents) -> bool:

View File

@@ -89,8 +89,16 @@ class EofToken: pass
TokenContents = KeywordToken | IdentifierToken | NumberToken | StringToken | SymbolToken | EofToken TokenContents = KeywordToken | IdentifierToken | NumberToken | StringToken | SymbolToken | EofToken
@dataclass @dataclass
class Token: class Location:
file: str
line: int line: int
col: int col: int
def __repr__(self) -> str:
return f"{self.file}:{self.line}:{self.col+1}"
@dataclass
class Token:
loc: Location
value: str value: str
contents: TokenContents contents: TokenContents