Fix token location reporting
This commit is contained in:
parent
b02ca87760
commit
18b22cd5d1
106
ppp_lexer.py
106
ppp_lexer.py
@ -1,31 +1,32 @@
|
|||||||
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from ppp_tokens import EofToken, IdentifierToken, Keyword, KeywordToken, NumberToken, StringToken, Symbol, SymbolToken, Token, TokenContents
|
from ppp_tokens import EofToken, IdentifierToken, Keyword, KeywordToken, NumberToken, StringToken, Symbol, SymbolToken, Token, TokenContents, Location
|
||||||
|
|
||||||
class Lexer:
|
class Lexer:
|
||||||
def __init__(self, source: str) -> None:
|
def __init__(self, source: str, filename: str) -> None:
|
||||||
self._source = source
|
self._source = source
|
||||||
self._location = 0
|
self._location = 0
|
||||||
self._line = 1
|
self._line = 1
|
||||||
self._col = 0
|
self._col = 0
|
||||||
|
self._filename = filename
|
||||||
self._peeked_token: Optional[Token] = None
|
self._peeked_token: Optional[Token] = None
|
||||||
self._current: str = ""
|
|
||||||
|
def _loc(self) -> Location:
|
||||||
|
return Location(self._filename, self._line, self._col)
|
||||||
|
|
||||||
|
def _token(self, loc: Location, value: str, contents: TokenContents) -> Token:
|
||||||
|
return Token(loc, value, contents)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_file(cls, path: str) -> 'Lexer':
|
def from_file(cls, path: str) -> 'Lexer':
|
||||||
with open(path) as f:
|
with open(path) as f:
|
||||||
return cls(f.read())
|
return cls(f.read(), path)
|
||||||
|
|
||||||
def _advance(self) -> str:
|
def _advance(self):
|
||||||
assert self._location < len(self._source)
|
assert self._location < len(self._source)
|
||||||
self._line, self._col = (self._line + 1, 0) if self._current == '\n' else (self._line, self._col + 1)
|
self._line, self._col = (self._line + 1, 0) if self._source[self._location] == '\n' else (self._line, self._col + 1)
|
||||||
self._location += 1
|
self._location += 1
|
||||||
self._current = self._source[self._location] if self._location < len(self._source) else ''
|
|
||||||
return self._current
|
|
||||||
|
|
||||||
# def _peek(self) -> str:
|
|
||||||
# assert self._location < len(self._source)-1
|
|
||||||
|
|
||||||
def next_token(self) -> Token:
|
def next_token(self) -> Token:
|
||||||
if self._peeked_token is not None:
|
if self._peeked_token is not None:
|
||||||
@ -34,71 +35,84 @@ class Lexer:
|
|||||||
|
|
||||||
while self._location < len(self._source) and self._source[self._location] in ' \t\n': self._advance()
|
while self._location < len(self._source) and self._source[self._location] in ' \t\n': self._advance()
|
||||||
|
|
||||||
if self._location >= len(self._source): return Token(self._line, self._col, '\0', EofToken())
|
if self._location >= len(self._source): return self._token(self._loc(), '\0', EofToken())
|
||||||
|
|
||||||
match self._source[self._location]:
|
match self._source[self._location]:
|
||||||
case c if c.isdigit():
|
case c if c.isdigit():
|
||||||
start_location = self._location
|
start_location = self._location
|
||||||
while self._location < len(self._source) and self._source[self._location].isdigit(): self._location += 1
|
loc = self._loc()
|
||||||
|
while self._location < len(self._source) and self._source[self._location].isdigit(): self._advance()
|
||||||
number = int(self._source[start_location:self._location])
|
number = int(self._source[start_location:self._location])
|
||||||
return Token(self._line, self._col, self._source[start_location:self._location], NumberToken(number))
|
return self._token(loc, self._source[start_location:self._location], NumberToken(number))
|
||||||
case c if c.isalpha() or c == "_":
|
case c if c.isalpha() or c == "_":
|
||||||
start_location = self._location
|
start_location = self._location
|
||||||
while self._location < len(self._source) and (self._source[self._location].isalpha() or self._source[self._location] in '_'): self._location += 1
|
loc = self._loc()
|
||||||
|
while self._location < len(self._source) and (self._source[self._location].isalpha() or self._source[self._location] in '_'): self._advance()
|
||||||
word = self._source[start_location:self._location]
|
word = self._source[start_location:self._location]
|
||||||
try:
|
try:
|
||||||
keyword = Keyword(word)
|
keyword = Keyword(word)
|
||||||
return Token(self._line, self._col, word, KeywordToken(keyword))
|
return self._token(loc, word, KeywordToken(keyword))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
try:
|
try:
|
||||||
symbol = Symbol(word)
|
symbol = Symbol(word)
|
||||||
return Token(self._line, self._col, word, SymbolToken(symbol))
|
return self._token(loc, word, SymbolToken(symbol))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return Token(self._line, self._col, word, IdentifierToken(word))
|
return self._token(loc, word, IdentifierToken(word))
|
||||||
case '"':
|
case '"':
|
||||||
# TODO: Escaping
|
# TODO: Proper escaping
|
||||||
self._location += 1
|
self._advance()
|
||||||
start_location = self._location
|
start_location = self._location
|
||||||
|
loc = self._loc()
|
||||||
escaping = False
|
escaping = False
|
||||||
while self._location < len(self._source) and (self._source[self._location] != '"' or escaping):
|
while self._location < len(self._source) and (self._source[self._location] != '"' or escaping):
|
||||||
escaping = self._source[self._location] == '\\' if not escaping else False
|
escaping = self._source[self._location] == '\\' if not escaping else False
|
||||||
self._location += 1
|
self._advance()
|
||||||
string = self._source[start_location:self._location].encode('utf-8').decode('unicode_escape')
|
string = self._source[start_location:self._location].encode('utf-8').decode('unicode_escape')
|
||||||
self._location += 1
|
self._advance()
|
||||||
return Token(self._line, self._col, self._source[start_location-1:self._location], StringToken(string))
|
return self._token(loc, self._source[start_location-1:self._location], StringToken(string))
|
||||||
# TODO: Make a proper Trie for this.
|
# TODO: Make a proper Trie for this.
|
||||||
case '|' if self._location < len(self._source)-1 and self._source[self._location+1] == '|':
|
case '|' if self._location < len(self._source)-1 and self._source[self._location+1] == '|':
|
||||||
self._location += 2
|
loc = self._loc()
|
||||||
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.Dpipe))
|
self._advance(); self._advance()
|
||||||
|
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.Dpipe))
|
||||||
case '&' if self._location < len(self._source)-1 and self._source[self._location+1] == '&':
|
case '&' if self._location < len(self._source)-1 and self._source[self._location+1] == '&':
|
||||||
self._location += 2
|
loc = self._loc()
|
||||||
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.Dampersand))
|
self._advance(); self._advance()
|
||||||
|
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.Dampersand))
|
||||||
case '*' if self._location < len(self._source)-1 and self._source[self._location+1] == '*':
|
case '*' if self._location < len(self._source)-1 and self._source[self._location+1] == '*':
|
||||||
self._location += 2
|
loc = self._loc()
|
||||||
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.Dasterisk))
|
self._advance(); self._advance()
|
||||||
|
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.Dasterisk))
|
||||||
case '-' if self._location < len(self._source)-1 and self._source[self._location+1] == '>':
|
case '-' if self._location < len(self._source)-1 and self._source[self._location+1] == '>':
|
||||||
self._location += 2
|
loc = self._loc()
|
||||||
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.Arrow))
|
self._advance(); self._advance()
|
||||||
|
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.Arrow))
|
||||||
case '>' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
|
case '>' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
|
||||||
self._location += 2
|
loc = self._loc()
|
||||||
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.GreaterEqual))
|
self._advance(); self._advance()
|
||||||
|
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.GreaterEqual))
|
||||||
case '<' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
|
case '<' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
|
||||||
self._location += 2
|
loc = self._loc()
|
||||||
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.LesserEqual))
|
self._advance(); self._advance()
|
||||||
|
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.LesserEqual))
|
||||||
case '=' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
|
case '=' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
|
||||||
self._location += 2
|
loc = self._loc()
|
||||||
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.Dequal))
|
self._advance(); self._advance()
|
||||||
|
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.Dequal))
|
||||||
case '=' if self._location < len(self._source)-1 and self._source[self._location+1] == '>':
|
case '=' if self._location < len(self._source)-1 and self._source[self._location+1] == '>':
|
||||||
self._location += 2
|
loc = self._loc()
|
||||||
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.EqualArrow))
|
self._advance(); self._advance()
|
||||||
|
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.EqualArrow))
|
||||||
case '!' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
|
case '!' if self._location < len(self._source)-1 and self._source[self._location+1] == '=':
|
||||||
self._location += 2
|
loc = self._loc()
|
||||||
return Token(self._line, self._col, self._source[self._location-2:self._location], SymbolToken(Symbol.NotEqual))
|
self._advance(); self._advance()
|
||||||
|
return self._token(loc, self._source[self._location-2:self._location], SymbolToken(Symbol.NotEqual))
|
||||||
case c if c in Symbol._value2member_map_:
|
case c if c in Symbol._value2member_map_:
|
||||||
self._location += 1
|
loc = self._loc()
|
||||||
return Token(self._line, self._col, self._source[self._location-1], SymbolToken(Symbol(c)))
|
self._advance()
|
||||||
|
return self._token(loc, self._source[self._location-1], SymbolToken(Symbol(c)))
|
||||||
case _:
|
case _:
|
||||||
assert False, ("Unimplemented", c, self._location)
|
raise SyntaxError(f"{self._loc()}: Unknown character: '{c}'")
|
||||||
assert False, "Unreachable"
|
assert False, "Unreachable"
|
||||||
|
|
||||||
def peek_token(self) -> Token:
|
def peek_token(self) -> Token:
|
||||||
@ -108,12 +122,12 @@ class Lexer:
|
|||||||
|
|
||||||
def assert_tokenkind(self, kind: type) -> Token:
|
def assert_tokenkind(self, kind: type) -> Token:
|
||||||
token = self.next_token()
|
token = self.next_token()
|
||||||
assert isinstance(token.contents, kind), (f"Expected {kind} but got {token.contents}!", self.next_token(), self.next_token(), self.next_token())
|
if not isinstance(token.contents, kind): raise SyntaxError(f"{token.loc}: Expected {kind} but got {token.contents}!")
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def assert_token(self, expected: TokenContents) -> Token:
|
def assert_token(self, expected: TokenContents) -> Token:
|
||||||
token = self.next_token()
|
token = self.next_token()
|
||||||
assert token.contents == expected, (f"Expected {expected} but got {token.contents}!", self.next_token(), self.next_token())
|
if token.contents != expected: raise SyntaxError(f"{token.loc}: Expected {expected} but got {token.contents}!")
|
||||||
return token
|
return token
|
||||||
|
|
||||||
def check_token(self, expected: TokenContents) -> bool:
|
def check_token(self, expected: TokenContents) -> bool:
|
||||||
|
@ -89,8 +89,16 @@ class EofToken: pass
|
|||||||
TokenContents = KeywordToken | IdentifierToken | NumberToken | StringToken | SymbolToken | EofToken
|
TokenContents = KeywordToken | IdentifierToken | NumberToken | StringToken | SymbolToken | EofToken
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Token:
|
class Location:
|
||||||
|
file: str
|
||||||
line: int
|
line: int
|
||||||
col: int
|
col: int
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return f"{self.file}:{self.line}:{self.col+1}"
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Token:
|
||||||
|
loc: Location
|
||||||
value: str
|
value: str
|
||||||
contents: TokenContents
|
contents: TokenContents
|
||||||
|
Loading…
Reference in New Issue
Block a user