from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Callable, Iterable


@dataclass
class Lexem:
    """A single recognized lexeme: its raw text, its token type name, and a computed value."""

    text: str
    type_name: str
    value: str


class LexemeType:
    """Describes one token class: a name, a regex pattern, and an optional
    function that derives the lexeme's value from the matched text."""

    def __init__(
        self,
        name: str,
        pattern: str,
        value_func: Callable[[str], str] = lambda _: "",
    ):
        self.name = name
        # Leading whitespace is skipped; the pattern itself is captured in group 1.
        self.regex = re.compile(r"\s*(" + pattern + ")")
        self.value_func = value_func

    def consume(self, text: str) -> tuple[Lexem | None, str]:
        """Try to match this lexeme type at the start of `text`.

        Returns the recognized Lexem and the remaining text, or (None, text)
        unchanged when the pattern does not match.
        """
        match = self.regex.match(text)
        if match:
            lexeme_text = match.group(1)
            value = self.value_func(lexeme_text)
            rest = text[match.end() :]
            return Lexem(lexeme_text, self.name, value), rest
        return None, text
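

# Illustrative behavior of LexemeType.consume (the NUMBER type below is a
# hypothetical example, not part of this module):
#     number = LexemeType("NUMBER", r"\d+", lambda s: s)
#     number.consume("  42 + 1")  -> (Lexem(text="42", type_name="NUMBER", value="42"), " + 1")
#     number.consume("abc")       -> (None, "abc")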


class Lexer:
    """Splits input text into lexemes using an ordered collection of LexemeTypes."""

    def __init__(
        self,
        lexeme_types: Iterable[LexemeType],
        error_regex: str,
        skip_types: Iterable[str] = (),
    ):
        # Materialize the iterables so they can be reused across analyze() calls.
        self.lexeme_types = list(lexeme_types)
        self.skip_types = set(skip_types)
        self.error_regex = re.compile(r"\s*(" + error_regex + ")")

    def analyze(self, text: str) -> list[Lexem]:
        lexems: list[Lexem] = []
        while text.strip():
            # Try each lexeme type in order; the first one that matches wins.
            for lex_type in self.lexeme_types:
                lexem, new_text = lex_type.consume(text)
                if lexem:
                    if lexem.type_name not in self.skip_types:
                        lexems.append(lexem)
                    text = new_text
                    break
            else:
                # No type matched: report the offending fragment and skip past it.
                _, text = self._consume_error(text)
        return lexems

    def _consume_error(self, text: str) -> tuple[Lexem, str]:
        match = self.error_regex.match(text)
        err_text = match.group(1) if match else text.strip()
        print(f"Invalid lexeme: {err_text}")
        rest = text[match.end() :] if match else ""
        return Lexem(err_text, "ERROR", ""), rest
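

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): the token types, patterns and
    # the sample input below are assumptions, not part of the original module.
    number = LexemeType("NUMBER", r"\d+", lambda s: s)
    ident = LexemeType("IDENT", r"[A-Za-z_]\w*", lambda s: s)
    plus = LexemeType("PLUS", r"\+")
    lexer = Lexer([number, ident, plus], error_regex=r"\S+")
    for lexem in lexer.analyze("x + 42"):
        print(lexem)
    # Expected output:
    #   Lexem(text='x', type_name='IDENT', value='x')
    #   Lexem(text='+', type_name='PLUS', value='')
    #   Lexem(text='42', type_name='NUMBER', value='42')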