| """
|
| Sandhi Engine for Panini Tokenizer V4
|
| Generates pre-sandhi hypotheses for Sanskrit compound splitting.
|
| Handles vowel coalescence (ac-sandhi) and visarga/consonant assimilation.
|
|
|
| Uses table-driven design for maintainability.
|
| """
|
|
|
| from typing import List, Tuple, Generator
|
|
|
|
|
class SandhiEngine:
    """Generate pre-sandhi split hypotheses for Sanskrit compound words.

    Given a surface word and a pivot index, the engine proposes
    ``(left, right)`` pairs that could have fused into the surface form.
    It covers vowel coalescence (ac-sandhi), semivowel substitution and
    visarga/consonant assimilation.  All rules are table-driven.

    NOTE(review): the symbols used in the tables ('f'/'F', 'E'/'O',
    'w'/'W', ...) look like SLP1 transliteration -- confirm with callers.
    """

    def __init__(self):
        """Build the lookup tables used by :meth:`generate_splits`."""
        # Surface sequence -> possible (end of left word, start of right word).
        # Keys may be longer than one character ('ar').
        self.VOWEL_SPLITS = {
            # a/A + simple vowel
            'e': [('a', 'i'), ('A', 'i'), ('a', 'I'), ('A', 'I')],
            'o': [('a', 'u'), ('A', 'u'), ('a', 'U'), ('A', 'U')],
            'ar': [('a', 'f'), ('A', 'f'), ('a', 'F'), ('A', 'F')],
            # a/A + e/E and a/A + o/O
            'E': [('a', 'e'), ('A', 'e'), ('a', 'E'), ('A', 'E')],
            'O': [('a', 'o'), ('A', 'o'), ('a', 'O'), ('A', 'O')],
            # Like vowels merging into the long vowel
            'A': [('a', 'a'), ('a', 'A'), ('A', 'a'), ('A', 'A')],
            'I': [('i', 'i'), ('i', 'I'), ('I', 'i'), ('I', 'I')],
            'U': [('u', 'u'), ('u', 'U'), ('U', 'u'), ('U', 'U')],
        }

        # Following sounds that trigger the 'o' -> "aH" and 'r' -> "H" rules.
        self.VOICED = {
            'g', 'G', 'j', 'J', 'd', 'D', 'b', 'B',
            'n', 'N', 'm', 'y', 'r', 'l', 'v', 'h',
        }
        # Following sounds that trigger the 's'/'S' -> "H" rule.
        self.HARD = {'k', 'K', 'c', 'C', 't', 'T', 'w', 'W', 'p', 'P', 'S', 's'}

    def generate_splits(self, word: str, i: int) -> Generator[Tuple[str, str], None, None]:
        """Yield ``(left, right)`` hypotheses for a split at index ``i``.

        ``i`` is the index of the pivot character, i.e. the character that
        may be the product of sandhi.  Nothing is yielded when ``i < 1`` or
        ``i >= len(word)``.

        Hypotheses are produced in this order:

        1. the plain cut ``(word[:i], word[i:])``;
        2. vowel splits for every ``VOWEL_SPLITS`` key found at ``i``
           (including the two-character key ``'ar'``);
        3. ``y`` -> ``i``/``I`` and ``v`` -> ``u``/``U`` when at least one
           character follows the pivot (that character is not inspected);
        4. ``o`` -> ``"aH"`` and ``r`` -> ``"H"`` before a ``VOICED`` sound;
        5. ``s``/``S`` -> ``"H"`` before a ``HARD`` sound.
        """
        if i < 1 or i >= len(word):
            return

        pivot = word[i]
        head = word[:i]
        after_pivot = word[i + 1:]
        has_next = i + 1 < len(word)

        # 1. No sandhi at all: cut the word as written.
        yield (head, word[i:])

        # 2. Vowel coalescence.  Matching with startswith() lets keys longer
        #    than one character work; a single-character lookup of word[i]
        #    could never hit 'ar', leaving that table entry dead.
        for merged, pairs in self.VOWEL_SPLITS.items():
            if word.startswith(merged, i):
                tail = word[i + len(merged):]
                for left_end, right_start in pairs:
                    yield (head + left_end, right_start + tail)

        # 3. Semivowel back to the vowel it may have replaced.
        if has_next:
            if pivot == 'y':
                for vowel in ('i', 'I'):
                    yield (head + vowel, after_pivot)
            elif pivot == 'v':
                for vowel in ('u', 'U'):
                    yield (head + vowel, after_pivot)

        # 4./5. Visarga restoration, depending on the following sound.
        if has_next:
            following = word[i + 1]
            if pivot == 'o' and following in self.VOICED:
                yield (head + "aH", after_pivot)
            if pivot == 'r' and following in self.VOICED:
                yield (head + "H", after_pivot)
            if pivot in ('s', 'S') and following in self.HARD:
                yield (head + "H", after_pivot)
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: print every hypothesis produced for a few
    # (word, pivot index) samples.
    demo_engine = SandhiEngine()

    print("Testing SandhiEngine...")

    samples = (
        ("gaReSa", 3),
        ("devendra", 3),
        ("rAmo", 3),
        ("punarjanma", 4),
    )

    for sample in samples:
        word, pos = sample
        print(f"\n {word} at pos {pos}:")
        hypotheses = demo_engine.generate_splits(word, pos)
        for hypothesis in hypotheses:
            left, right = hypothesis
            print(f" {left} | {right}")
|
|
|