ArthaLabs
/

panini-tokenizer

Model card Files Files and versions

panini-tokenizer / src /sandhi_engine.py

ArthaLabs's picture

Upload folder using huggingface_hub

77111fb verified 4 months ago

history blame contribute delete

4.28 kB

	"""
	Sandhi Engine for Panini Tokenizer V4
	Generates pre-sandhi hypotheses for Sanskrit compound splitting.
	Handles vowel coalescence (ac-sandhi) and visarga/consonant assimilation.

	Uses table-driven design for maintainability.
	"""

	from typing import List, Tuple, Generator


	class SandhiEngine:
	    """
	    Generate pre-sandhi hypotheses for Sanskrit compound splitting.

	    For a pivot position inside a surface string, the engine proposes
	    (left, right) pairs that could have merged into that surface form,
	    covering vowel coalescence (ac-sandhi), yaṇ sandhi and
	    visarga/consonant assimilation. It only enumerates candidates; it does
	    not check them against a lexicon.

	    NOTE(review): the tables look like SLP1 transliteration ('f' for
	    vocalic r, 'E'/'O' for ai/au) - confirm against the callers.
	    """

	    # Vowels that may follow a 'y'/'v' produced by yaṇ sandhi.
	    # NOTE(review): assumes the SLP1 vowel inventory - confirm.
	    _VOWELS = frozenset("aAiIuUfFxXeEoO")

	    # Yaṇ sandhi: surface semivowel -> vowels it may have replaced.
	    _YAN_SOURCES = {
	        'y': ('i', 'I'),
	        'v': ('u', 'U'),
	    }

	    def __init__(self):
	        # ac-sandhi (vowel merger) table.
	        # Key = surface string produced by the merger (may be longer than
	        # one character, e.g. 'ar'); value = list of
	        # (left_end, right_start) pairs that can produce it.
	        self.VOWEL_SPLITS = {
	            # Guṇa
	            'e': [('a', 'i'), ('A', 'i'), ('a', 'I'), ('A', 'I')],
	            'o': [('a', 'u'), ('A', 'u'), ('a', 'U'), ('A', 'U')],
	            'ar': [('a', 'f'), ('A', 'f'), ('a', 'F'), ('A', 'F')],  # maharzi -> mahA + fzi

	            # Vṛddhi
	            'E': [('a', 'e'), ('A', 'e'), ('a', 'E'), ('A', 'E')],  # ai
	            'O': [('a', 'o'), ('A', 'o'), ('a', 'O'), ('A', 'O')],  # au

	            # Dīrgha (savarṇa dīrgha) - critical for long vowel restoration
	            'A': [('a', 'a'), ('a', 'A'), ('A', 'a'), ('A', 'A')],
	            'I': [('i', 'i'), ('i', 'I'), ('I', 'i'), ('I', 'I')],
	            'U': [('u', 'u'), ('u', 'U'), ('U', 'u'), ('U', 'U')],
	        }

	        # Consonant categories used by the visarga rules.
	        self.VOICED = {'g', 'G', 'j', 'J', 'd', 'D', 'b', 'B', 'n', 'N', 'm', 'y', 'r', 'l', 'v', 'h'}
	        self.HARD = {'k', 'K', 'c', 'C', 't', 'T', 'w', 'W', 'p', 'P', 'S', 's'}

	    def generate_splits(self, word: str, i: int) -> Generator[Tuple[str, str], None, None]:
	        """
	        Yield (left, right) hypotheses for a split at index ``i``.

	        ``i`` is the index of the pivot character. Nothing is yielded when
	        ``i < 1`` or ``i >= len(word)``, so the left part is never empty.

	        Hypotheses are produced in this order:

	        1. the plain cut ``word[:i] | word[i:]``;
	        2. vowel coalescence: if the text at ``i`` starts with a key of
	           ``VOWEL_SPLITS``, that key is replaced by each
	           (left_end, right_start) pair;
	        3. yaṇ sandhi: a pivot 'y'/'v' followed by a vowel is restored
	           to i/I or u/U on the left side;
	        4. visarga restoration: 'o' or 'r' before a voiced consonant, and
	           's'/'S' before a hard consonant, become 'aH' / 'H' / 'H'.
	        """
	        if i < 1 or i >= len(word):
	            return

	        char = word[i]
	        left = word[:i]
	        after = word[i + 1:]
	        # Empty string when the pivot is the last character; an empty
	        # string is never a member of the category sets below.
	        next_char = after[:1]

	        # 1. Default: hard cut (no sandhi), split BEFORE the pivot.
	        yield (left, word[i:])

	        # 2. Vowel coalescence (the surface text IS the result of a merger),
	        #    e.g. gaR[e]Sa -> left ends with 'a', right starts with 'i'.
	        #    Keys are matched as prefixes at i so that multi-character
	        #    results such as 'ar' are found, not only single characters.
	        for surface, pairs in self.VOWEL_SPLITS.items():
	            if word.startswith(surface, i):
	                rest = word[i + len(surface):]
	                for left_end, right_start in pairs:
	                    yield (left + left_end, right_start + rest)

	        # 3. Yaṇ sandhi (y -> i/I, v -> u/U), e.g. praty[e]kam -> prati + ekam.
	        #    The semivowel only arises BEFORE a vowel, so require one.
	        if next_char in self._VOWELS:
	            for vowel in self._YAN_SOURCES.get(char, ()):
	                yield (left + vowel, after)

	        # 4. Visarga sandhi restoration.
	        if next_char in self.VOICED:
	            if char == 'o':
	                # 'o' before voiced consonant -> 'aH'
	                yield (left + "aH", after)
	            elif char == 'r':
	                # 'r' before voiced -> 'H' (punarjanma -> punaH + janma)
	                yield (left + "H", after)

	        # 's'/'S' before hard consonant -> 'H'
	        if char in ('s', 'S') and next_char in self.HARD:
	            yield (left + "H", after)


	# --- TEST ---
	# Manual smoke test: prints every hypothesis produced for a few
	# (word, pivot index) pairs when the module is run as a script.
	if __name__ == "__main__":
	    engine = SandhiEngine()

	    print("Testing SandhiEngine...")

	    test_cases = [
	        ("gaReSa", 3),      # e: should yield gaRa + iSa
	        ("devendra", 3),    # e: should yield deva + indra
	        # o is word-final here, so only the vowel splits (rAma + u, ...)
	        # appear; the 'aH' rule needs a following voiced consonant.
	        ("rAmo", 3),
	        ("punarjanma", 4),  # r: should yield punaH + janma
	    ]

	    for word, pos in test_cases:
	        print(f"\n {word} at pos {pos}:")
	        for left, right in engine.generate_splits(word, pos):
	            # Plain '|' separator: "\|" is not a valid string escape.
	            print(f" {left} | {right}")