| """
|
| Vidyut Morphological Analyzer
|
| Provides deterministic morphological analysis using Vidyut Kosha.
|
| """
|
|
|
| import os
|
| import json
|
| from typing import Dict, List, Optional, Set
|
| from dataclasses import dataclass
|
|
|
|
|
# Directory expected to hold Vidyut linguistic data (sibling of this package's parent dir).
VIDYUT_DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vidyut_data")

# Optional JSON file of extra stems, merged into the cache by _load_stem_cache().
STEMS_FILE = os.path.join(os.path.dirname(__file__), "stems.json")


# Module-level stem cache shared by all VidyutAnalyzer instances.
_STEM_CACHE: Set[str] = set()

# Guard flag so the cache is populated only once per process.
_STEM_CACHE_LOADED = False
|
|
|
def _load_stem_cache():
    """Populate the module-level stem cache (idempotent; safe to call repeatedly)."""
    global _STEM_CACHE, _STEM_CACHE_LOADED
    if _STEM_CACHE_LOADED:
        return

    # Seed with a hand-curated list of common stems (SLP1 transliteration).
    seed_stems = {
        "rAma", "sItA", "kfzRa", "arjuna", "deva", "brahma", "Atma", "Atman",
        "parama", "param", "para", "maha", "mahA", "rAja", "vana", "gfha",
        "hfd", "padma", "gata", "gam", "gacC", "ti", "aH", "am", "jYa",
        "bhedAbheda", "bheda", "abheda", "vibhAga", "yoga", "vicAra",
        "sopAdhika", "pratyagAtman", "pratyag", "Atman", "AbhAsa", "bhAsa",
        "kzetra", "kzetrajYa", "santoSa", "mokSa", "saMsAra", "jIva",
        "brahman", "paramAtman", "pratyaya", "pramANa", "anumAna",
        "sat", "asat", "cit", "Ananda", "satcitAnanda",
    }
    _STEM_CACHE.update(seed_stems)

    # Merge in the optional on-disk stem list, best-effort: a missing or
    # malformed file must not prevent the analyzer from working.
    if os.path.exists(STEMS_FILE):
        try:
            with open(STEMS_FILE, "r", encoding="utf-8") as fh:
                _STEM_CACHE.update(json.load(fh))
            print(f" VidyutAnalyzer: Loaded {len(_STEM_CACHE)} stems from cache")
        except Exception as e:
            print(f" VidyutAnalyzer: Stem cache load failed ({e})")

    _STEM_CACHE_LOADED = True
|
|
|
|
|
| @dataclass
|
| class MorphParse:
|
| """A single morphological parse of a word."""
|
| surface: str
|
| stem: str
|
| root: Optional[str]
|
| pratyaya: Optional[str]
|
| vibhakti: Optional[str]
|
| upasarga: Optional[str]
|
| is_compound: bool
|
| is_verb: bool
|
| derivation_depth: int
|
| kosha_validated: bool
|
|
|
| def token_form(self) -> str:
|
| """Return the canonical token form (stem without vibhakti)."""
|
| if self.vibhakti and self.surface.endswith(self.vibhakti):
|
| return self.surface[:-len(self.vibhakti)]
|
| return self.stem if self.stem else self.surface
|
|
|
|
|
class VidyutAnalyzer:
    """
    Morphological analyzer using Vidyut Kosha.

    Provides deterministic disambiguation for tokenization: for a given
    word the parse list is always the same (rule-ordered, no randomness,
    no frequency weighting). Results are memoized per instance.
    """

    # Nominal case endings in SLP1, as (surface ending, case label) pairs.
    # Extraction tries the longest ending first; the short ambiguous entries
    # at the bottom only fire as a fallback. Note: _extract_vibhakti returns
    # the surface ending itself, so the labels here are documentation/data.
    VIBHAKTI_ENDINGS = [
        # a-stem masculine/neuter paradigm
        ("asya", "Gen.Sg"), ("Aya", "Dat.Sg"), ("At", "Abl.Sg"),
        ("ena", "Ins.Sg"), ("e", "Loc.Sg"), ("aH", "Nom.Sg"),
        ("am", "Acc.Sg"), ("O", "Nom.Du"), ("ayoH", "Gen.Du"),
        # BUGFIX: was "ABym" — the SLP1 instrumental-dual ending is "AByAm"
        # (ābhyām), so the old entry could never match real text.
        ("AByAm", "Ins.Du"),
        ("AH", "Nom.Pl"),
        # BUGFIX: was labelled "Gen.Pl" — "An" is the accusative plural
        # (rAmAn); the genitive plural "AnAm" is listed below.
        ("An", "Acc.Pl"),
        ("eByo", "Dat.Pl"), ("EH", "Ins.Pl"), ("ezu", "Loc.Pl"),

        # A-stem feminine paradigm
        ("AyAH", "Gen.Sg.F"), ("AyAm", "Loc.Sg.F"), ("ayA", "Ins.Sg.F"),

        # neuter plural
        ("Ani", "Nom.Pl.N"), ("AnAm", "Gen.Pl.N"),

        # short ambiguous fallbacks. The duplicate ("ya", "Loc") entry was
        # removed: the stable length sort meant ("ya", "Dat") always matched
        # first, so it was dead data.
        ("sya", "Gen"), ("ya", "Dat"),
        ("m", "Acc"), ("H", "Nom.Sg"),
    ]

    # Primary (kṛt) suffixes: (surface ending, Pāṇinian suffix name).
    KRT_SUFFIXES = [
        ("tvA", "ktvā"),
        ("ya", "lyap"),
        ("ta", "kta"),
        ("tavat", "ktavat"),
        ("at", "śatṛ"),
        ("Ana", "śānac"),
        ("tum", "tumun"),
        ("ti", "ktin"),
        ("ana", "lyuṭ"),
        ("aka", "ṇvul"),
        ("in", "ṇini"),
        ("tṛ", "tṛc"),
    ]

    # Secondary (taddhita) suffixes: (surface ending, suffix name).
    TADDHITA_SUFFIXES = [
        ("tva", "tva"),
        ("tA", "tal"),
        ("maya", "mayaṭ"),
        ("vat", "vatup"),
        ("mat", "matup"),
        ("ika", "ṭhak"),
        ("Iya", "cha"),
        ("ya", "yat"),
    ]

    # Finite and participial endings used to flag atomic verb forms.
    VERBAL_ENDINGS = [
        # present-tense active and middle personal endings
        "ti", "anti", "si", "Ta", "mi", "maH", "vas", "mas",
        "te", "ante", "se", "Atte", "e", "mahi", "vahe", "mahe",
        # participle inflections
        "anto", "antaH", "antam", "antI", "antau",
        "ayanto", "ayantaH", "ayantam",
        "mAnaH", "mAnam", "mAnA",
        "taH", "tam", "te", "tAni",
        "tavAn", "tavatI", "tavat",
    ]

    # Verbal prefixes (upasargas), SLP1.
    UPASARGAS = [
        "pra", "parA", "apa", "sam", "anu", "ava", "nis", "nir", "dus", "dur",
        "vi", "A", "ni", "aDi", "api", "ati", "su", "ut", "ud", "aBi", "prati",
        "pari", "upa",
    ]

    def __init__(self, preload_cache: bool = True):
        """Initialize the analyzer.

        Args:
            preload_cache: when True (the default), eagerly load the shared
                stem cache. BUGFIX: this flag was previously accepted but
                ignored; it is now honoured. The default preserves the old
                behaviour.
        """
        # Per-instance memo of analyze() results.
        self._parse_cache: Dict[str, List[MorphParse]] = {}

        # Pre-sort the suffix tables once (longest match first) instead of
        # re-sorting them on every single lookup call.
        self._vibhakti_sorted = sorted(self.VIBHAKTI_ENDINGS, key=lambda x: -len(x[0]))
        self._krt_sorted = sorted(self.KRT_SUFFIXES, key=lambda x: -len(x[0]))
        self._taddhita_sorted = sorted(self.TADDHITA_SUFFIXES, key=lambda x: -len(x[0]))
        self._verbal_sorted = sorted(self.VERBAL_ENDINGS, key=len, reverse=True)
        self._upasarga_sorted = sorted(self.UPASARGAS, key=len, reverse=True)

        if preload_cache:
            _load_stem_cache()

    @staticmethod
    def _atomic_parse(word: str, *, is_verb: bool = False,
                      validated: bool = False) -> MorphParse:
        """Build a depth-0 parse that treats the word as a single unit."""
        return MorphParse(
            surface=word, stem=word, root=None, pratyaya=None,
            vibhakti=None, upasarga=None, is_compound=False,
            is_verb=is_verb, derivation_depth=0, kosha_validated=validated,
        )

    def _in_kosha(self, word: str) -> bool:
        """Check if word exists in the stem cache (O(1) lookup)."""
        return word in _STEM_CACHE

    def _is_verb_form(self, word: str) -> bool:
        """
        Check if word is a verb form (tiṅanta/kṛdanta) that should be atomic.
        Rule 3: Verbal forms = single token, no SP, no splitting.
        """
        for ending in self._verbal_sorted:
            # Require a base of at least 3 characters before the ending so
            # short nouns are not misread as verbs. (This condition already
            # implies the remainder is >= 3 chars, so the original's extra
            # len(remainder) >= 2 re-check was redundant.)
            if word.endswith(ending) and len(word) > len(ending) + 2:
                return True
        return False

    def _extract_vibhakti(self, word: str) -> tuple:
        """Extract a vibhakti ending. Returns (stem, vibhakti).

        vibhakti is the surface ending string, or None when nothing matched.
        After stripping an ending, a final vowel is tentatively restored and
        the first candidate found in the kosha is preferred.
        """
        for ending, _ in self._vibhakti_sorted:
            if word.endswith(ending) and len(word) > len(ending) + 1:
                stem = word[:-len(ending)]
                # Try restoring a stem-final vowel lost to inflection.
                for suffix in ("", "a", "A", "i", "I", "u", "U"):
                    candidate = stem + suffix
                    if self._in_kosha(candidate):
                        return (candidate, ending)
                # No kosha hit: keep the bare truncated stem.
                return (stem, ending)
        return (word, None)

    def _extract_upasarga(self, word: str) -> tuple:
        """Extract an upasarga prefix. Returns (upasarga, remainder).

        A prefix is accepted only when the remainder — or one of its 3–9
        character prefixes — is a known stem, to avoid stripping accidental
        string matches.
        """
        for prefix in self._upasarga_sorted:
            if word.startswith(prefix) and len(word) > len(prefix) + 2:
                remainder = word[len(prefix):]
                if self._in_kosha(remainder):
                    return (prefix, remainder)
                # Fall back: any known stem at the start of the remainder.
                for j in range(3, min(len(remainder), 10)):
                    if self._in_kosha(remainder[:j]):
                        return (prefix, remainder)
        return (None, word)

    def _extract_pratyaya(self, word: str) -> tuple:
        """Extract a kṛt/taddhita suffix. Returns (stem, pratyaya_type)."""
        # kṛt (primary) suffixes take precedence over taddhita (secondary).
        for table in (self._krt_sorted, self._taddhita_sorted):
            for suffix, ptype in table:
                if word.endswith(suffix) and len(word) > len(suffix) + 1:
                    stem = word[:-len(suffix)]
                    # Accept a kosha-known stem or any plausibly long base.
                    if self._in_kosha(stem) or len(stem) >= 2:
                        return (stem, ptype)
        return (word, None)

    def analyze(self, word: str) -> List[MorphParse]:
        """
        Analyze a word and return all possible parses.

        Parses are sorted by preference (deterministic order). Results for
        non-degenerate words are memoized per instance.
        """
        # Degenerate input: a single unvalidated atomic parse (not cached).
        if not word or len(word) < 2:
            return [self._atomic_parse(word)]

        if word in self._parse_cache:
            return self._parse_cache[word]

        parses: List[MorphParse] = []

        # Rule 3: verb forms are atomic — short-circuit all other analyses.
        if self._is_verb_form(word):
            parses.append(self._atomic_parse(word, is_verb=True, validated=True))
            self._parse_cache[word] = parses
            return parses

        # The whole word is itself a known stem.
        if self._in_kosha(word):
            parses.append(self._atomic_parse(word, validated=True))

        # Inflected nominal: stem + case ending.
        stem, vibhakti = self._extract_vibhakti(word)
        if vibhakti:
            parses.append(MorphParse(
                surface=word, stem=stem, root=None, pratyaya=None,
                vibhakti=vibhakti, upasarga=None, is_compound=False,
                is_verb=False, derivation_depth=1,
                kosha_validated=self._in_kosha(stem),
            ))

        # Prefixed form: upasarga + base.
        upasarga, remainder = self._extract_upasarga(word)
        if upasarga:
            parses.append(MorphParse(
                surface=word, stem=remainder, root=None, pratyaya=None,
                vibhakti=None, upasarga=upasarga, is_compound=False,
                is_verb=False, derivation_depth=1,
                kosha_validated=self._in_kosha(remainder),
            ))

        # Derived form: base + kṛt/taddhita suffix.
        prat_stem, pratyaya = self._extract_pratyaya(word)
        if pratyaya:
            parses.append(MorphParse(
                surface=word, stem=prat_stem, root=prat_stem, pratyaya=pratyaya,
                vibhakti=None, upasarga=None, is_compound=False,
                is_verb=False, derivation_depth=1,
                kosha_validated=self._in_kosha(prat_stem),
            ))

        # Last resort: keep the word whole, unvalidated.
        if not parses:
            parses.append(self._atomic_parse(word))

        parses = self._disambiguate(parses)
        self._parse_cache[word] = parses
        return parses

    def _disambiguate(self, parses: List[MorphParse]) -> List[MorphParse]:
        """
        Deterministic disambiguation. NO randomness, NO frequency.

        Priority:
        1. Prefer fewer derivational splits
        2. Prefer Kosha-validated stems
        3. Prefer non-compound over compound
        """
        def sort_key(p: MorphParse) -> tuple:
            return (
                p.derivation_depth,
                0 if p.kosha_validated else 1,
                1 if p.is_compound else 0,
            )

        # sorted() is stable, so insertion order breaks any remaining ties
        # deterministically.
        return sorted(parses, key=sort_key)

    def get_best_parse(self, word: str) -> MorphParse:
        """Get the single best (deterministic) parse for a word."""
        parses = self.analyze(word)
        return parses[0] if parses else self._atomic_parse(word)
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Quick smoke test: print the best parse for a handful of sample words.
    print("Testing VidyutAnalyzer...")
    analyzer = VidyutAnalyzer(preload_cache=True)

    for word in (
        "rAmaH", "gacCati", "paramAtma", "hfdpadmagataM",
        "sopAdhika", "bhAva", "abheda", "vicAraH",
    ):
        parse = analyzer.get_best_parse(word)
        print(f" {word:20} → stem: {parse.stem:15} vibhakti: {parse.vibhakti or '-':8} kosha: {parse.kosha_validated}")
|
|
|