| """
|
| Panini Tokenizer - Interactive Demo
|
| HuggingFace Space for comparing Panini Tokenizer against SOTA models.
|
|
|
| ArthaLabs 2025
|
| """

import os
import sys

import gradio as gr
from transformers import AutoTokenizer

# Resolve all paths relative to this file so the Space works from any CWD.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SRC_DIR = os.path.join(BASE_DIR, "src")
STEMS_PATH = os.path.join(BASE_DIR, "stems.json")

# Make the vendored tokenizer sources under src/ importable.
sys.path.insert(0, SRC_DIR)

PANINI_AVAILABLE = False
PANINI_SPLITTER = None

try:
    # Point the analyzer at the bundled stem lexicon and force a cache reload.
    import analyzer

    analyzer.STEMS_FILE = STEMS_PATH
    analyzer._STEM_CACHE_LOADED = False

    from splitter import SamasaSplitter

    PANINI_SPLITTER = SamasaSplitter()
    PANINI_AVAILABLE = True
    print("✅ Panini Tokenizer loaded successfully")
except Exception as e:
    print(f"❌ Panini Tokenizer not available: {e}")
    import traceback

    traceback.print_exc()
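
# Interface sketch (inferred from usage below, not from the splitter's docs):
# PANINI_SPLITTER.split_v4(word) returns an object with `.is_compound` and
# `.components`, e.g. a hypothetical split_v4("Darmakzetra") might yield
# components like ["Darma", "kzetra"].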

TOKENIZERS = {}

# Baseline tokenizers for comparison: display name → HF Hub repo id.
BASELINE_MODELS = {
    "Sanskrit-BERT": "Matej/bert-base-buddhist-sanskrit",
    "MuRIL (Google)": "google/muril-base-cased",
    "Ansh-256k (Indic)": "LingoIITGN/Ansh-256k",
    "Sanskrit-Qwen2": "diabolic6045/Sanskrit-English-qwen2-tokenizer",
}


def load_tokenizers():
    """Load all baseline tokenizers for comparison, skipping any that fail."""
    for name, repo_id in BASELINE_MODELS.items():
        try:
            TOKENIZERS[name] = AutoTokenizer.from_pretrained(
                repo_id, trust_remote_code=True
            )
            print(f"✅ {name} loaded")
        except Exception as e:
            print(f"❌ {name} failed: {e}")


load_tokenizers()
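
# If loading succeeded, each entry exposes the standard HF tokenizer API
# used below, e.g.:
#   TOKENIZERS["MuRIL (Google)"].tokenize("rAmo gacCati")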


def tokenize_with_panini(text: str) -> list:
    """Tokenize using the Panini Tokenizer."""
    if not PANINI_AVAILABLE or PANINI_SPLITTER is None:
        return ["[Panini not available]"]

    try:
        tokens = []
        words = text.split()

        for i, word in enumerate(words):
            # SentencePiece-style word-boundary marker, on the first word only.
            prefix = "▁" if i == 0 else ""
            split_result = PANINI_SPLITTER.split_v4(word)

            if split_result.is_compound and len(split_result.components) > 1:
                # Emit one token per morphological component of the compound.
                for j, comp in enumerate(split_result.components):
                    tokens.append((prefix + comp) if j == 0 else comp)
            else:
                tokens.append(prefix + word)

        return tokens
    except Exception as e:
        return [f"[Error: {e}]"]
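
# Illustrative behavior (assuming neither word is analyzed as a compound):
#   tokenize_with_panini("rAmo gacCati") → ["▁rAmo", "gacCati"]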


def tokenize_text(text: str):
    """Tokenize text with every tokenizer and return a markdown comparison."""
    if not text.strip():
        return "Please enter some Sanskrit text (SLP1 transliteration)"

    results = []

    # Panini goes first so the compression summary can read results[0].
    panini_tokens = tokenize_with_panini(text)
    results.append({
        "name": "🏆 Panini (Ours)",
        "count": len(panini_tokens),
        "tokens": panini_tokens,
        "is_panini": True
    })

    for name, tok in TOKENIZERS.items():
        try:
            tokens = tok.tokenize(text)
            results.append({
                "name": name,
                "count": len(tokens),
                "tokens": tokens,
                "is_panini": False
            })
        except Exception as e:
            results.append({
                "name": name,
                "count": "Error",
                "tokens": [str(e)[:30]],
                "is_panini": False
            })

    md = "## 📊 Tokenization Results\n\n"

    # Compression = average baseline token count / Panini token count.
    panini_count = results[0]['count'] if isinstance(results[0]['count'], int) else 0
    other_counts = [r['count'] for r in results[1:] if isinstance(r['count'], int)]
    if other_counts and panini_count > 0:
        avg_other = sum(other_counts) / len(other_counts)
        compression = avg_other / panini_count
        md += f"**Compression:** Panini uses **{compression:.1f}x fewer tokens** than the baseline average\n\n"

    md += "---\n\n"

    for r in results:
        if r['is_panini']:
            md += f"### {r['name']} — **{r['count']} tokens**\n"
        else:
            md += f"### {r['name']} — {r['count']} tokens\n"

        # Show at most the first 10 tokens, capped at roughly 80 characters.
        tokens_str = " | ".join(r['tokens'][:10])
        if len(tokens_str) > 80:
            tokens_str = tokens_str[:80] + "..."
        elif len(r['tokens']) > 10:
            tokens_str += " ..."

        md += f"```\n{tokens_str}\n```\n\n"

    return md
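
# Worked example of the compression figure (hypothetical counts): baselines
# averaging 12 tokens vs. 4 Panini tokens → 12 / 4 = 3.0x fewer tokens.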


def get_examples():
    """Return example inputs (SLP1 transliteration)."""
    return [
        ["nirapekzajYAnasAkzAtkArasAmarthyam"],
        ["tadekaniScitArthavyavasthApanam"],
        ["svaprakASatvaparaprakASavyavacCedaH"],
        ["rAmo gacCati"],
        ["Darmakzetre kurukzetre"],
        ["parasparApekzApratiyogitvanirUpaNam"],
    ]


with gr.Blocks(
    title="Panini Tokenizer - ArthaLabs",
    theme=gr.themes.Soft(),
    css="""
    .container { max-width: 900px; margin: auto; }
    .title { text-align: center; }
    """
) as demo:

    gr.Markdown(
        """
        # 🔤 Panini Tokenizer
        ### Grammar-First Sanskrit Tokenization by ArthaLabs

        Compare our morphology-based tokenizer against state-of-the-art multilingual models.

        **Input Format:** SLP1 transliteration (e.g., `rAmo gacCati`, not `रामो गच्छति`)
        """
    )

    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Sanskrit Text (SLP1)",
                placeholder="Enter Sanskrit text in SLP1 transliteration...",
                lines=2,
                value="nirapekzajYAnasAkzAtkArasAmarthyam"
            )
        with gr.Column(scale=1):
            submit_btn = gr.Button("🔍 Tokenize", variant="primary", size="lg")

    output = gr.Markdown(label="Results")

    gr.Examples(
        examples=get_examples(),
        inputs=text_input,
        label="Example Inputs (click to try)"
    )
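
    # Note: clicking an example fills the textbox; without an `fn=` argument,
    # gr.Examples typically does not run the tokenizer until the user submits.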

    # Both the button click and Enter in the textbox run the same handler.
    submit_btn.click(fn=tokenize_text, inputs=text_input, outputs=output)
    text_input.submit(fn=tokenize_text, inputs=text_input, outputs=output)

    gr.Markdown(
        """
        ---
        ### About

        The **Panini Tokenizer** uses recursive morphological analysis based on
        Pāṇinian grammar rules, not statistical BPE. This results in:

        - ✅ **2-4x fewer tokens** for complex compounds
        - ✅ **Semantically meaningful** token boundaries
        - ✅ **No arbitrary subword splits** like `##k`, `##z`, `##ab`

        [📖 Model Card](https://huggingface.co/ArthaLabs/panini-tokenizer) |
        [📊 Full Benchmarks](https://huggingface.co/ArthaLabs/panini-tokenizer/blob/main/BENCHMARKS.md)

        ---
        *© 2025 ArthaLabs - Apache 2.0 License*
        """
    )


if __name__ == "__main__":
    demo.launch()