Spaces:

bilalnaveed
/

Naveedai

Runtime error

App Files Files Community

bilalnaveed committed on Feb 17

Commit

cfd8e56

verified ·

1 Parent(s): fc3b069

Upload model_loader.py with huggingface_hub

Browse files

Files changed (1) hide show

model_loader.py +430 -0

model_loader.py ADDED Viewed

	@@ -0,0 +1,430 @@

+"""
+Model Loader for the Tiny Conversational AI.
+Handles model downloading, loading, and optimization for CPU inference.
+Uses llama-cpp-python for maximum CPU performance with 4-bit quantization.
+"""
+import os
+import sys
+import time
+import shutil
+from pathlib import Path
+from typing import Optional, Dict, Any, Callable
+from dataclasses import dataclass
+import threading
+from config import Config, get_config
+from utils import (
+    get_logger, get_system_info, get_available_ram_gb, get_optimal_thread_count,
+    check_system_requirements, ProgressBar, Timer, ensure_dir
+)
+logger = get_logger(__name__)
+@dataclass
+class ModelInfo:
+    """Information about a loaded model.
+    ``ModelLoader.load`` builds one of these after a model has been loaded
+    and stores it on the loader as ``model_info``.
+    """
+    # Display name, copied from the model's config entry ("name").
+    name: str
+    # Key of the model in the AVAILABLE_MODELS registry.
+    model_id: str
+    # Local filesystem path of the model file that was loaded.
+    path: Path
+    # Model size in GB as declared in the config entry ("size_gb"); not measured on disk.
+    size_gb: float
+    # Context size declared in the config entry ("context_size").
+    context_size: int
+    # Whether the model is loaded; ModelLoader.load sets this to True.
+    loaded: bool = False
+    # Elapsed time of the load, in seconds, as reported by the loader's timer.
+    load_time_seconds: float = 0.0
+class ModelLoader:
+    """
+    Handles loading and managing LLM models.
+    Uses llama-cpp-python for efficient CPU inference with 4-bit quantization.
+    One instance tracks a single model at a time in ``self.model`` and its
+    metadata in ``self.model_info``.
+    """
+    # Preferred fallback returned by select_best_model() when nothing fits in RAM.
+    _FALLBACK_MODEL_ID = "tinyllama-1.1b"
+    # Share of load()'s overall progress range that is given to the download phase.
+    _DOWNLOAD_PROGRESS_SHARE = 0.5
+    def __init__(self, config: Optional[Config] = None):
+        """
+        Create a loader.
+        Args:
+            config: Configuration to use. Falls back to ``get_config()`` when omitted.
+        """
+        self.config = config or get_config()
+        self.model = None
+        self.model_info: Optional[ModelInfo] = None
+        # Guards the _loading flag and unload(); the load itself runs outside the lock.
+        self._lock = threading.Lock()
+        self._loading = False
+        # Ensure models directory exists
+        ensure_dir(self.config.paths.models_dir)
+    def _get_model_path(self, model_id: str) -> Path:
+        """
+        Get the local path for a model.
+        Raises:
+            ValueError: If ``model_id`` is not in the AVAILABLE_MODELS registry.
+        """
+        model_config = self.config.model.AVAILABLE_MODELS.get(model_id)
+        if not model_config:
+            raise ValueError(f"Unknown model: {model_id}")
+        return self.config.paths.models_dir / model_config["file"]
+    def _is_model_downloaded(self, model_id: str) -> bool:
+        """Check if a model is already downloaded (its file exists locally)."""
+        return self._get_model_path(model_id).exists()
+    @staticmethod
+    def _scaled_callback(
+        callback: Callable[[float, str], None],
+        share: float
+    ) -> Callable[[float, str], None]:
+        """Wrap ``callback`` so every reported fraction is multiplied by ``share``."""
+        def scaled(fraction: float, message: str) -> None:
+            callback(fraction * share, message)
+        return scaled
+    def _download_model(
+        self,
+        model_id: str,
+        progress_callback: Optional[Callable[[float, str], None]] = None
+    ) -> Path:
+        """
+        Download a model from Hugging Face Hub.
+        Uses huggingface_hub for reliable downloads with resume support.
+        Args:
+            model_id: Key of the model in the AVAILABLE_MODELS registry.
+            progress_callback: Optional callback, called with (0.0, message) before
+                the download and (1.0, message) after it succeeds.
+        Returns:
+            Path of the downloaded model file.
+        Raises:
+            ValueError: If ``model_id`` is unknown.
+            ImportError: If huggingface_hub is not installed.
+            FileNotFoundError: If the download returned but the model file is
+                not at the expected location.
+        """
+        model_config = self.config.model.AVAILABLE_MODELS.get(model_id)
+        if not model_config:
+            raise ValueError(f"Unknown model: {model_id}")
+        dest_path = self._get_model_path(model_id)
+        if dest_path.exists():
+            logger.info(f"Model already downloaded: {dest_path}")
+            return dest_path
+        logger.info(f"Downloading model: {model_config['name']}")
+        logger.info(f"Repository: {model_config['repo']}")
+        logger.info(f"File: {model_config['file']}")
+        logger.info(f"Expected size: ~{model_config['size_gb']}GB")
+        if progress_callback:
+            progress_callback(0.0, f"Starting download of {model_config['name']}...")
+        # Import separately so only a genuinely missing package gets the install hint.
+        try:
+            from huggingface_hub import hf_hub_download
+        except ImportError:
+            logger.error("huggingface_hub not installed. Please install it: pip install huggingface_hub")
+            raise
+        try:
+            # NOTE(review): newer huggingface_hub releases deprecate
+            # local_dir_use_symlinks / resume_download - confirm against the pinned version.
+            downloaded_path = Path(hf_hub_download(
+                repo_id=model_config["repo"],
+                filename=model_config["file"],
+                local_dir=self.config.paths.models_dir,
+                local_dir_use_symlinks=False,
+                resume_download=True,
+            ))
+            # Move to expected location if needed. Compare resolved paths so a
+            # relative vs. absolute spelling of the same file is not treated as a move.
+            if downloaded_path.resolve() != dest_path.resolve():
+                if downloaded_path.exists():
+                    shutil.move(str(downloaded_path), str(dest_path))
+            # Never report success for a file that is not actually there.
+            if not dest_path.exists():
+                raise FileNotFoundError(
+                    f"Download finished but model file is missing: {dest_path}"
+                )
+            if progress_callback:
+                progress_callback(1.0, "Download complete!")
+            logger.info(f"Model downloaded successfully: {dest_path}")
+            return dest_path
+        except Exception as e:
+            logger.error(f"Download failed: {e}")
+            raise
+    def select_best_model(self) -> str:
+        """
+        Automatically select the best model based on available RAM.
+        Among the models whose ``min_ram_gb`` fits in the available RAM, the one
+        with the highest ``quality * speed`` score wins (first registry entry on ties).
+        When none fits, falls back to "tinyllama-1.1b" if it is registered,
+        otherwise to the registered model with the smallest ``min_ram_gb``.
+        Returns model_id of the selected model.
+        """
+        available_ram = get_available_ram_gb()
+        logger.info(f"Available RAM: {available_ram:.1f}GB")
+        registry = self.config.model.AVAILABLE_MODELS
+        # Keep only the models whose RAM requirement is satisfied
+        suitable_models = [
+            (model_id, model_config)
+            for model_id, model_config in registry.items()
+            if model_config["min_ram_gb"] <= available_ram
+        ]
+        if not suitable_models:
+            # Use smallest model as last resort
+            logger.warning("Low RAM detected, using smallest model")
+            if self._FALLBACK_MODEL_ID in registry or not registry:
+                return self._FALLBACK_MODEL_ID
+            # The preferred fallback is not registered: pick a model that exists.
+            return min(registry, key=lambda mid: registry[mid]["min_ram_gb"])
+        # Pick the highest quality * speed score
+        selected, _ = max(
+            suitable_models,
+            key=lambda item: item[1]["quality"] * item[1]["speed"]
+        )
+        logger.info(f"Selected model: {selected}")
+        return selected
+    def load(
+        self,
+        model_id: Optional[str] = None,
+        auto_download: bool = True,
+        progress_callback: Optional[Callable[[float, str], None]] = None
+    ) -> Any:
+        """
+        Load a model for inference.
+        Args:
+            model_id: ID of the model to load. If None, auto-selects best model.
+            auto_download: Whether to download the model if not present.
+            progress_callback: Optional callback for progress updates (progress, message).
+                Progress is reported in the range 0.0-1.0 and does not go backwards:
+                the download phase covers the first half, loading the second.
+        Returns:
+            The loaded model instance.
+        Raises:
+            RuntimeError: If another load is already in progress on this loader.
+            ValueError: If ``model_id`` is unknown.
+            FileNotFoundError: If the model file is missing and ``auto_download`` is False.
+        """
+        with self._lock:
+            if self._loading:
+                raise RuntimeError("Model is already being loaded")
+            self._loading = True
+        try:
+            timer = Timer("Model loading")
+            timer.start()
+            # Auto-select model if not specified
+            if model_id is None:
+                model_id = self.select_best_model()
+            model_config = self.config.model.AVAILABLE_MODELS.get(model_id)
+            if not model_config:
+                raise ValueError(f"Unknown model: {model_id}")
+            model_path = self._get_model_path(model_id)
+            # Download if needed
+            if not model_path.exists():
+                if not auto_download:
+                    raise FileNotFoundError(
+                        f"Model not found: {model_path}\n"
+                        f"Run with auto_download=True or download manually from: "
+                        f"https://huggingface.co/{model_config['repo']}"
+                    )
+                download_progress = None
+                if progress_callback:
+                    progress_callback(0.0, "Downloading model...")
+                    # Squeeze the download's 0..1 range into the first part of the
+                    # overall range so "Download complete!" is not reported as 100%.
+                    download_progress = self._scaled_callback(
+                        progress_callback, self._DOWNLOAD_PROGRESS_SHARE
+                    )
+                model_path = self._download_model(model_id, download_progress)
+            # Check system requirements (use relaxed check - model will use virtual memory if needed)
+            check_result = check_system_requirements(min(model_config["min_ram_gb"], 1.0))
+            if not check_result["meets_requirements"]:
+                for error in check_result["errors"]:
+                    logger.warning(f"RAM warning (continuing anyway): {error}")
+                # Don't raise - let it try to load, OS will use swap if needed
+            for warning in check_result.get("warnings", []):
+                logger.warning(warning)
+            if progress_callback:
+                progress_callback(0.5, "Loading model into memory...")
+            # Load with llama-cpp-python
+            self.model = self._load_llama_cpp(model_path, model_config)
+            timer.stop()
+            # Store model info
+            self.model_info = ModelInfo(
+                name=model_config["name"],
+                model_id=model_id,
+                path=model_path,
+                size_gb=model_config["size_gb"],
+                context_size=model_config["context_size"],
+                loaded=True,
+                load_time_seconds=timer.elapsed,
+            )
+            if progress_callback:
+                progress_callback(1.0, f"Model loaded in {timer.elapsed:.1f}s")
+            logger.info(f"Model loaded successfully in {timer.elapsed:.1f}s")
+            return self.model
+        finally:
+            # Always clear the flag so a failed load does not block later attempts.
+            with self._lock:
+                self._loading = False
+    def _load_llama_cpp(self, model_path: Path, model_config: Dict[str, Any]) -> Any:
+        """
+        Load model using llama-cpp-python for optimal CPU performance.
+        Args:
+            model_path: Path of the model file to load.
+            model_config: Registry entry for the model; only "context_size" is read here.
+        Returns:
+            The ``llama_cpp.Llama`` instance.
+        Raises:
+            ImportError: If llama-cpp-python is not installed.
+        """
+        try:
+            from llama_cpp import Llama
+        except ImportError:
+            logger.error(
+                "llama-cpp-python not installed. Please install it:\n"
+                "pip install llama-cpp-python"
+            )
+            raise
+        # Determine optimal settings; a configured thread count of 0 means auto-detect
+        n_threads = self.config.model.n_threads
+        if n_threads == 0:
+            n_threads = get_optimal_thread_count()
+        # Never exceed either the model's own context size or the configured cap
+        n_ctx = min(
+            model_config.get("context_size", 4096),
+            self.config.model.max_context_length
+        )
+        logger.info(f"Loading model: {model_path.name}")
+        logger.info(f"Context size: {n_ctx}")
+        logger.info(f"Threads: {n_threads}")
+        logger.info(f"Batch size: {self.config.model.n_batch}")
+        # Load the model
+        return Llama(
+            model_path=str(model_path),
+            n_ctx=n_ctx,
+            n_threads=n_threads,
+            n_batch=self.config.model.n_batch,
+            n_gpu_layers=self.config.model.n_gpu_layers,
+            use_mmap=self.config.model.use_mmap,
+            use_mlock=self.config.model.use_mlock,
+            verbose=False,  # Reduce noise
+        )
+    def unload(self):
+        """Unload the current model to free memory. Does nothing if no model is loaded."""
+        with self._lock:
+            if self.model is not None:
+                del self.model
+                self.model = None
+                self.model_info = None
+                # Force garbage collection
+                import gc
+                gc.collect()
+                logger.info("Model unloaded")
+    def is_loaded(self) -> bool:
+        """Check if a model is currently loaded."""
+        return self.model is not None
+    def get_model(self) -> Any:
+        """
+        Get the loaded model instance.
+        Raises:
+            RuntimeError: If no model has been loaded.
+        """
+        if not self.is_loaded():
+            raise RuntimeError("No model loaded. Call load() first.")
+        return self.model
+    def get_model_info(self) -> Optional[ModelInfo]:
+        """Get information about the loaded model, or None when nothing is loaded."""
+        return self.model_info
+    def warmup(self, prompt: str = "Hello") -> float:
+        """
+        Warm up the model with a simple generation.
+        Args:
+            prompt: Text to generate from; at most 10 tokens are requested.
+        Returns the warmup time in seconds.
+        Raises:
+            RuntimeError: If no model has been loaded.
+        """
+        if not self.is_loaded():
+            raise RuntimeError("No model loaded. Call load() first.")
+        logger.info("Warming up model...")
+        timer = Timer("Warmup")
+        timer.start()
+        # Generate a short response; the output itself is discarded
+        _ = self.model(
+            prompt,
+            max_tokens=10,
+            temperature=0.7,
+        )
+        timer.stop()
+        logger.info(f"Warmup complete in {timer.elapsed:.2f}s")
+        return timer.elapsed
+    def list_available_models(self) -> Dict[str, Dict[str, Any]]:
+        """
+        List all available models with their info.
+        Each entry is the model's registry config plus "downloaded" (bool) and
+        "path" (local file path as a string).
+        """
+        return {
+            model_id: {
+                **model_config,
+                "downloaded": self._is_model_downloaded(model_id),
+                "path": str(self._get_model_path(model_id)),
+            }
+            for model_id, model_config in self.config.model.AVAILABLE_MODELS.items()
+        }
+    def delete_model(self, model_id: str) -> bool:
+        """
+        Delete a downloaded model to free disk space.
+        If the model is the one currently loaded, it is unloaded first.
+        Returns:
+            True if a file was deleted, False if the model was not downloaded.
+        Raises:
+            ValueError: If ``model_id`` is unknown.
+        """
+        model_path = self._get_model_path(model_id)
+        if not model_path.exists():
+            return False
+        # Unload first when this is the model in memory, so the file is not
+        # removed out from under a live model instance.
+        if self.model_info and self.model_info.model_id == model_id:
+            self.unload()
+        model_path.unlink()
+        logger.info(f"Deleted model: {model_path}")
+        return True
+# =============================================================================
+# CONVENIENCE FUNCTIONS
+# =============================================================================
+# Process-wide loader shared by the convenience functions; created on first use.
+_global_loader: Optional[ModelLoader] = None
+def get_loader() -> ModelLoader:
+    """Return the shared ModelLoader, constructing it on the first call."""
+    global _global_loader
+    if _global_loader is not None:
+        return _global_loader
+    _global_loader = ModelLoader()
+    return _global_loader
+def load_model(
+    model_id: Optional[str] = None,
+    auto_download: bool = True,
+    progress_callback: Optional[Callable[[float, str], None]] = None
+) -> Any:
+    """
+    Convenience function to load a model through the global loader.
+    Args:
+        model_id: ID of the model to load. If None, the loader auto-selects one.
+        auto_download: Whether to download the model if not present.
+        progress_callback: Optional callback for progress updates (progress, message),
+            forwarded unchanged to ``ModelLoader.load``.
+    Returns:
+        The loaded model instance.
+    """
+    return get_loader().load(
+        model_id,
+        auto_download=auto_download,
+        progress_callback=progress_callback,
+    )
+def get_model() -> Any:
+    """Return the model held by the global loader (raises RuntimeError if none is loaded)."""
+    shared_loader = get_loader()
+    return shared_loader.get_model()
+if __name__ == "__main__":
+    # Manual smoke test: print system status, list the models, then load one and query it.
+    from utils import print_banner, print_system_status
+    print_banner()
+    print_system_status()
+    loader = ModelLoader()
+    print("\n📦 Available Models:")
+    catalog = loader.list_available_models()
+    for key in catalog:
+        entry = catalog[key]
+        if entry["downloaded"]:
+            state = "✓ Downloaded"
+        else:
+            state = "○ Not downloaded"
+        print(f"   • {key}: {entry['name']}")
+        print(f"     Size: {entry['size_gb']}GB | Min RAM: {entry['min_ram_gb']}GB | {state}")
+    # Let the loader choose the model that best fits this machine
+    print("\n🚀 Loading model...")
+    try:
+        llm = loader.load()
+        loaded = loader.model_info
+        print(f"\n✓ Model loaded: {loaded.name}")
+        print(f"  Load time: {loaded.load_time_seconds:.1f}s")
+        # One short generation before the real test
+        warm_secs = loader.warmup()
+        print(f"  Warmup time: {warm_secs:.2f}s")
+        # Single chat-style completion as a sanity check
+        print("\n📝 Test generation:")
+        reply = llm(
+            "User: Hello!\nAssistant:",
+            max_tokens=50,
+            temperature=0.7,
+            stop=["User:", "\n\n"],
+        )
+        text = reply['choices'][0]['text'].strip()
+        print(f"Response: {text}")
+    except Exception as e:
+        # Show a short message, then re-raise so the traceback is not lost
+        print(f"❌ Error: {e}")
+        raise