SeaWolf-AI committed on
Commit
815e9c3
·
verified ·
1 Parent(s): d75309f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -106
app.py CHANGED
@@ -1,30 +1,19 @@
1
  """
2
- 🧬 Darwin-35B-A3B-Opus Q8 GGUF β€” llama-cpp-python Direct Serving
3
- μ „μš© GPU Β· OpenAI-compatible streaming Β· μ»€μŠ€ν…€ ν”„λ‘ νŠΈμ—”λ“œ
4
  """
5
- import sys, subprocess
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
8
- # ── llama-cpp-python CUDA μ„€μΉ˜ 확인 ──
9
- try:
10
- from llama_cpp import Llama
11
- print("[BOOT] llama-cpp-python already installed", flush=True)
12
- except ImportError:
13
- print("[BOOT] Installing llama-cpp-python with CUDA...", flush=True)
14
- subprocess.check_call([
15
- sys.executable, "-m", "pip", "install",
16
- "llama-cpp-python", "--no-cache-dir", "--prefer-binary",
17
- "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu124",
18
- ])
19
- from llama_cpp import Llama
20
- print("[BOOT] llama-cpp-python installed βœ“", flush=True)
21
-
22
  import base64, os, re, json, io
23
  from typing import Generator, Optional
 
24
 
 
25
  import gradio as gr
26
- print(f"[BOOT] gradio {gr.__version__}", flush=True)
27
 
 
28
  import requests, httpx, uvicorn
29
  from fastapi import FastAPI, Request
30
  from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
@@ -37,17 +26,16 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
37
  # ══════════════════════════════════════════════════════════════════════════════
38
  # 1. MODEL CONFIG
39
  # ══════════════════════════════════════════════════════════════════════════════
40
- REPO_ID = "FINAL-Bench/Darwin-35B-A3B-Opus-Q8-GGUF"
41
- GGUF_FILE = "merged_109838c2-q8_0-00001-of-00003.gguf"
42
- MODEL_NAME = "Darwin-35B-A3B-Opus-Q8"
43
  MODEL_CAP = {
44
- "arch": "MoE", "active": "3B / 35B total",
45
- "ctx": "262K", "thinking": True, "vision": False,
46
  "max_tokens": 16384, "temp_max": 1.5,
47
  }
48
 
49
  PRESETS = {
50
- "general": "You are Darwin-35B-A3B-Opus, a highly capable reasoning model created by VIDRAFT via evolutionary merge. Think step by step for complex questions.",
51
  "code": "You are an expert software engineer. Write clean, efficient, well-commented code. Explain your approach before writing. Use modern best practices.",
52
  "math": "You are a world-class mathematician. Break problems step-by-step. Show full working. Use LaTeX where helpful.",
53
  "creative": "You are a brilliant creative writer. Be imaginative, vivid, and engaging. Adapt tone and style to the request.",
@@ -56,61 +44,21 @@ PRESETS = {
56
  }
57
 
58
  # ══════════════════════════════════════════════════════════════════════════════
59
- # 2. VRAM 감지 + λͺ¨λΈ λ‘œλ”©
60
  # ══════════════════════════════════════════════════════════════════════════════
61
- def detect_gpu_layers() -> int:
62
- """μ‚¬μš© κ°€λŠ₯ν•œ VRAM에 따라 n_gpu_layers μžλ™ κ²°μ •"""
63
- try:
64
- import torch
65
- if torch.cuda.is_available():
66
- props = torch.cuda.get_device_properties(0)
67
- vram_gb = (getattr(props, 'total_memory', 0) or getattr(props, 'total_mem', 0)) / (1024**3)
68
- print(f"[GPU] {torch.cuda.get_device_name(0)} β€” {vram_gb:.1f} GB VRAM", flush=True)
69
- if vram_gb >= 40: # A100 40GB β€” 전체 λ ˆμ΄μ–΄ GPU
70
- return -1 # -1 = all layers
71
- elif vram_gb >= 24: # A10G 24GB β€” μ•½ 25λ ˆμ΄μ–΄
72
- return 28
73
- elif vram_gb >= 16: # T4 16GB β€” μ•½ 15λ ˆμ΄μ–΄
74
- return 18
75
- else:
76
- return 10
77
- else:
78
- print("[GPU] No CUDA device found, CPU-only mode", flush=True)
79
- return 0
80
- except Exception as e:
81
- print(f"[GPU] Detection failed: {e}, using CPU", flush=True)
82
- return 0
83
-
84
- N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", str(detect_gpu_layers())))
85
- N_CTX = int(os.getenv("N_CTX", "32768"))
86
-
87
- print(f"[MODEL] Loading {REPO_ID} ...", flush=True)
88
- print(f"[MODEL] n_gpu_layers={N_GPU_LAYERS}, n_ctx={N_CTX}", flush=True)
89
-
90
- # ── Split GGUF: 3개 μƒ€λ“œ μ „λΆ€ λ‹€μš΄λ‘œλ“œ ν•„μˆ˜ ──
91
- from huggingface_hub import hf_hub_download
92
-
93
- GGUF_SHARDS = [
94
- "merged_109838c2-q8_0-00001-of-00003.gguf",
95
- "merged_109838c2-q8_0-00002-of-00003.gguf",
96
- "merged_109838c2-q8_0-00003-of-00003.gguf",
97
- ]
98
-
99
- shard_paths = []
100
- for shard in GGUF_SHARDS:
101
- print(f"[MODEL] Downloading {shard} ...", flush=True)
102
- p = hf_hub_download(repo_id=REPO_ID, filename=shard)
103
- shard_paths.append(p)
104
- print(f"[MODEL] β†’ {p}", flush=True)
105
-
106
- # 첫 번째 μƒ€λ“œ 경둜둜 λ‘œλ“œ (llama.cppκ°€ 같은 ν΄λ”μ˜ λ‚˜λ¨Έμ§€ μžλ™ 감지)
107
- llm = Llama(
108
- model_path=shard_paths[0],
109
- n_gpu_layers=N_GPU_LAYERS,
110
- n_ctx=N_CTX,
111
- verbose=True,
112
  )
113
- print(f"[MODEL] {MODEL_NAME} loaded βœ“", flush=True)
 
114
 
115
  # ══════════════════════════════════════════════════════════════════════════════
116
  # 3. THINKING MODE HELPERS
@@ -181,7 +129,7 @@ def format_response(raw: str) -> str:
181
  return raw
182
 
183
  # ══════════════════════════════════════════════════════════════════════════════
184
- # 4. GENERATION β€” llama-cpp-python 슀트리밍 (μ΄ˆκ°„λ‹¨)
185
  # ══════════════════════════════════════════════════════════════════════════════
186
  def generate_reply(
187
  message: str,
@@ -232,38 +180,55 @@ def generate_reply(
232
  _, clean = parse_think_blocks(at)
233
  messages.append({"role":"assistant","content":clean})
234
 
235
- # PDF ν…μŠ€νŠΈκ°€ image_input에 λ“€μ–΄μ˜¬ 수 있음 (ν”„λ‘ νŠΈμ—”λ“œ ν˜Έν™˜)
236
  messages.append({"role": "user", "content": message})
237
 
238
- print(f"[GEN] msgs={len(messages)}, max_new={max_new_tokens}, temp={temperature}", flush=True)
239
-
240
- # ── llama-cpp 슀트리밍 β€” μ‹¬ν”Œ! ──
241
  try:
242
- stream = llm.create_chat_completion(
243
- messages=messages,
244
- max_tokens=max_new_tokens,
245
- temperature=max(temperature, 0.01) if temperature > 0.01 else 0.0,
246
- top_p=float(top_p),
247
- stream=True,
248
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
- raw = ""
251
- for chunk in stream:
252
- delta = chunk.get("choices", [{}])[0].get("delta", {})
253
- token = delta.get("content", "")
254
- if token:
255
- raw += token
256
- yield format_response(raw)
257
-
258
- if raw:
259
- print(f"[GEN] Done β€” {len(raw)} chars", flush=True)
260
- yield format_response(raw)
261
- else:
262
- yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
263
 
 
 
 
 
 
264
  except Exception as e:
265
- print(f"[GEN] Error: {e}", flush=True)
266
- yield f"**❌ Generation error:** `{e}`"
 
 
 
 
 
 
 
 
 
 
267
 
268
 
269
  # ══════════════════════════════════════════════════════════════════════════════
@@ -373,7 +338,7 @@ async def oauth_logout(request: Request):
373
 
374
  @fapp.get("/health")
375
  async def health():
376
- return {"status": "ok", "model": MODEL_NAME, "gpu_layers": N_GPU_LAYERS, "ctx": N_CTX}
377
 
378
  # ── Web Search API (Brave) ──
379
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
@@ -424,10 +389,10 @@ async def api_extract_pdf(request: Request):
424
  return JSONResponse({"error": str(e)}, status_code=500)
425
 
426
  # ══════════════════════════════════════════════════════════════════════════════
427
- # 7. MOUNT & RUN β€” μ „μš© GPUμ΄λ―€λ‘œ uvicorn.run() 정상 μ‚¬μš©
428
  # ══════════════════════════════════════════════════════════════════════════════
429
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
430
 
431
  if __name__ == "__main__":
432
- print(f"[BOOT] {MODEL_NAME} Β· llama-cpp Β· GPU layers: {N_GPU_LAYERS}", flush=True)
433
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  """
2
+ 🧬 Darwin-9B-Opus β€” transformers Direct Serving
3
+ μ „μš© GPU Β· Qwen3.5 9B Β· BF16 Β· Streaming Β· μ»€μŠ€ν…€ ν”„λ‘ νŠΈμ—”λ“œ
4
  """
5
+ import sys
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import base64, os, re, json, io
9
  from typing import Generator, Optional
10
+ from threading import Thread
11
 
12
+ import torch
13
  import gradio as gr
14
+ print(f"[BOOT] gradio {gr.__version__}, torch {torch.__version__}", flush=True)
15
 
16
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
17
  import requests, httpx, uvicorn
18
  from fastapi import FastAPI, Request
19
  from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
 
26
  # ══════════════════════════════════════════════════════════════════════════════
27
  # 1. MODEL CONFIG
28
  # ══════════════════════════════════════════════════════════════════════════════
29
+ MODEL_ID = "FINAL-Bench/Darwin-9B-Opus"
30
+ MODEL_NAME = "Darwin-9B-Opus"
 
31
  MODEL_CAP = {
32
+ "arch": "Qwen3.5 Dense", "active": "9B",
33
+ "ctx": "131K", "thinking": True, "vision": False,
34
  "max_tokens": 16384, "temp_max": 1.5,
35
  }
36
 
37
  PRESETS = {
38
+ "general": "You are Darwin-9B-Opus, a highly capable reasoning model created by VIDRAFT via evolutionary merge. Think step by step for complex questions.",
39
  "code": "You are an expert software engineer. Write clean, efficient, well-commented code. Explain your approach before writing. Use modern best practices.",
40
  "math": "You are a world-class mathematician. Break problems step-by-step. Show full working. Use LaTeX where helpful.",
41
  "creative": "You are a brilliant creative writer. Be imaginative, vivid, and engaging. Adapt tone and style to the request.",
 
44
  }
45
 
46
  # ══════════════════════════════════════════════════════════════════════════════
47
+ # 2. MODEL LOADING β€” transformers + BF16
48
  # ══════════════════════════════════════════════════════════════════════════════
49
+ print(f"[MODEL] Loading {MODEL_ID} ...", flush=True)
50
+
51
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
52
+ print("[MODEL] Tokenizer loaded", flush=True)
53
+
54
+ model = AutoModelForCausalLM.from_pretrained(
55
+ MODEL_ID,
56
+ torch_dtype=torch.bfloat16,
57
+ device_map="auto",
58
+ trust_remote_code=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  )
60
+ model.eval()
61
+ print(f"[MODEL] {MODEL_NAME} loaded βœ“ β€” device: {model.device}, dtype: {model.dtype}", flush=True)
62
 
63
  # ══════════════════════════════════════════════════════════════════════════════
64
  # 3. THINKING MODE HELPERS
 
129
  return raw
130
 
131
  # ══════════════════════════════════════════════════════════════════════════════
132
+ # 4. GENERATION β€” transformers + TextIteratorStreamer
133
  # ══════════════════════════════════════════════════════════════════════════════
134
  def generate_reply(
135
  message: str,
 
180
  _, clean = parse_think_blocks(at)
181
  messages.append({"role":"assistant","content":clean})
182
 
 
183
  messages.append({"role": "user", "content": message})
184
 
185
+ # ── ν† ν¬λ‚˜μ΄μ¦ˆ ──
 
 
186
  try:
187
+ text_prompt = tokenizer.apply_chat_template(
188
+ messages, tokenize=False, add_generation_prompt=True,
 
 
 
 
189
  )
190
+ inputs = tokenizer(text_prompt, return_tensors="pt").to(model.device)
191
+ except Exception as e:
192
+ yield f"**❌ Tokenization error:** `{e}`"
193
+ return
194
+
195
+ input_len = inputs["input_ids"].shape[-1]
196
+ print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, temp={temperature}", flush=True)
197
+
198
+ # ── 슀트리밍 ──
199
+ streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
200
+
201
+ gen_kwargs = dict(
202
+ **inputs,
203
+ max_new_tokens=max_new_tokens,
204
+ do_sample=temperature > 0.01,
205
+ temperature=max(temperature, 0.01) if temperature > 0.01 else 1.0,
206
+ top_p=float(top_p),
207
+ streamer=streamer,
208
+ use_cache=True,
209
+ )
210
 
211
+ thread = Thread(target=model.generate, kwargs=gen_kwargs)
212
+ thread.start()
 
 
 
 
 
 
 
 
 
 
 
213
 
214
+ output = ""
215
+ try:
216
+ for text in streamer:
217
+ output += text
218
+ yield format_response(output)
219
  except Exception as e:
220
+ if output:
221
+ yield format_response(output)
222
+ else:
223
+ yield f"**❌ Generation error:** `{e}`"
224
+
225
+ thread.join()
226
+
227
+ if output:
228
+ print(f"[GEN] Done β€” {len(output)} chars", flush=True)
229
+ yield format_response(output)
230
+ else:
231
+ yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
232
 
233
 
234
  # ══════════════════════════════════════════════════════════════════════════════
 
338
 
339
  @fapp.get("/health")
340
  async def health():
341
+ return {"status": "ok", "model": MODEL_ID, "device": str(model.device), "dtype": str(model.dtype)}
342
 
343
  # ── Web Search API (Brave) ──
344
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
 
389
  return JSONResponse({"error": str(e)}, status_code=500)
390
 
391
  # ══════════════════════════════════════════════════════════════════════════════
392
+ # 7. MOUNT & RUN
393
  # ══════════════════════════════════════════════════════════════════════════════
394
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
395
 
396
  if __name__ == "__main__":
397
+ print(f"[BOOT] {MODEL_NAME} Β· transformers Β· Ready", flush=True)
398
  uvicorn.run(app, host="0.0.0.0", port=7860)