SeaWolf-AI committed on
Commit
815e9c3
·
verified ·
1 Parent(s): d75309f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -106
app.py CHANGED
@@ -1,30 +1,19 @@
1
  """
2
- 🧬 Darwin-35B-A3B-Opus Q8 GGUF β€” llama-cpp-python Direct Serving
3
- μ „μš© GPU Β· OpenAI-compatible streaming Β· μ»€μŠ€ν…€ ν”„λ‘ νŠΈμ—”λ“œ
4
  """
5
- import sys, subprocess
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
8
- # ── llama-cpp-python CUDA μ„€μΉ˜ 확인 ──
9
- try:
10
- from llama_cpp import Llama
11
- print("[BOOT] llama-cpp-python already installed", flush=True)
12
- except ImportError:
13
- print("[BOOT] Installing llama-cpp-python with CUDA...", flush=True)
14
- subprocess.check_call([
15
- sys.executable, "-m", "pip", "install",
16
- "llama-cpp-python", "--no-cache-dir", "--prefer-binary",
17
- "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu124",
18
- ])
19
- from llama_cpp import Llama
20
- print("[BOOT] llama-cpp-python installed βœ“", flush=True)
21
-
22
  import base64, os, re, json, io
23
  from typing import Generator, Optional
 
24
 
 
25
  import gradio as gr
26
- print(f"[BOOT] gradio {gr.__version__}", flush=True)
27
 
 
28
  import requests, httpx, uvicorn
29
  from fastapi import FastAPI, Request
30
  from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
@@ -37,17 +26,16 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
37
  # ══════════════════════════════════════════════════════════════════════════════
38
  # 1. MODEL CONFIG
39
  # ══════════════════════════════════════════════════════════════════════════════
40
- REPO_ID = "FINAL-Bench/Darwin-35B-A3B-Opus-Q8-GGUF"
41
- GGUF_FILE = "merged_109838c2-q8_0-00001-of-00003.gguf"
42
- MODEL_NAME = "Darwin-35B-A3B-Opus-Q8"
43
  MODEL_CAP = {
44
- "arch": "MoE", "active": "3B / 35B total",
45
- "ctx": "262K", "thinking": True, "vision": False,
46
  "max_tokens": 16384, "temp_max": 1.5,
47
  }
48
 
49
  PRESETS = {
50
- "general": "You are Darwin-35B-A3B-Opus, a highly capable reasoning model created by VIDRAFT via evolutionary merge. Think step by step for complex questions.",
51
  "code": "You are an expert software engineer. Write clean, efficient, well-commented code. Explain your approach before writing. Use modern best practices.",
52
  "math": "You are a world-class mathematician. Break problems step-by-step. Show full working. Use LaTeX where helpful.",
53
  "creative": "You are a brilliant creative writer. Be imaginative, vivid, and engaging. Adapt tone and style to the request.",
@@ -56,61 +44,21 @@ PRESETS = {
56
  }
57
 
58
  # ══════════════════════════════════════════════════════════════════════════════
59
- # 2. VRAM 감지 + λͺ¨λΈ λ‘œλ”©
60
  # ══════════════════════════════════════════════════════════════════════════════
61
- def detect_gpu_layers() -> int:
62
- """μ‚¬μš© κ°€λŠ₯ν•œ VRAM에 따라 n_gpu_layers μžλ™ κ²°μ •"""
63
- try:
64
- import torch
65
- if torch.cuda.is_available():
66
- props = torch.cuda.get_device_properties(0)
67
- vram_gb = (getattr(props, 'total_memory', 0) or getattr(props, 'total_mem', 0)) / (1024**3)
68
- print(f"[GPU] {torch.cuda.get_device_name(0)} β€” {vram_gb:.1f} GB VRAM", flush=True)
69
- if vram_gb >= 40: # A100 40GB β€” 전체 λ ˆμ΄μ–΄ GPU
70
- return -1 # -1 = all layers
71
- elif vram_gb >= 24: # A10G 24GB β€” μ•½ 25λ ˆμ΄μ–΄
72
- return 28
73
- elif vram_gb >= 16: # T4 16GB β€” μ•½ 15λ ˆμ΄μ–΄
74
- return 18
75
- else:
76
- return 10
77
- else:
78
- print("[GPU] No CUDA device found, CPU-only mode", flush=True)
79
- return 0
80
- except Exception as e:
81
- print(f"[GPU] Detection failed: {e}, using CPU", flush=True)
82
- return 0
83
-
84
- N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", str(detect_gpu_layers())))
85
- N_CTX = int(os.getenv("N_CTX", "32768"))
86
-
87
- print(f"[MODEL] Loading {REPO_ID} ...", flush=True)
88
- print(f"[MODEL] n_gpu_layers={N_GPU_LAYERS}, n_ctx={N_CTX}", flush=True)
89
-
90
- # ── Split GGUF: 3개 μƒ€λ“œ μ „λΆ€ λ‹€μš΄λ‘œλ“œ ν•„μˆ˜ ──
91
- from huggingface_hub import hf_hub_download
92
-
93
- GGUF_SHARDS = [
94
- "merged_109838c2-q8_0-00001-of-00003.gguf",
95
- "merged_109838c2-q8_0-00002-of-00003.gguf",
96
- "merged_109838c2-q8_0-00003-of-00003.gguf",
97
- ]
98
-
99
- shard_paths = []
100
- for shard in GGUF_SHARDS:
101
- print(f"[MODEL] Downloading {shard} ...", flush=True)
102
- p = hf_hub_download(repo_id=REPO_ID, filename=shard)
103
- shard_paths.append(p)
104
- print(f"[MODEL] β†’ {p}", flush=True)
105
-
106
- # 첫 번째 μƒ€λ“œ 경둜둜 λ‘œλ“œ (llama.cppκ°€ 같은 ν΄λ”μ˜ λ‚˜λ¨Έμ§€ μžλ™ 감지)
107
- llm = Llama(
108
- model_path=shard_paths[0],
109
- n_gpu_layers=N_GPU_LAYERS,
110
- n_ctx=N_CTX,
111
- verbose=True,
112
  )
113
- print(f"[MODEL] {MODEL_NAME} loaded βœ“", flush=True)
 
114
 
115
  # ══════════════════════════════════════════════════════════════════════════════
116
  # 3. THINKING MODE HELPERS
@@ -181,7 +129,7 @@ def format_response(raw: str) -> str:
181
  return raw
182
 
183
  # ══════════════════════════════════════════════════════════════════════════════
184
- # 4. GENERATION β€” llama-cpp-python 슀트리밍 (μ΄ˆκ°„λ‹¨)
185
  # ══════════════════════════════════════════════════════════════════════════════
186
  def generate_reply(
187
  message: str,
@@ -232,38 +180,55 @@ def generate_reply(
232
  _, clean = parse_think_blocks(at)
233
  messages.append({"role":"assistant","content":clean})
234
 
235
- # PDF ν…μŠ€νŠΈκ°€ image_input에 λ“€μ–΄μ˜¬ 수 있음 (ν”„λ‘ νŠΈμ—”λ“œ ν˜Έν™˜)
236
  messages.append({"role": "user", "content": message})
237
 
238
- print(f"[GEN] msgs={len(messages)}, max_new={max_new_tokens}, temp={temperature}", flush=True)
239
-
240
- # ── llama-cpp 슀트리밍 β€” μ‹¬ν”Œ! ──
241
  try:
242
- stream = llm.create_chat_completion(
243
- messages=messages,
244
- max_tokens=max_new_tokens,
245
- temperature=max(temperature, 0.01) if temperature > 0.01 else 0.0,
246
- top_p=float(top_p),
247
- stream=True,
248
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
- raw = ""
251
- for chunk in stream:
252
- delta = chunk.get("choices", [{}])[0].get("delta", {})
253
- token = delta.get("content", "")
254
- if token:
255
- raw += token
256
- yield format_response(raw)
257
-
258
- if raw:
259
- print(f"[GEN] Done β€” {len(raw)} chars", flush=True)
260
- yield format_response(raw)
261
- else:
262
- yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
263
 
 
 
 
 
 
264
  except Exception as e:
265
- print(f"[GEN] Error: {e}", flush=True)
266
- yield f"**❌ Generation error:** `{e}`"
 
 
 
 
 
 
 
 
 
 
267
 
268
 
269
  # ══════════════════════════════════════════════════════════════════════════════
@@ -373,7 +338,7 @@ async def oauth_logout(request: Request):
373
 
374
  @fapp.get("/health")
375
  async def health():
376
- return {"status": "ok", "model": MODEL_NAME, "gpu_layers": N_GPU_LAYERS, "ctx": N_CTX}
377
 
378
  # ── Web Search API (Brave) ──
379
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
@@ -424,10 +389,10 @@ async def api_extract_pdf(request: Request):
424
  return JSONResponse({"error": str(e)}, status_code=500)
425
 
426
  # ══════════════════════════════════════════════════════════════════════════════
427
- # 7. MOUNT & RUN β€” μ „μš© GPUμ΄λ―€λ‘œ uvicorn.run() 정상 μ‚¬μš©
428
  # ══════════════════════════════════════════════════════════════════════════════
429
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
430
 
431
  if __name__ == "__main__":
432
- print(f"[BOOT] {MODEL_NAME} Β· llama-cpp Β· GPU layers: {N_GPU_LAYERS}", flush=True)
433
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  """
2
+ 🧬 Darwin-9B-Opus β€” transformers Direct Serving
3
+ μ „μš© GPU Β· Qwen3.5 9B Β· BF16 Β· Streaming Β· μ»€μŠ€ν…€ ν”„λ‘ νŠΈμ—”λ“œ
4
  """
5
+ import sys
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import base64, os, re, json, io
9
  from typing import Generator, Optional
10
+ from threading import Thread
11
 
12
+ import torch
13
  import gradio as gr
14
+ print(f"[BOOT] gradio {gr.__version__}, torch {torch.__version__}", flush=True)
15
 
16
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
17
  import requests, httpx, uvicorn
18
  from fastapi import FastAPI, Request
19
  from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
 
26
  # ══════════════════════════════════════════════════════════════════════════════
27
  # 1. MODEL CONFIG
28
  # ══════════════════════════════════════════════════════════════════════════════
29
+ MODEL_ID = "FINAL-Bench/Darwin-9B-Opus"
30
+ MODEL_NAME = "Darwin-9B-Opus"
 
31
  MODEL_CAP = {
32
+ "arch": "Qwen3.5 Dense", "active": "9B",
33
+ "ctx": "131K", "thinking": True, "vision": False,
34
  "max_tokens": 16384, "temp_max": 1.5,
35
  }
36
 
37
  PRESETS = {
38
+ "general": "You are Darwin-9B-Opus, a highly capable reasoning model created by VIDRAFT via evolutionary merge. Think step by step for complex questions.",
39
  "code": "You are an expert software engineer. Write clean, efficient, well-commented code. Explain your approach before writing. Use modern best practices.",
40
  "math": "You are a world-class mathematician. Break problems step-by-step. Show full working. Use LaTeX where helpful.",
41
  "creative": "You are a brilliant creative writer. Be imaginative, vivid, and engaging. Adapt tone and style to the request.",
 
44
  }
45
 
46
  # ══════════════════════════════════════════════════════════════════════════════
47
+ # 2. MODEL LOADING β€” transformers + BF16
48
  # ══════════════════════════════════════════════════════════════════════════════
49
+ print(f"[MODEL] Loading {MODEL_ID} ...", flush=True)
50
+
51
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
52
+ print("[MODEL] Tokenizer loaded", flush=True)
53
+
54
+ model = AutoModelForCausalLM.from_pretrained(
55
+ MODEL_ID,
56
+ torch_dtype=torch.bfloat16,
57
+ device_map="auto",
58
+ trust_remote_code=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  )
60
+ model.eval()
61
+ print(f"[MODEL] {MODEL_NAME} loaded βœ“ β€” device: {model.device}, dtype: {model.dtype}", flush=True)
62
 
63
  # ══════════════════════════════════════════════════════════════════════════════
64
  # 3. THINKING MODE HELPERS
 
129
  return raw
130
 
131
  # ══════════════════════════════════════════════════════════════════════════════
132
+ # 4. GENERATION β€” transformers + TextIteratorStreamer
133
  # ══════════════════════════════════════════════════════════════════════════════
134
  def generate_reply(
135
  message: str,
 
180
  _, clean = parse_think_blocks(at)
181
  messages.append({"role":"assistant","content":clean})
182
 
 
183
  messages.append({"role": "user", "content": message})
184
 
185
+ # ── ν† ν¬λ‚˜μ΄μ¦ˆ ──
 
 
186
  try:
187
+ text_prompt = tokenizer.apply_chat_template(
188
+ messages, tokenize=False, add_generation_prompt=True,
 
 
 
 
189
  )
190
+ inputs = tokenizer(text_prompt, return_tensors="pt").to(model.device)
191
+ except Exception as e:
192
+ yield f"**❌ Tokenization error:** `{e}`"
193
+ return
194
+
195
+ input_len = inputs["input_ids"].shape[-1]
196
+ print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, temp={temperature}", flush=True)
197
+
198
+ # ── 슀트리밍 ──
199
+ streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
200
+
201
+ gen_kwargs = dict(
202
+ **inputs,
203
+ max_new_tokens=max_new_tokens,
204
+ do_sample=temperature > 0.01,
205
+ temperature=max(temperature, 0.01) if temperature > 0.01 else 1.0,
206
+ top_p=float(top_p),
207
+ streamer=streamer,
208
+ use_cache=True,
209
+ )
210
 
211
+ thread = Thread(target=model.generate, kwargs=gen_kwargs)
212
+ thread.start()
 
 
 
 
 
 
 
 
 
 
 
213
 
214
+ output = ""
215
+ try:
216
+ for text in streamer:
217
+ output += text
218
+ yield format_response(output)
219
  except Exception as e:
220
+ if output:
221
+ yield format_response(output)
222
+ else:
223
+ yield f"**❌ Generation error:** `{e}`"
224
+
225
+ thread.join()
226
+
227
+ if output:
228
+ print(f"[GEN] Done β€” {len(output)} chars", flush=True)
229
+ yield format_response(output)
230
+ else:
231
+ yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
232
 
233
 
234
  # ══════════════════════════════════════════════════════════════════════════════
 
338
 
339
  @fapp.get("/health")
340
  async def health():
341
+ return {"status": "ok", "model": MODEL_ID, "device": str(model.device), "dtype": str(model.dtype)}
342
 
343
  # ── Web Search API (Brave) ──
344
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
 
389
  return JSONResponse({"error": str(e)}, status_code=500)
390
 
391
  # ══════════════════════════════════════════════════════════════════════════════
392
+ # 7. MOUNT & RUN
393
  # ══════════════════════════════════════════════════════════════════════════════
394
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
395
 
396
  if __name__ == "__main__":
397
+ print(f"[BOOT] {MODEL_NAME} Β· transformers Β· Ready", flush=True)
398
  uvicorn.run(app, host="0.0.0.0", port=7860)