walidsobhie-code commited on
Commit ·
c7f1596
1
Parent(s): 5ddf5f9
chore: Rename MCP server to Stack2.9
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- audit_async.py → audits/audit_async.py +0 -0
- audit_tools.py → audits/audit_tools.py +0 -0
- audit_tools_async.py → audits/audit_tools_async.py +0 -0
- cli/run_mcp_server.py +3 -2
- load_local.py → loaders/load_local.py +0 -0
- load_model_fix.py → loaders/load_model_fix.py +0 -0
- load_pure.py → loaders/load_pure.py +0 -0
- load_pytorch.py → loaders/load_pytorch.py +0 -0
- load_silent.py → loaders/load_silent.py +0 -0
- run_auto.py → runners/run_auto.py +0 -0
- run_cache.py → runners/run_cache.py +0 -0
- run_final.py → runners/run_final.py +0 -0
- run_full.py → runners/run_full.py +0 -0
- run_local.py → runners/run_local.py +0 -0
- run_quiet.py → runners/run_quiet.py +0 -0
- run_qwen.py → runners/run_qwen.py +0 -0
- run_simple.py → runners/run_simple.py +0 -0
- run_v2.py → runners/run_v2.py +0 -0
- scripts/augment_data.py +0 -124
- scripts/augment_training_data.py +0 -324
- scripts/combine_all.py +0 -135
- scripts/combine_datasets.py +0 -144
- scripts/convert_gguf.py +0 -141
- scripts/convert_to_gguf.py +0 -210
- scripts/create_mini_dataset.py +0 -180
- scripts/download_benchmark_datasets.py +0 -127
- scripts/download_public_datasets.py +0 -170
- scripts/{compare_models.py → eval/compare_models.py} +0 -0
- scripts/{humaneval_eval.py → eval/humaneval_eval.py} +0 -0
- scripts/{mbpp_eval.py → eval/mbpp_eval.py} +0 -0
- scripts/{model_info.py → eval/model_info.py} +0 -0
- scripts/{tool_use_evaluator.py → eval/tool_use_evaluator.py} +0 -0
- scripts/extract_code_pairs.py +0 -215
- scripts/extract_patterns_from_git.py +0 -309
- scripts/extract_rtmp_tools.py +0 -174
- scripts/extract_rtmp_tools.ts +0 -115
- scripts/extract_rtmp_tools_advanced.py +0 -199
- scripts/generate_code_completion_data.py +0 -262
- scripts/generate_from_rtmp.ts +0 -114
- scripts/generate_random_synthetic.py +0 -141
- scripts/generate_synthetic.py +0 -256
- scripts/generate_synthetic_v2.py +0 -316
- scripts/generate_tool_data.py +0 -615
- scripts/generate_tool_use_tests.py +0 -163
- scripts/mine_sessions.py +0 -233
- scripts/quality_validate.py +0 -158
- scripts/training-data-extractor.js +0 -1098
- scripts/{fuse_lora_adapters.py → training/fuse_lora_adapters.py} +0 -0
- scripts/{merge_lora_adapters.py → training/merge_lora_adapters.py} +0 -0
- scripts/update_context_window.py +0 -190
audit_async.py → audits/audit_async.py
RENAMED
|
File without changes
|
audit_tools.py → audits/audit_tools.py
RENAMED
|
File without changes
|
audit_tools_async.py → audits/audit_tools_async.py
RENAMED
|
File without changes
|
cli/run_mcp_server.py
CHANGED
|
@@ -4,8 +4,9 @@
|
|
| 4 |
import sys
|
| 5 |
import os
|
| 6 |
|
| 7 |
-
# Ensure
|
| 8 |
-
|
|
|
|
| 9 |
|
| 10 |
from src.mcp_server import main
|
| 11 |
|
|
|
|
| 4 |
import sys
|
| 5 |
import os
|
| 6 |
|
| 7 |
+
# Ensure project root is on the path
|
| 8 |
+
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 9 |
+
sys.path.insert(0, project_root)
|
| 10 |
|
| 11 |
from src.mcp_server import main
|
| 12 |
|
load_local.py → loaders/load_local.py
RENAMED
|
File without changes
|
load_model_fix.py → loaders/load_model_fix.py
RENAMED
|
File without changes
|
load_pure.py → loaders/load_pure.py
RENAMED
|
File without changes
|
load_pytorch.py → loaders/load_pytorch.py
RENAMED
|
File without changes
|
load_silent.py → loaders/load_silent.py
RENAMED
|
File without changes
|
run_auto.py → runners/run_auto.py
RENAMED
|
File without changes
|
run_cache.py → runners/run_cache.py
RENAMED
|
File without changes
|
run_final.py → runners/run_final.py
RENAMED
|
File without changes
|
run_full.py → runners/run_full.py
RENAMED
|
File without changes
|
run_local.py → runners/run_local.py
RENAMED
|
File without changes
|
run_quiet.py → runners/run_quiet.py
RENAMED
|
File without changes
|
run_qwen.py → runners/run_qwen.py
RENAMED
|
File without changes
|
run_simple.py → runners/run_simple.py
RENAMED
|
File without changes
|
run_v2.py → runners/run_v2.py
RENAMED
|
File without changes
|
scripts/augment_data.py
DELETED
|
@@ -1,124 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Data augmentation for training examples.
|
| 4 |
-
Increases dataset size by paraphrasing and variations.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import json
|
| 8 |
-
import random
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
from typing import List, Dict, Any
|
| 11 |
-
import argparse
|
| 12 |
-
|
| 13 |
-
# Paraphrase templates (rule-based, no LLM)
|
| 14 |
-
PARAPHRASES = {
|
| 15 |
-
"Read the file": ["Show me the contents of", "Open", "Display", "Fetch", "Get"],
|
| 16 |
-
"Create a new file": ["Write a file", "Generate", "Make a new file", "Add file"],
|
| 17 |
-
"Run": ["Execute", "Start", "Launch", "Invoke"],
|
| 18 |
-
"Search for": ["Find", "Look for", "Locate", "Grep for"],
|
| 19 |
-
"List all": ["Show all", "Display every", "Get list of"],
|
| 20 |
-
"Can you": ["Please", "Would you", "Kindly"],
|
| 21 |
-
"I need": ["I want", "I require", "Please provide"],
|
| 22 |
-
}
|
| 23 |
-
|
| 24 |
-
def paraphrase_text(text: str) -> str:
|
| 25 |
-
"""Apply simple paraphrasing to user prompt."""
|
| 26 |
-
result = text
|
| 27 |
-
for original, alternatives in PARAPHRASES.items():
|
| 28 |
-
if original in result:
|
| 29 |
-
replacement = random.choice(alternatives)
|
| 30 |
-
result = result.replace(original, replacement, 1)
|
| 31 |
-
return result
|
| 32 |
-
|
| 33 |
-
def augment_example(example: Dict[str, Any], variation_factor: float = 0.3) -> List[Dict[str, Any]]:
|
| 34 |
-
"""Generate variations of a single example."""
|
| 35 |
-
variations = [example] # Keep original
|
| 36 |
-
|
| 37 |
-
# Paraphrase user message
|
| 38 |
-
if random.random() < variation_factor:
|
| 39 |
-
new_ex = json.loads(json.dumps(example)) # Deep copy
|
| 40 |
-
original_user = new_ex["messages"][0]["content"]
|
| 41 |
-
new_ex["messages"][0]["content"] = paraphrase_text(original_user)
|
| 42 |
-
new_ex["source"] = "augmented_paraphrase"
|
| 43 |
-
variations.append(new_ex)
|
| 44 |
-
|
| 45 |
-
# Vary tool parameters (if any)
|
| 46 |
-
if "tool_use" in example["messages"][1]:
|
| 47 |
-
tool_input = example["messages"][1]["tool_use"]["input"]
|
| 48 |
-
if isinstance(tool_input, dict) and tool_input:
|
| 49 |
-
new_ex = json.loads(json.dumps(example))
|
| 50 |
-
# Randomly change file paths, commands, etc.
|
| 51 |
-
for key, val in new_ex["messages"][1]["tool_use"]["input"].items():
|
| 52 |
-
if key == "file_path" and isinstance(val, str):
|
| 53 |
-
# Change to a different plausible file
|
| 54 |
-
new_ex["messages"][1]["tool_use"]["input"][key] = random.choice([
|
| 55 |
-
"src/main.py", "README.md", "package.json", "config.yaml"
|
| 56 |
-
])
|
| 57 |
-
# Also update result if it contains the old file path
|
| 58 |
-
result_content = new_ex["messages"][2]["tool_result"]["content"]
|
| 59 |
-
new_ex["messages"][2]["tool_result"]["content"] = result_content.replace(val, new_ex["messages"][1]["tool_use"]["input"][key])
|
| 60 |
-
new_ex["source"] = "augmented_params"
|
| 61 |
-
variations.append(new_ex)
|
| 62 |
-
|
| 63 |
-
# Add filler words to user message
|
| 64 |
-
if random.random() < variation_factor * 0.5:
|
| 65 |
-
new_ex = json.loads(json.dumps(example))
|
| 66 |
-
fillers = [" please", " if you can", " when you have time", " thanks"]
|
| 67 |
-
user_msg = new_ex["messages"][0]["content"]
|
| 68 |
-
filler = random.choice(fillers)
|
| 69 |
-
new_ex["messages"][0]["content"] = user_msg + filler
|
| 70 |
-
new_ex["source"] = "augmented_filler"
|
| 71 |
-
variations.append(new_ex)
|
| 72 |
-
|
| 73 |
-
return variations
|
| 74 |
-
|
| 75 |
-
def main():
|
| 76 |
-
parser = argparse.ArgumentParser()
|
| 77 |
-
parser.add_argument("--input", type=str, default="training-data/scaled/template_synthetic.jsonl")
|
| 78 |
-
parser.add_argument("--output", type=str, default="training-data/scaled/augmented.jsonl")
|
| 79 |
-
parser.add_argument("--multiplier", type=int, default=3, help="How many times to multiply dataset")
|
| 80 |
-
args = parser.parse_args()
|
| 81 |
-
|
| 82 |
-
input_path = Path(args.input)
|
| 83 |
-
output_path = Path(args.output)
|
| 84 |
-
|
| 85 |
-
if not input_path.exists():
|
| 86 |
-
print(f"❌ Input file not found: {input_path}")
|
| 87 |
-
return
|
| 88 |
-
|
| 89 |
-
print(f"📈 Augmenting dataset: {input_path}")
|
| 90 |
-
examples = []
|
| 91 |
-
with open(input_path, 'r') as f:
|
| 92 |
-
for line in f:
|
| 93 |
-
examples.append(json.loads(line))
|
| 94 |
-
|
| 95 |
-
original_count = len(examples)
|
| 96 |
-
target_count = original_count * args.multiplier
|
| 97 |
-
print(f" Original: {original_count} examples")
|
| 98 |
-
print(f" Target: ~{target_count} examples (x{args.multiplier})")
|
| 99 |
-
|
| 100 |
-
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 101 |
-
|
| 102 |
-
generated = 0
|
| 103 |
-
with open(output_path, 'w') as f:
|
| 104 |
-
for ex in examples:
|
| 105 |
-
# Write original and variations
|
| 106 |
-
f.write(json.dumps(ex) + "\n")
|
| 107 |
-
generated += 1
|
| 108 |
-
|
| 109 |
-
# Generate variations until we reach multiplier
|
| 110 |
-
variations = augment_example(ex)
|
| 111 |
-
for var in variations[1:]: # Skip original (already written)
|
| 112 |
-
if generated < target_count:
|
| 113 |
-
f.write(json.dumps(var) + "\n")
|
| 114 |
-
generated += 1
|
| 115 |
-
|
| 116 |
-
if generated % 1000 == 0:
|
| 117 |
-
print(f" Generated {generated}/{target_count}...", end='\r')
|
| 118 |
-
|
| 119 |
-
print(f"\n✨ Augmented to {generated} examples")
|
| 120 |
-
print(f" Saved to: {output_path}")
|
| 121 |
-
print(f" Total dataset now: {original_count} → {generated} (x{generated/original_count:.1f})")
|
| 122 |
-
|
| 123 |
-
if __name__ == "__main__":
|
| 124 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/augment_training_data.py
DELETED
|
@@ -1,324 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Data augmentation script for tool_examples.jsonl.
|
| 4 |
-
Generates 2x-5x more training examples from existing data through:
|
| 5 |
-
- Paraphrasing user prompts
|
| 6 |
-
- Difficulty scaling (simpler/complex variations)
|
| 7 |
-
- Edge case generation
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
import json
|
| 11 |
-
import random
|
| 12 |
-
import argparse
|
| 13 |
-
from pathlib import Path
|
| 14 |
-
from typing import List, Dict, Any, Optional
|
| 15 |
-
from itertools import product
|
| 16 |
-
import copy
|
| 17 |
-
|
| 18 |
-
# Random seed for reproducibility
|
| 19 |
-
random.seed(42)
|
| 20 |
-
|
| 21 |
-
# Paraphrase templates
|
| 22 |
-
PARAPHRASES = {
|
| 23 |
-
"Can you": ["Please", "Would you kindly", "Could you", "Kindly"],
|
| 24 |
-
"I need": ["I'd like", "I require", "I want", "I must have"],
|
| 25 |
-
"show me": ["display", "show", "reveal", "let me see"],
|
| 26 |
-
"the file": ["this file", "that file", "a file"],
|
| 27 |
-
"run": ["execute", "launch", "start", "run"],
|
| 28 |
-
"create": ["make", "generate", "add", "write"],
|
| 29 |
-
"delete": ["remove", "erase", "drop", "destroy"],
|
| 30 |
-
"list": ["show", "display", "enumerate", "get"],
|
| 31 |
-
"search": ["find", "look for", "grep", "locate"],
|
| 32 |
-
"help me": ["assist me", "I need help", "please assist", "support"],
|
| 33 |
-
}
|
| 34 |
-
|
| 35 |
-
# Difficulty modifiers
|
| 36 |
-
EASY_MODIFIERS = [
|
| 37 |
-
"quickly",
|
| 38 |
-
"simply",
|
| 39 |
-
"just",
|
| 40 |
-
"easily",
|
| 41 |
-
]
|
| 42 |
-
|
| 43 |
-
COMPLEX_MODIFIERS = [
|
| 44 |
-
"carefully",
|
| 45 |
-
"thoroughly",
|
| 46 |
-
"in detail",
|
| 47 |
-
"completely",
|
| 48 |
-
"with all options",
|
| 49 |
-
]
|
| 50 |
-
|
| 51 |
-
# Edge case patterns
|
| 52 |
-
EDGE_CASE_PATTERNS = [
|
| 53 |
-
("empty_input", lambda ex: _create_empty_variant(ex)),
|
| 54 |
-
("multi_step", lambda ex: _create_multistep_variant(ex)),
|
| 55 |
-
("error_handling", lambda ex: _create_error_variant(ex)),
|
| 56 |
-
]
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def _deep_copy(obj: Any) -> Any:
|
| 60 |
-
"""Create a deep copy of a JSON-serializable object."""
|
| 61 |
-
return json.loads(json.dumps(obj))
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
def _create_empty_variant(example: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
| 65 |
-
"""Create variant with empty/blank user input."""
|
| 66 |
-
new_ex = _deep_copy(example)
|
| 67 |
-
# Keep system message, empty user message
|
| 68 |
-
for msg in new_ex["messages"]:
|
| 69 |
-
if msg["role"] == "user":
|
| 70 |
-
msg["content"] = " "
|
| 71 |
-
break
|
| 72 |
-
new_ex["source"] = "augmented_edge_empty"
|
| 73 |
-
return new_ex
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
def _create_multistep_variant(example: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
| 77 |
-
"""Create variant simulating multi-step reasoning."""
|
| 78 |
-
new_ex = _deep_copy(example)
|
| 79 |
-
# Add reasoning step before tool call
|
| 80 |
-
for i, msg in enumerate(new_ex["messages"]):
|
| 81 |
-
if msg.get("tool_calls"):
|
| 82 |
-
reasoning = {
|
| 83 |
-
"role": "assistant",
|
| 84 |
-
"content": "Let me think about this step by step. First, I need to understand what the user is asking for."
|
| 85 |
-
}
|
| 86 |
-
new_ex["messages"].insert(i, reasoning)
|
| 87 |
-
break
|
| 88 |
-
new_ex["source"] = "augmented_edge_multistep"
|
| 89 |
-
return new_ex
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
def _create_error_variant(example: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
| 93 |
-
"""Create variant simulating error handling."""
|
| 94 |
-
new_ex = _deep_copy(example)
|
| 95 |
-
for msg in new_ex["messages"]:
|
| 96 |
-
if msg.get("role") == "tool":
|
| 97 |
-
# Simulate an error in tool result
|
| 98 |
-
if "Successfully" in msg.get("content", ""):
|
| 99 |
-
msg["content"] = msg["content"].replace("Successfully", "Error occurred:")
|
| 100 |
-
elif "error" not in msg.get("content", "").lower():
|
| 101 |
-
msg["content"] = "Operation failed: Permission denied"
|
| 102 |
-
break
|
| 103 |
-
new_ex["source"] = "augmented_edge_error"
|
| 104 |
-
return new_ex
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
def paraphrase_text(text: str) -> str:
|
| 108 |
-
"""Apply simple paraphrasing to text."""
|
| 109 |
-
if not text:
|
| 110 |
-
return text
|
| 111 |
-
result = text
|
| 112 |
-
for original, alternatives in PARAPHRASES.items():
|
| 113 |
-
if original.lower() in result.lower():
|
| 114 |
-
# Case-insensitive replace, preserve original case pattern
|
| 115 |
-
idx = result.lower().find(original.lower())
|
| 116 |
-
prefix = result[:idx]
|
| 117 |
-
suffix = result[idx + len(original):]
|
| 118 |
-
replacement = random.choice(alternatives)
|
| 119 |
-
# Preserve case
|
| 120 |
-
if result[idx].isupper():
|
| 121 |
-
replacement = replacement.capitalize()
|
| 122 |
-
result = prefix + replacement + suffix
|
| 123 |
-
break
|
| 124 |
-
return result
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
def apply_difficulty(example: Dict[str, Any], level: str) -> Dict[str, Any]:
|
| 128 |
-
"""Apply difficulty scaling to an example."""
|
| 129 |
-
new_ex = _deep_copy(example)
|
| 130 |
-
modifiers = EASY_MODIFIERS if level == "easy" else COMPLEX_MODIFIERS
|
| 131 |
-
|
| 132 |
-
for msg in new_ex["messages"]:
|
| 133 |
-
if msg["role"] == "user" and msg.get("content"):
|
| 134 |
-
content = msg["content"]
|
| 135 |
-
if level == "easy":
|
| 136 |
-
# Simplify the request
|
| 137 |
-
content = content.replace("please", "").replace("kindly", "")
|
| 138 |
-
content = content.strip()
|
| 139 |
-
elif level == "complex":
|
| 140 |
-
# Add complexity
|
| 141 |
-
modifier = random.choice(modifiers)
|
| 142 |
-
content = f"{content} {modifier}"
|
| 143 |
-
msg["content"] = content
|
| 144 |
-
break
|
| 145 |
-
|
| 146 |
-
new_ex["source"] = f"augmented_difficulty_{level}"
|
| 147 |
-
return new_ex
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
def vary_tool_parameters(example: Dict[str, Any]) -> List[Dict[str, Any]]:
|
| 151 |
-
"""Generate variations with different tool parameters."""
|
| 152 |
-
variations = []
|
| 153 |
-
|
| 154 |
-
for msg in example.get("messages", []):
|
| 155 |
-
if msg.get("tool_calls"):
|
| 156 |
-
for tc in msg["tool_calls"]:
|
| 157 |
-
func = tc.get("function", {})
|
| 158 |
-
args_str = func.get("arguments", "{}")
|
| 159 |
-
try:
|
| 160 |
-
args = json.loads(args_str) if isinstance(args_str, str) else args_str
|
| 161 |
-
except (json.JSONDecodeError, TypeError):
|
| 162 |
-
continue
|
| 163 |
-
|
| 164 |
-
if not isinstance(args, dict):
|
| 165 |
-
continue
|
| 166 |
-
|
| 167 |
-
# Common parameter variations
|
| 168 |
-
param_variations = [
|
| 169 |
-
("file_path", ["src/main.py", "README.md", "config.yaml", "package.json", "tests/test.py"]),
|
| 170 |
-
("command", ["ls -la", "echo hello", "pwd", "whoami"]),
|
| 171 |
-
("pattern", ["*.py", "*.js", "*.md", "*.json"]),
|
| 172 |
-
("path", ["src", "lib", "docs", "."]),
|
| 173 |
-
]
|
| 174 |
-
|
| 175 |
-
for param_name, alternatives in param_variations:
|
| 176 |
-
if param_name in args:
|
| 177 |
-
original_val = args[param_name]
|
| 178 |
-
for alt_val in alternatives:
|
| 179 |
-
if alt_val != original_val:
|
| 180 |
-
new_ex = _deep_copy(example)
|
| 181 |
-
for new_msg in new_ex["messages"]:
|
| 182 |
-
if new_msg.get("tool_calls"):
|
| 183 |
-
for new_tc in new_msg["tool_calls"]:
|
| 184 |
-
new_func = new_tc.get("function", {})
|
| 185 |
-
new_args = json.loads(new_func.get("arguments", "{}"))
|
| 186 |
-
if param_name in new_args:
|
| 187 |
-
new_args[param_name] = alt_val
|
| 188 |
-
new_func["arguments"] = json.dumps(new_args)
|
| 189 |
-
new_ex["source"] = "augmented_params"
|
| 190 |
-
variations.append(new_ex)
|
| 191 |
-
break
|
| 192 |
-
|
| 193 |
-
return variations
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
def add_filler_variant(example: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
| 197 |
-
"""Add polite filler words to user message."""
|
| 198 |
-
fillers = [" please", " if you could", " when you get a chance", " thanks"]
|
| 199 |
-
|
| 200 |
-
new_ex = _deep_copy(example)
|
| 201 |
-
for msg in new_ex["messages"]:
|
| 202 |
-
if msg["role"] == "user" and msg.get("content"):
|
| 203 |
-
filler = random.choice(fillers)
|
| 204 |
-
msg["content"] = msg["content"].rstrip() + filler
|
| 205 |
-
break
|
| 206 |
-
|
| 207 |
-
new_ex["source"] = "augmented_filler"
|
| 208 |
-
return new_ex
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
def generate_edge_cases(example: Dict[str, Any], num_cases: int = 2) -> List[Dict[str, Any]]:
|
| 212 |
-
"""Generate edge case variations."""
|
| 213 |
-
cases = []
|
| 214 |
-
selected_patterns = random.sample(EDGE_CASE_PATTERNS, min(num_cases, len(EDGE_CASE_PATTERNS)))
|
| 215 |
-
|
| 216 |
-
for name, generator in selected_patterns:
|
| 217 |
-
try:
|
| 218 |
-
variant = generator(example)
|
| 219 |
-
if variant:
|
| 220 |
-
cases.append(variant)
|
| 221 |
-
except Exception:
|
| 222 |
-
continue
|
| 223 |
-
|
| 224 |
-
return cases
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
def augment_example(example: Dict[str, Any], target_multiplier: int = 3) -> List[Dict[str, Any]]:
|
| 228 |
-
"""Generate multiple augmented variations of a single example."""
|
| 229 |
-
variations = [example] # Always keep original
|
| 230 |
-
|
| 231 |
-
# 1. Paraphrase variant
|
| 232 |
-
if random.random() < 0.7:
|
| 233 |
-
new_ex = _deep_copy(example)
|
| 234 |
-
for msg in new_ex["messages"]:
|
| 235 |
-
if msg["role"] == "user" and msg.get("content"):
|
| 236 |
-
msg["content"] = paraphrase_text(msg["content"])
|
| 237 |
-
break
|
| 238 |
-
new_ex["source"] = "augmented_paraphrase"
|
| 239 |
-
variations.append(new_ex)
|
| 240 |
-
|
| 241 |
-
# 2. Difficulty variants (easy and complex)
|
| 242 |
-
if random.random() < 0.5:
|
| 243 |
-
variations.append(apply_difficulty(example, "easy"))
|
| 244 |
-
if random.random() < 0.5:
|
| 245 |
-
variations.append(apply_difficulty(example, "complex"))
|
| 246 |
-
|
| 247 |
-
# 3. Filler variant
|
| 248 |
-
if random.random() < 0.3:
|
| 249 |
-
filler_ex = add_filler_variant(example)
|
| 250 |
-
if filler_ex:
|
| 251 |
-
variations.append(filler_ex)
|
| 252 |
-
|
| 253 |
-
# 4. Tool parameter variations
|
| 254 |
-
param_variations = vary_tool_parameters(example)
|
| 255 |
-
variations.extend(param_variations[:2]) # Limit to 2
|
| 256 |
-
|
| 257 |
-
# 5. Edge cases
|
| 258 |
-
if random.random() < 0.3:
|
| 259 |
-
edge_cases = generate_edge_cases(example)
|
| 260 |
-
variations.extend(edge_cases[:1])
|
| 261 |
-
|
| 262 |
-
return variations[:target_multiplier] # Limit total variations
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
def main():
|
| 266 |
-
parser = argparse.ArgumentParser(description="Augment training data for Stack 2.9")
|
| 267 |
-
parser.add_argument("--input", type=str,
|
| 268 |
-
default="training-data/tool_examples.jsonl",
|
| 269 |
-
help="Input JSONL file")
|
| 270 |
-
parser.add_argument("--output", type=str,
|
| 271 |
-
default="training-data/augmented_tool_examples.jsonl",
|
| 272 |
-
help="Output JSONL file")
|
| 273 |
-
parser.add_argument("--multiplier", type=int, default=3,
|
| 274 |
-
help="Target multiplication factor (2-5)")
|
| 275 |
-
parser.add_argument("--seed", type=int, default=42,
|
| 276 |
-
help="Random seed for reproducibility")
|
| 277 |
-
|
| 278 |
-
args = parser.parse_args()
|
| 279 |
-
random.seed(args.seed)
|
| 280 |
-
|
| 281 |
-
input_path = Path(args.input)
|
| 282 |
-
output_path = Path(args.output)
|
| 283 |
-
|
| 284 |
-
if not input_path.exists():
|
| 285 |
-
print(f"Error: Input file not found: {input_path}")
|
| 286 |
-
return
|
| 287 |
-
|
| 288 |
-
print(f"Loading data from: {input_path}")
|
| 289 |
-
examples = []
|
| 290 |
-
with open(input_path, 'r', encoding='utf-8') as f:
|
| 291 |
-
for line in f:
|
| 292 |
-
line = line.strip()
|
| 293 |
-
if line:
|
| 294 |
-
try:
|
| 295 |
-
examples.append(json.loads(line))
|
| 296 |
-
except json.JSONDecodeError:
|
| 297 |
-
continue
|
| 298 |
-
|
| 299 |
-
original_count = len(examples)
|
| 300 |
-
print(f"Loaded {original_count} examples")
|
| 301 |
-
|
| 302 |
-
# Generate augmented examples
|
| 303 |
-
all_variations = []
|
| 304 |
-
for ex in examples:
|
| 305 |
-
variations = augment_example(ex, target_multiplier=args.multiplier)
|
| 306 |
-
all_variations.extend(variations)
|
| 307 |
-
|
| 308 |
-
total_count = len(all_variations)
|
| 309 |
-
|
| 310 |
-
# Write output
|
| 311 |
-
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 312 |
-
with open(output_path, 'w', encoding='utf-8') as f:
|
| 313 |
-
for var in all_variations:
|
| 314 |
-
f.write(json.dumps(var, ensure_ascii=False) + "\n")
|
| 315 |
-
|
| 316 |
-
print(f"\nAugmentation complete!")
|
| 317 |
-
print(f" Original: {original_count} examples")
|
| 318 |
-
print(f" Augmented: {total_count} examples")
|
| 319 |
-
print(f" Multiplier: {total_count/original_count:.1f}x")
|
| 320 |
-
print(f" Output: {output_path}")
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
if __name__ == "__main__":
|
| 324 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/combine_all.py
DELETED
|
@@ -1,135 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Final dataset combiner - loads all sources, deduplicates, splits.
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import json
|
| 7 |
-
import hashlib
|
| 8 |
-
import random
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
from datetime import datetime
|
| 11 |
-
import glob
|
| 12 |
-
|
| 13 |
-
def hash_messages(messages: list) -> str:
|
| 14 |
-
"""Create hash for deduplication."""
|
| 15 |
-
return hashlib.md5(json.dumps(messages, sort_keys=True).encode()).hexdigest()
|
| 16 |
-
|
| 17 |
-
def main():
|
| 18 |
-
output_dir = Path("training-data/final")
|
| 19 |
-
output_dir.mkdir(parents=True, exist_ok=True)
|
| 20 |
-
|
| 21 |
-
# Glob all potential source files
|
| 22 |
-
source_files = []
|
| 23 |
-
|
| 24 |
-
# Synthetic sources
|
| 25 |
-
source_files.extend(glob.glob("training-data/synthetic/*.jsonl"))
|
| 26 |
-
source_files.extend(glob.glob("training-data/advanced-patterns/*.jsonl"))
|
| 27 |
-
source_files.extend(glob.glob("training-data/scaled/*.jsonl"))
|
| 28 |
-
|
| 29 |
-
# Code pairs (JSON format)
|
| 30 |
-
source_files.extend(glob.glob("training-data/code-pairs/*.json"))
|
| 31 |
-
|
| 32 |
-
print(f"🔍 Found {len(source_files)} source files")
|
| 33 |
-
|
| 34 |
-
all_examples = []
|
| 35 |
-
seen_hashes = set()
|
| 36 |
-
source_counts = {}
|
| 37 |
-
|
| 38 |
-
for file_path in source_files:
|
| 39 |
-
path = Path(file_path)
|
| 40 |
-
source_name = path.stem
|
| 41 |
-
count = 0
|
| 42 |
-
|
| 43 |
-
try:
|
| 44 |
-
with open(path, 'r') as f:
|
| 45 |
-
for line in f:
|
| 46 |
-
line = line.strip()
|
| 47 |
-
if not line:
|
| 48 |
-
continue
|
| 49 |
-
|
| 50 |
-
try:
|
| 51 |
-
ex = json.loads(line)
|
| 52 |
-
|
| 53 |
-
# Convert code-pair format to message format
|
| 54 |
-
if "code" in ex and "comment" in ex and "messages" not in ex:
|
| 55 |
-
ex = {
|
| 56 |
-
"messages": [
|
| 57 |
-
{"role": "user", "content": f"Show me code for: {ex['comment'][:100]}"},
|
| 58 |
-
{"role": "assistant", "content": f"Here's a {ex.get('type', 'function')}:\n{ex['code']}"}
|
| 59 |
-
],
|
| 60 |
-
"source_original": source_name,
|
| 61 |
-
"type": "code_pair"
|
| 62 |
-
}
|
| 63 |
-
|
| 64 |
-
# Deduplication
|
| 65 |
-
if "messages" in ex:
|
| 66 |
-
msg_hash = hash_messages(ex["messages"])
|
| 67 |
-
if msg_hash in seen_hashes:
|
| 68 |
-
continue
|
| 69 |
-
seen_hashes.add(msg_hash)
|
| 70 |
-
|
| 71 |
-
# Track source
|
| 72 |
-
ex["source_original"] = ex.get("source_original", source_name)
|
| 73 |
-
all_examples.append(ex)
|
| 74 |
-
count += 1
|
| 75 |
-
except json.JSONDecodeError:
|
| 76 |
-
continue
|
| 77 |
-
|
| 78 |
-
except Exception as e:
|
| 79 |
-
print(f" ⚠️ Error reading {path}: {e}")
|
| 80 |
-
continue
|
| 81 |
-
|
| 82 |
-
source_counts[source_name] = count
|
| 83 |
-
if count > 0:
|
| 84 |
-
print(f" ✅ {source_name}: {count} examples")
|
| 85 |
-
|
| 86 |
-
print(f"\n✨ Total unique examples: {len(all_examples)}")
|
| 87 |
-
print(f" Deduplication removed: {sum(source_counts.values()) - len(all_examples)}")
|
| 88 |
-
|
| 89 |
-
# Shuffle
|
| 90 |
-
random.seed(42)
|
| 91 |
-
random.shuffle(all_examples)
|
| 92 |
-
|
| 93 |
-
# Splits (80/10/10)
|
| 94 |
-
n = len(all_examples)
|
| 95 |
-
n_train = int(n * 0.8)
|
| 96 |
-
n_val = int(n * 0.1)
|
| 97 |
-
|
| 98 |
-
splits = {
|
| 99 |
-
"train": all_examples[:n_train],
|
| 100 |
-
"val": all_examples[n_train:n_train+n_val],
|
| 101 |
-
"test": all_examples[n_train+n_val:]
|
| 102 |
-
}
|
| 103 |
-
|
| 104 |
-
for split_name, data in splits.items():
|
| 105 |
-
out_path = output_dir / f"{split_name}.jsonl"
|
| 106 |
-
with open(out_path, 'w') as f:
|
| 107 |
-
for ex in data:
|
| 108 |
-
f.write(json.dumps(ex) + "\n")
|
| 109 |
-
print(f" 📁 {split_name}: {len(data)} -> {out_path}")
|
| 110 |
-
|
| 111 |
-
# Manifest
|
| 112 |
-
manifest = {
|
| 113 |
-
"dataset": "Stack 2.9 Training Data",
|
| 114 |
-
"version": "1.0",
|
| 115 |
-
"created": datetime.now().isoformat(),
|
| 116 |
-
"total_examples": len(all_examples),
|
| 117 |
-
"splits": {name: len(data) for name, data in splits.items()},
|
| 118 |
-
"source_breakdown": source_counts,
|
| 119 |
-
"note": "Combined from multiple synthetic and code-pair sources"
|
| 120 |
-
}
|
| 121 |
-
|
| 122 |
-
with open(output_dir / "manifest.json", 'w') as f:
|
| 123 |
-
json.dump(manifest, f, indent=2)
|
| 124 |
-
|
| 125 |
-
print(f"\n✅ Final dataset ready!")
|
| 126 |
-
print(f" Total: {len(all_examples)} examples")
|
| 127 |
-
print(f" Manifest: {output_dir / 'manifest.json'}")
|
| 128 |
-
|
| 129 |
-
if len(all_examples) >= 50000:
|
| 130 |
-
print("\n🎉 TARGET ACHIEVED: 50,000+ examples!")
|
| 131 |
-
else:
|
| 132 |
-
print(f"\n⚠️ Still need {50000 - len(all_examples)} more to reach 50K target")
|
| 133 |
-
|
| 134 |
-
if __name__ == "__main__":
|
| 135 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/combine_datasets.py
DELETED
|
@@ -1,144 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Combine all training data sources into final dataset.
|
| 4 |
-
Applies deduplication and quality filtering.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import json
|
| 8 |
-
import hashlib
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
import argparse
|
| 11 |
-
from datetime import datetime
|
| 12 |
-
|
| 13 |
-
def hash_messages(messages: list) -> str:
|
| 14 |
-
"""Create a hash of messages to detect duplicates."""
|
| 15 |
-
m = hashlib.md5()
|
| 16 |
-
m.update(json.dumps(messages, sort_keys=True).encode())
|
| 17 |
-
return m.hexdigest()
|
| 18 |
-
|
| 19 |
-
def main():
    """Combine all training-data sources into deduplicated train/val/test splits.

    Reads each known source file (JSONL, one example per line), normalizes
    code-pair records into chat-message format, removes exact duplicates by
    hashing the message list, shuffles deterministically, splits by the
    configured fractions, and writes the splits plus a manifest next to
    --output.
    """
    # BUGFIX: imported here so main() is self-contained. The original relied
    # on `import random` inside the __main__ guard, which raised NameError
    # whenever this module was imported and main() called directly.
    import random

    parser = argparse.ArgumentParser()
    parser.add_argument("--output", type=str, default="training-data/final/dataset.jsonl")
    parser.add_argument("--train-size", type=float, default=0.8)
    parser.add_argument("--val-size", type=float, default=0.1)
    parser.add_argument("--max-dataset", type=int, default=50000, help="Max examples to include")
    args = parser.parse_args()

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # (file path, source tag) pairs; missing files are skipped with a warning.
    sources = [
        ("training-data/synthetic/examples.jsonl", "original_synthetic"),
        ("training-data/advanced-patterns/examples.jsonl", "advanced_patterns"),
        ("training-data/code-pairs/pairs.json", "code_pairs"),
        ("training-data/code-pairs/extended_pairs.json", "code_pairs_extended"),
        ("training-data/scaled/synthetic_final.jsonl", "synthetic_augmented"),
        ("training-data/scaled/random_10k.jsonl", "random_10k"),
        ("training-data/scaled/random_5_5k.jsonl", "random_5k"),
    ]

    all_examples = []
    seen_hashes = set()
    duplicates_removed = 0

    print("📦 Combining datasets...")
    for file_path, source in sources:
        path = Path(file_path)
        if not path.exists():
            print(f"  ⚠️ Not found: {path}")
            continue

        print(f"  Loading {source}...")
        count = 0
        # NOTE(review): the .json sources are parsed line-by-line like JSONL;
        # a pretty-printed JSON array would be silently skipped by the
        # JSONDecodeError handler below — confirm those files are JSONL.
        with open(path, 'r') as f:
            for line in f:
                try:
                    ex = json.loads(line)

                    # Normalize code-pair records into chat-message format.
                    if "code" in ex and "comment" in ex:
                        ex = {
                            "messages": [
                                {"role": "user", "content": ex["comment"]},
                                {"role": "assistant", "content": f"Here's the code:\n{ex['code']}"}
                            ],
                            "source": source,
                            "type": "code_pair"
                        }

                    # Exact-duplicate removal keyed on the message list.
                    msg_hash = hash_messages(ex["messages"])
                    if msg_hash in seen_hashes:
                        duplicates_removed += 1
                        continue
                    seen_hashes.add(msg_hash)

                    # Tag each example with where it came from.
                    ex["source_original"] = source
                    all_examples.append(ex)
                    count += 1

                    if len(all_examples) >= args.max_dataset:
                        break

                except json.JSONDecodeError:
                    continue

        print(f"  ✅ Added {count} examples")

    print(f"\n✨ Total collected: {len(all_examples)} examples")
    print(f"  Duplicates removed: {duplicates_removed}")

    # Deterministic shuffle so reruns produce identical splits.
    random.seed(42)
    random.shuffle(all_examples)

    # Split train/val/test; the test split takes whatever remains.
    n_total = len(all_examples)
    n_train = int(n_total * args.train_size)
    n_val = int(n_total * args.val_size)

    train_set = all_examples[:n_train]
    val_set = all_examples[n_train:n_train + n_val]
    test_set = all_examples[n_train + n_val:]

    print(f"\n📊 Split:")
    print(f"  Train: {len(train_set)}")
    print(f"  Val: {len(val_set)}")
    print(f"  Test: {len(test_set)}")

    # Save splits as JSONL next to --output.
    for split_name, split_data in [("train", train_set), ("val", val_set), ("test", test_set)]:
        split_path = output_path.parent / f"{split_name}.jsonl"
        with open(split_path, 'w') as f:
            for ex in split_data:
                f.write(json.dumps(ex) + "\n")
        print(f"  Saved {split_name} to {split_path}")

    # Manifest records split sizes and a per-source example count.
    manifest = {
        "dataset": "Stack 2.9 Training Data",
        "version": "1.0",
        "created": datetime.now().isoformat(),
        "total_examples": n_total,
        "splits": {
            "train": len(train_set),
            "val": len(val_set),
            "test": len(test_set)
        },
        "sources": {src: sum(1 for ex in all_examples if ex.get("source_original") == src) for src in set(ex.get("source_original") for ex in all_examples)}
    }

    manifest_path = output_path.parent / "manifest.json"
    with open(manifest_path, 'w') as f:
        json.dump(manifest, f, indent=2)
    print(f"\n📄 Manifest: {manifest_path}")

    print("\n✅ Dataset complete!")
|
| 141 |
-
|
| 142 |
-
if __name__ == "__main__":
    # NOTE(review): `random` is imported here rather than at module top even
    # though main() uses it — main() therefore only works when this file is
    # run as a script; importing the module and calling main() directly would
    # raise NameError. Consider moving the import to the top of the file.
    import random
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/convert_gguf.py
DELETED
|
@@ -1,141 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
"""
Convert GGUF file to HuggingFace format
"""
import os
import sys

# Check for required packages.
# HACK: installing packages via os.system at import time is fragile (may hit
# the wrong interpreter, ignores failures) — prefer a requirements file.
try:
    import gguf
except ImportError:
    print("Installing gguf...")
    os.system("pip install gguf -q")
    import gguf

# NOTE(review): AutoModel/AutoTokenizer (and `sys` above) are imported but
# never referenced later in this script.
try:
    from transformers import AutoModel, AutoTokenizer
except ImportError:
    print("Installing transformers...")
    os.system("pip install transformers -q")
    from transformers import AutoModel, AutoTokenizer

import torch

# NOTE(review): machine-specific absolute paths — as written this script only
# runs on the original author's machine.
GGUF_PATH = "/Users/walidsobhi/.ollama/models/blobs/sha256-60e05f2100071479f596b964f89f510f057ce397ea22f2833a0cfe029bfc2463"
OUTPUT_DIR = "/Users/walidsobhi/.openclaw/workspace/stack-2.9/base_model_qwen7b"

print(f"Reading GGUF from: {GGUF_PATH}")

# Read the GGUF file
reader = gguf.GGUFReader(GGUF_PATH)

# Get tensor info (name, shape, quantization type) for every tensor in the file.
print("\n GGUF Tensors:")
for i, tensor in enumerate(reader.tensors):
    print(f"  {i}: {tensor.name} - shape {tensor.shape}, dtype {tensor.tensor_type}")

# Extract to HF format
print("\n Converting to HuggingFace format...")

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save model files
model_path = os.path.join(OUTPUT_DIR, "model.safetensors")
|
| 46 |
-
|
| 47 |
-
# Map GGUF tensor-type names onto the corresponding PyTorch dtypes.
def gguf_to_torch_type(gguf_type):
    """Translate a GGUF tensor-type name (e.g. "F16") into a torch dtype.

    Unknown names fall back to ``torch.float32``.
    NOTE(review): this helper is not referenced anywhere else in this script.
    """
    dtype_by_name = {
        "F32": torch.float32,
        "F16": torch.float16,
        "BF16": torch.bfloat16,
        "I8": torch.int8,
        "I16": torch.int16,
        "I32": torch.int32,
        "I64": torch.int64,
        "U8": torch.uint8,
    }
    if gguf_type in dtype_by_name:
        return dtype_by_name[gguf_type]
    return torch.float32
|
| 60 |
-
|
| 61 |
-
# Export tensors from the GGUF reader into a flat state dict.
# NOTE(review): tensor names are copied verbatim from the GGUF file; HF Qwen2
# checkpoints use different parameter names, so a name-remapping step is
# presumably still needed before transformers can load this — confirm.
state_dict = {}
for tensor in reader.tensors:
    print(f"  Converting {tensor.name}...")
    # Read tensor data
    # assumes gguf.GGUFReader exposes get_tensor(name) — TODO confirm for the
    # installed gguf version.
    data = reader.get_tensor(tensor.name)
    state_dict[tensor.name] = data

# Save as safetensors
try:
    from safetensors.torch import save_file
    save_file(state_dict, model_path)
    print(f"Model saved to: {model_path}")
except ImportError:
    # Fallback to torch (.pt) when safetensors is unavailable.
    torch.save(state_dict, model_path.replace(".safetensors", ".pt"))
    print(f"Model saved to: {model_path.replace('.safetensors', '.pt')}")

# Save config.json — hard-coded Qwen2-7B architecture hyperparameters.
config = {
    "model_type": "qwen2",
    "architectures": ["Qwen2ForCausalLM"],
    "vocab_size": 151936,
    "hidden_size": 3584,
    "intermediate_size": 18944,
    "num_hidden_layers": 28,
    "num_attention_heads": 28,
    "num_key_value_heads": 4,
    "max_position_embeddings": 32768,
    "sliding_window": 32768,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.37.0",
}

import json
config_path = os.path.join(OUTPUT_DIR, "config.json")
with open(config_path, "w") as f:
    json.dump(config, f, indent=2)
print(f"Config saved to: {config_path}")

# Create tokenizer files
print("\n Creating tokenizer...")

# Use Qwen2 tokenizer config
tokenizer_config = {
    "add_bos_token": False,
    "add_eos_token": False,
    "add_prefix_space": False,
    "added_tokens_decoder": {},
    "bos_token": "<|im_end|>",
    "clean_up_tokenization_spaces": False,
    "eos_token": "<|im_end|>",
    "errors": "replace",
    "model_max_length": 32768,
    "pad_token": "<|im_end|>",
    "tokenizer_class": "Qwen2Tokenizer",
    "unk_token": "<|endoftext|>",
}

tokenizer_config_path = os.path.join(OUTPUT_DIR, "tokenizer_config.json")
with open(tokenizer_config_path, "w") as f:
    json.dump(tokenizer_config, f, indent=2)

# Create a simple vocab file (this is a placeholder - real vocab is in the GGUF)
# The GGUF reader should have tokenizer data
# HACK: every entry is a dummy "<|token_i|>" string, so this tokenizer cannot
# decode real text — the actual vocabulary must be extracted from the GGUF.
vocab = {}
for i in range(151936):
    vocab[f"<|token_{i}|>"] = i

vocab_path = os.path.join(OUTPUT_DIR, "vocab.json")
with open(vocab_path, "w") as f:
    json.dump(vocab, f)
print(f"Vocab saved to: {vocab_path}")

# Final report: list every file created and its size in MB.
print("\n✓ Conversion complete!")
print(f"Output directory: {OUTPUT_DIR}")
print("\nFiles created:")
for f in os.listdir(OUTPUT_DIR):
    fpath = os.path.join(OUTPUT_DIR, f)
    size = os.path.getsize(fpath) / (1024*1024)
    print(f"  {f}: {size:.1f} MB")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/convert_to_gguf.py
DELETED
|
@@ -1,210 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
GGUF Conversion Script for Stack 2.9 Model
|
| 4 |
-
|
| 5 |
-
Converts the fine-tuned Stack 2.9 model to GGUF format for Ollama.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import os
|
| 9 |
-
import sys
|
| 10 |
-
import subprocess
|
| 11 |
-
import argparse
|
| 12 |
-
from pathlib import Path
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def find_llama_cpp():
    """Locate a llama.cpp checkout in the usual spots.

    Returns the first candidate directory that exists and contains
    ``convert.py``, or ``None`` when no candidate qualifies.
    """
    # Candidates are ordered: workspace-local checkouts first, then the
    # user's home directory, then a system-wide install.
    workspace_root = Path(__file__).parent.parent
    candidates = (
        workspace_root / "llama.cpp",
        workspace_root / "extensions" / "llama.cpp",
        Path.home() / "llama.cpp",
        Path("/usr/local/llama.cpp"),
    )
    return next(
        (c for c in candidates if c.exists() and (c / "convert.py").exists()),
        None,
    )
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
def run_command(cmd, check=True):
    """Run *cmd*, echoing it first and streaming its output to the console.

    When ``check`` is true and the command fails, print the exit code and
    terminate the whole script; otherwise return the CompletedProcess.
    """
    print(f"Running: {' '.join(cmd)}")
    completed = subprocess.run(cmd, capture_output=False, text=True)

    failed = completed.returncode != 0
    if check and failed:
        print(f"Error: Command failed with exit code {completed.returncode}")
        sys.exit(1)

    return completed
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
def convert_model(model_path, output_path, quantize_type="q4_0", llama_cpp_path=None):
    """
    Convert a HuggingFace model to GGUF format using llama.cpp's convert.py.

    Args:
        model_path: Path to the input model (HuggingFace format)
        output_path: Path where the GGUF file should be saved
        quantize_type: Quantization type (e.g., q4_0, q5_0, q8_0)
        llama_cpp_path: Path to llama.cpp directory (auto-detected if None)

    Exits the process (sys.exit(1)) on any unrecoverable failure.
    """
    model_path = Path(model_path).resolve()
    output_path = Path(output_path).resolve()

    # Validate input model exists
    if not model_path.exists():
        print(f"Error: Model directory not found: {model_path}")
        sys.exit(1)

    # Find llama.cpp if not provided
    if llama_cpp_path is None:
        llama_cpp_path = find_llama_cpp()
        if llama_cpp_path is None:
            print("Error: llama.cpp not found!")
            print("\nPlease install llama.cpp and ensure convert.py is available.")
            print("You can clone it with:")
            print("  git clone https://github.com/ggerganov/llama.cpp.git")
            print("\nOr specify the path manually:")
            print("  python convert_to_gguf.py --llama-cpp /path/to/llama.cpp")
            sys.exit(1)

    llama_cpp_path = Path(llama_cpp_path).resolve()
    convert_script = llama_cpp_path / "convert.py"

    if not convert_script.exists():
        print(f"Error: convert.py not found at {convert_script}")
        sys.exit(1)

    # Create output directory
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Step 1: Convert to intermediate GGUF (unquantized f16)
    print(f"\n=== Step 1: Converting to GGUF ===")
    temp_gguf = output_path.parent / f"{output_path.stem}_temp.gguf"

    convert_cmd = [
        sys.executable,
        str(convert_script),
        str(model_path),
        "--outfile", str(temp_gguf),
        "--outtype", "f16",  # Intermediate full precision
    ]

    run_command(convert_cmd)

    if not temp_gguf.exists():
        print(f"Error: Conversion failed, {temp_gguf} not created")
        sys.exit(1)

    # Step 2: Quantize the GGUF file (if not full precision)
    if quantize_type != "f16":
        print(f"\n=== Step 2: Applying quantization ({quantize_type}) ===")

        # llama.cpp quantize tool — try the common build locations.
        quantize_tool = llama_cpp_path / "quantize"
        if not quantize_tool.exists():
            quantize_tool = llama_cpp_path / "build" / "bin" / "quantize"
        if not quantize_tool.exists():
            quantize_tool = llama_cpp_path / "build" / "quantize"

        if not quantize_tool.exists():
            print("Warning: quantize tool not found. Skipping quantization step.")
            print("You may need to build llama.cpp first:")
            print(f"  cd {llama_cpp_path} && make quantize")
            print("Using unquantized model as fallback.")
            # BUGFIX: move the unquantized file to the requested output path.
            # The original left it at the *_temp.gguf name, so the validation
            # below and the printed Ollama command referred to a file that was
            # not at output_path.
            temp_gguf.rename(output_path)
            final_gguf = output_path
        else:
            run_command([
                str(quantize_tool),
                str(temp_gguf),
                str(output_path),
                quantize_type
            ])
            temp_gguf.unlink()  # Remove temp file
            final_gguf = output_path
    else:
        # BUGFIX: after renaming the temp file into place, validate the final
        # path. The original kept final_gguf pointing at the (now renamed)
        # temp path, so Step 3 always failed for f16 conversions.
        temp_gguf.rename(output_path)
        final_gguf = output_path

    # Step 3: Validate the GGUF file
    print(f"\n=== Step 3: Validating GGUF file ===")

    if not final_gguf.exists():
        print(f"Error: Final GGUF file not found: {final_gguf}")
        sys.exit(1)

    file_size = final_gguf.stat().st_size / (1024**3)
    print(f"✓ GGUF file created: {final_gguf}")
    print(f"  Size: {file_size:.2f} GB")
    print(f"  Quantization: {quantize_type}")

    # Step 4: Print Ollama import command
    print(f"\n=== Ollama Import Command ===")
    print(f"ollama import {output_path} --alias stack-2.9:7b")
    print("\nAfter importing, you can run the model with:")
    print(f"  ollama run stack-2.9:7b")

    print("\n✅ Conversion complete!")
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
def main():
    """Parse CLI arguments and run the GGUF conversion pipeline."""
    parser = argparse.ArgumentParser(
        description="Convert Stack 2.9 model to GGUF format for Ollama"
    )
    arg_specs = [
        ("--model-dir", dict(
            type=str,
            default="./output/stack-2.9-7b-merged",
            help="Path to the merged model directory (default: ./output/stack-2.9-7b-merged)",
        )),
        ("--output", dict(
            type=str,
            default="./ollama_model/stack-2.9-7b.gguf",
            help="Output GGUF file path (default: ./ollama_model/stack-2.9-7b.gguf)",
        )),
        ("--qtype", dict(
            type=str,
            default="q4_0",
            choices=["f16", "q4_0", "q5_0", "q8_0", "q2_K", "q3_K_S", "q3_K_M", "q3_K_L", "q4_K_S", "q4_K_M", "q5_K_S", "q5_K_M", "q6_K"],
            help="Quantization type (default: q4_0)",
        )),
        ("--llama-cpp", dict(
            type=str,
            default=None,
            help="Path to llama.cpp directory (auto-detected if not provided)",
        )),
    ]
    for flag, spec in arg_specs:
        parser.add_argument(flag, **spec)

    args = parser.parse_args()

    # Relative paths are interpreted against the workspace root (one level
    # above this script). Absolute arguments pass through unchanged, because
    # pathlib's `/` discards the left operand when the right one is absolute.
    workspace_root = Path(__file__).parent.parent
    model_path = (workspace_root / args.model_dir).resolve()
    output_path = (workspace_root / args.output).resolve()

    print("=== GGUF Conversion for Stack 2.9 ===\n")
    print(f"Input model: {model_path}")
    print(f"Output: {output_path}")
    print(f"Quantization: {args.qtype}\n")

    convert_model(
        model_path=model_path,
        output_path=output_path,
        quantize_type=args.qtype,
        llama_cpp_path=args.llama_cpp,
    )
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
# Script entry point.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/create_mini_dataset.py
DELETED
|
@@ -1,180 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Create a minimal training dataset for rapid prototyping.
|
| 4 |
-
Samples N examples from the full data/final/train.jsonl ensuring tool diversity.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import argparse
|
| 8 |
-
import json
|
| 9 |
-
import random
|
| 10 |
-
from pathlib import Path
|
| 11 |
-
from typing import List, Dict
|
| 12 |
-
from collections import defaultdict, Counter
|
| 13 |
-
|
| 14 |
-
def load_full_dataset(train_path: str = "data/final/train.jsonl") -> List[Dict]:
    """Read a JSONL dataset and return its records as a list of dicts.

    Raises:
        FileNotFoundError: when *train_path* does not exist.
    """
    path = Path(train_path)
    if not path.exists():
        raise FileNotFoundError(f"Training data not found at {path}. Please ensure data/final/train.jsonl exists.")

    # One JSON object per line.
    with open(path, 'r') as f:
        return [json.loads(line) for line in f]
|
| 25 |
-
|
| 26 |
-
def extract_tool_calls(example: Dict) -> List[str]:
    """Return the tool names invoked by assistant messages, in call order.

    Entries without a function name (missing or empty) are skipped.
    """
    names: List[str] = []
    for message in example.get("messages", []):
        # Only assistant messages that actually carry tool calls matter.
        if message.get("role") != "assistant" or not message.get("tool_calls"):
            continue
        for call in message["tool_calls"]:
            if name := call.get("function", {}).get("name", ""):
                names.append(name)
    return names
|
| 38 |
-
|
| 39 |
-
def create_mini_dataset(
    output_path: str,
    n_samples: int = 5000,
    train_source: str = "data/final/train.jsonl",
    seed: int = 42
):
    """Create a stratified mini dataset.

    Samples roughly ``n_samples`` examples from ``train_source`` so that every
    tool appearing at least 3 times is represented, writes the shuffled result
    as JSONL to ``output_path``, prints distribution stats, and returns the
    sampled list.

    Args:
        output_path: Destination JSONL file (parent directories are created).
        n_samples: Target number of examples in the mini dataset.
        train_source: Source JSONL dataset to sample from.
        seed: Random seed for reproducible sampling.
    """
    random.seed(seed)

    print(f"Loading full dataset from {train_source}...")
    full_data = load_full_dataset(train_source)
    print(f"Loaded {len(full_data)} total examples")

    # Group by tool usage
    tool_groups = defaultdict(list)   # primary (first-called) tool -> examples
    unknown_tools = []                # examples with no tool calls at all

    for ex in full_data:
        tools = extract_tool_calls(ex)
        if tools:
            # Use first tool as primary category
            primary_tool = tools[0]
            tool_groups[primary_tool].append(ex)
        else:
            unknown_tools.append(ex)

    # Report the 15 most common tools in the source data.
    print(f"\nTool distribution in full dataset:")
    total_tool_examples = sum(len(v) for v in tool_groups.values())  # NOTE: computed but never used
    for tool, examples in sorted(tool_groups.items(), key=lambda x: len(x[1]), reverse=True)[:15]:
        pct = len(examples) / len(full_data) * 100
        print(f"  {tool}: {len(examples)} examples ({pct:.1f}%)")

    print(f"  No-tool examples: {len(unknown_tools)} ({len(unknown_tools)/len(full_data)*100:.1f}%)")

    # Determine sampling strategy
    # Allocate samples proportionally, but ensure minimum 3 examples per tool
    samples_per_tool = {}
    min_per_tool = 3
    remaining = n_samples

    # First pass: assign minimum to all tools that have enough
    for tool, examples in tool_groups.items():
        if len(examples) >= min_per_tool:
            samples_per_tool[tool] = min_per_tool
            remaining -= min_per_tool

    # Second pass: distribute remaining proportionally
    # NOTE(review): `remaining` shrinks while iterating, so each tool's
    # `extra` is a fraction of the *current* remainder rather than of the
    # original pool — earlier-iterated tools get proportionally more.
    # Presumably acceptable for rough stratification; confirm if exact
    # proportions matter.
    if remaining > 0:
        total_weight = sum(len(v) for v in tool_groups.values() if len(v) >= min_per_tool)
        for tool, examples in tool_groups.items():
            if len(examples) >= min_per_tool:
                weight = len(examples) / total_weight
                extra = int(remaining * weight)
                samples_per_tool[tool] += extra
                remaining -= extra

    # Fill any leftover with no-tool examples
    if remaining > 0 and unknown_tools:
        samples_per_tool["__notool__"] = min(remaining, len(unknown_tools))
        remaining -= min(remaining, len(unknown_tools))

    # If we still have remaining, just take from the largest tool groups
    if remaining > 0:
        sorted_tools = sorted(tool_groups.items(), key=lambda x: len(x[1]), reverse=True)
        for tool, examples in sorted_tools:
            if remaining <= 0:
                break
            can_take = min(remaining, len(examples) - samples_per_tool.get(tool, 0))
            if can_take > 0:
                samples_per_tool[tool] = samples_per_tool.get(tool, 0) + can_take
                remaining -= can_take

    # Print the per-tool allocation before actually sampling.
    print(f"\nSampling plan (target {n_samples}):")
    total_sampled = 0
    for tool, n in sorted(samples_per_tool.items(), key=lambda x: x[1], reverse=True):
        if n > 0:
            available = len(tool_groups.get(tool, [])) if tool != "__notool__" else len(unknown_tools)
            pct = n / n_samples * 100
            print(f"  {tool}: {n} examples ({pct:.1f}%) from {available} available")
            total_sampled += n

    # Perform sampling (without replacement, per group)
    mini_dataset = []
    for tool, n_to_sample in samples_per_tool.items():
        if n_to_sample <= 0:
            continue

        source_pool = tool_groups[tool] if tool != "__notool__" else unknown_tools
        # Clamp so random.sample never asks for more than the pool holds.
        if len(source_pool) < n_to_sample:
            n_to_sample = len(source_pool)

        sampled = random.sample(source_pool, n_to_sample)
        mini_dataset.extend(sampled)

    # Shuffle the final dataset
    random.shuffle(mini_dataset)

    # Write output
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w') as f:
        for ex in mini_dataset:
            f.write(json.dumps(ex) + '\n')

    print(f"\n✅ Mini dataset created: {len(mini_dataset)} examples")
    print(f"  Saved to: {output_path}")

    # Stats: recompute the distribution of the sampled set.
    tool_counts = Counter()
    for ex in mini_dataset:
        tools = extract_tool_calls(ex)
        if tools:
            tool_counts[tools[0]] += 1
        else:
            tool_counts["__notool__"] += 1

    # NOTE(review): divides by len(mini_dataset) — raises ZeroDivisionError
    # when the source dataset was empty.
    print(f"\nFinal tool distribution:")
    for tool, count in tool_counts.most_common(15):
        pct = count / len(mini_dataset) * 100
        print(f"  {tool}: {count} ({pct:.1f}%)")

    return mini_dataset
|
| 162 |
-
|
| 163 |
-
def main():
    """Command-line entry point for building the mini dataset."""
    parser = argparse.ArgumentParser(description="Create mini dataset for fast prototyping")
    for flag, spec in (
        ("--size", dict(type=int, default=5000, help="Number of examples in mini dataset")),
        ("--output", dict(type=str, default="./data_mini/train_mini.jsonl", help="Output file path")),
        ("--source", dict(type=str, default="data/final/train.jsonl", help="Source full dataset")),
        ("--seed", dict(type=int, default=42, help="Random seed for sampling")),
    ):
        parser.add_argument(flag, **spec)

    args = parser.parse_args()

    create_mini_dataset(
        output_path=args.output,
        n_samples=args.size,
        train_source=args.source,
        seed=args.seed,
    )
|
| 178 |
-
|
| 179 |
-
# Script entry point.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/download_benchmark_datasets.py
DELETED
|
@@ -1,127 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Download benchmark datasets (HumanEval and MBPP) into ./data/ directory.
|
| 4 |
-
Uses huggingface datasets library for reliable downloads.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import os
|
| 8 |
-
import json
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
from datasets import load_dataset
|
| 11 |
-
import argparse
|
| 12 |
-
|
| 13 |
-
def download_humaneval(output_dir: str = "./data"):
    """Fetch the HumanEval benchmark and store it as JSONL plus a meta file.

    Writes <output_dir>/humaneval/humaneval.jsonl (one problem per line) and
    a meta.json summary. Returns the number of problems written, or 0 when
    the download or write fails.
    """
    target = Path(output_dir) / "humaneval"
    target.mkdir(parents=True, exist_ok=True)

    print(f"⬇️ Downloading HumanEval to {target}...")

    try:
        # Pull the test split of HumanEval from the HuggingFace hub.
        rows = load_dataset("openai_humaneval", split="test")

        problems = {
            f"HumanEval/{idx}": {
                "task_id": f"HumanEval/{idx}",
                "prompt": row["prompt"],
                "canonical_solution": row["canonical_solution"],
                "test": row["test"],
                "entry_point": row["entry_point"]
            }
            for idx, row in enumerate(rows)
        }

        # One problem per line (JSONL).
        out_file = target / "humaneval.jsonl"
        with open(out_file, 'w') as f:
            f.writelines(json.dumps(p) + '\n' for p in problems.values())

        # Companion metadata file.
        with open(target / "meta.json", 'w') as f:
            json.dump({
                "name": "HumanEval",
                "num_problems": len(problems),
                "source": "openai_humaneval",
                "description": "164 hand-written programming problems"
            }, f, indent=2)

        print(f"✅ HumanEval: {len(problems)} problems saved to {out_file}")
        return len(problems)

    except Exception as e:
        print(f"❌ Failed to download HumanEval: {e}")
        return 0
|
| 57 |
-
|
| 58 |
-
def download_mbpp(output_dir: str = "./data"):
|
| 59 |
-
"""Download MBPP dataset (500 problems)."""
|
| 60 |
-
output_path = Path(output_dir) / "mbpp"
|
| 61 |
-
output_path.mkdir(parents=True, exist_ok=True)
|
| 62 |
-
|
| 63 |
-
print(f"⬇️ Downloading MBPP to {output_path}...")
|
| 64 |
-
|
| 65 |
-
try:
|
| 66 |
-
# Load MBPP from huggingface
|
| 67 |
-
dataset = load_dataset("mbpp", split="test")
|
| 68 |
-
|
| 69 |
-
problems = {}
|
| 70 |
-
for idx, item in enumerate(dataset):
|
| 71 |
-
problem_id = f"MBPP/{idx}"
|
| 72 |
-
problems[problem_id] = {
|
| 73 |
-
"task_id": problem_id,
|
| 74 |
-
"text": item["text"],
|
| 75 |
-
"code": item["code"],
|
| 76 |
-
"test_list": item["test_list"],
|
| 77 |
-
"test_func": item["test_func"],
|
| 78 |
-
"challenge_test_list": item.get("challenge_test_list", [])
|
| 79 |
-
}
|
| 80 |
-
|
| 81 |
-
# Save as JSONL
|
| 82 |
-
output_file = output_path / "mbpp.jsonl"
|
| 83 |
-
with open(output_file, 'w') as f:
|
| 84 |
-
for problem in problems.values():
|
| 85 |
-
f.write(json.dumps(problem) + '\n')
|
| 86 |
-
|
| 87 |
-
# Meta file
|
| 88 |
-
meta_file = output_path / "meta.json"
|
| 89 |
-
with open(meta_file, 'w') as f:
|
| 90 |
-
json.dump({
|
| 91 |
-
"name": "MBPP",
|
| 92 |
-
"num_problems": len(problems),
|
| 93 |
-
"source": "mbpp",
|
| 94 |
-
"description": "500 beginner-friendly Python programming problems"
|
| 95 |
-
}, f, indent=2)
|
| 96 |
-
|
| 97 |
-
print(f"✅ MBPP: {len(problems)} problems saved to {output_file}")
|
| 98 |
-
return len(problems)
|
| 99 |
-
|
| 100 |
-
except Exception as e:
|
| 101 |
-
print(f"❌ Failed to download MBPP: {e}")
|
| 102 |
-
return 0
|
| 103 |
-
|
| 104 |
-
def main():
|
| 105 |
-
parser = argparse.ArgumentParser(description="Download benchmark datasets")
|
| 106 |
-
parser.add_argument("--output-dir", type=str, default="./data",
|
| 107 |
-
help="Output directory (default: ./data)")
|
| 108 |
-
parser.add_argument("--benchmark", type=str, choices=["humaneval", "mbpp", "both"],
|
| 109 |
-
default="both", help="Which benchmark to download")
|
| 110 |
-
args = parser.parse_args()
|
| 111 |
-
|
| 112 |
-
print("📥 Benchmark Dataset Downloader")
|
| 113 |
-
print(f"📁 Target directory: {args.output_dir}")
|
| 114 |
-
|
| 115 |
-
total_downloaded = 0
|
| 116 |
-
|
| 117 |
-
if args.benchmark in ["humaneval", "both"]:
|
| 118 |
-
total_downloaded += download_humaneval(args.output_dir)
|
| 119 |
-
|
| 120 |
-
if args.benchmark in ["mbpp", "both"]:
|
| 121 |
-
total_downloaded += download_mbpp(args.output_dir)
|
| 122 |
-
|
| 123 |
-
print(f"\n🎉 Total problems downloaded: {total_downloaded}")
|
| 124 |
-
print(f"📂 Data saved in: {Path(args.output_dir).resolve()}")
|
| 125 |
-
|
| 126 |
-
if __name__ == "__main__":
|
| 127 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/download_public_datasets.py
DELETED
|
@@ -1,170 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Download and integrate public coding datasets.
|
| 4 |
-
Datasets: OpenAssistant, CodeAct, CodeContests
|
| 5 |
-
Converts to Stack 2.9 format.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import json
|
| 9 |
-
import os
|
| 10 |
-
from datasets import load_dataset
|
| 11 |
-
from pathlib import Path
|
| 12 |
-
import argparse
|
| 13 |
-
|
| 14 |
-
def download_openassistant(output_path: Path, limit: int = 10000):
|
| 15 |
-
"""Download OpenAssistant and filter for coding conversations."""
|
| 16 |
-
print("📥 Downloading OpenAssistant dataset...")
|
| 17 |
-
try:
|
| 18 |
-
dataset = load_dataset("OpenAssistant/oasst1", split="train")
|
| 19 |
-
except Exception as e:
|
| 20 |
-
print(f"❌ Failed to load OpenAssistant: {e}")
|
| 21 |
-
return []
|
| 22 |
-
|
| 23 |
-
coding_examples = []
|
| 24 |
-
count = 0
|
| 25 |
-
|
| 26 |
-
for item in dataset:
|
| 27 |
-
# Filter for coding-related conversations
|
| 28 |
-
text = item.get("text", "").lower()
|
| 29 |
-
if any(keyword in text for keyword in ["code", "programming", "python", "javascript", "function", "api", "development"]):
|
| 30 |
-
# Convert to our format
|
| 31 |
-
messages = [
|
| 32 |
-
{"role": "user", "content": item.get("text", "")[:1000]}, # truncated
|
| 33 |
-
{"role": "assistant", "content": "Here's a coding assistant response..."}
|
| 34 |
-
]
|
| 35 |
-
coding_examples.append({
|
| 36 |
-
"messages": messages,
|
| 37 |
-
"source": "openassistant",
|
| 38 |
-
"dataset": "oasst1"
|
| 39 |
-
})
|
| 40 |
-
count += 1
|
| 41 |
-
if count >= limit:
|
| 42 |
-
break
|
| 43 |
-
|
| 44 |
-
print(f" Extracted {len(coding_examples)} coding-related examples from OpenAssistant")
|
| 45 |
-
return coding_examples
|
| 46 |
-
|
| 47 |
-
def download_codeact(output_path: Path, limit: int = 10000):
|
| 48 |
-
"""Download CodeAct dataset."""
|
| 49 |
-
print("📥 Downloading CodeAct dataset...")
|
| 50 |
-
try:
|
| 51 |
-
dataset = load_dataset("nuprl/CodeAct", split="train")
|
| 52 |
-
except Exception as e:
|
| 53 |
-
print(f"❌ Failed to load CodeAct: {e}")
|
| 54 |
-
return []
|
| 55 |
-
|
| 56 |
-
examples = []
|
| 57 |
-
count = 0
|
| 58 |
-
|
| 59 |
-
for item in dataset:
|
| 60 |
-
# CodeAct has actions - convert to tool calls
|
| 61 |
-
action = item.get("action", {})
|
| 62 |
-
if action:
|
| 63 |
-
messages = [
|
| 64 |
-
{"role": "user", "content": item.get("instruction", "")},
|
| 65 |
-
{
|
| 66 |
-
"role": "assistant",
|
| 67 |
-
"content": "Executing action...",
|
| 68 |
-
"tool_use": {
|
| 69 |
-
"name": "CodeActTool",
|
| 70 |
-
"input": action
|
| 71 |
-
}
|
| 72 |
-
},
|
| 73 |
-
{
|
| 74 |
-
"role": "user",
|
| 75 |
-
"content": "",
|
| 76 |
-
"tool_result": {
|
| 77 |
-
"tool_use_id": "tool_1",
|
| 78 |
-
"content": json.dumps(item.get("observation", {}))
|
| 79 |
-
}
|
| 80 |
-
},
|
| 81 |
-
{"role": "assistant", "content": item.get("final_answer", "Done.")}
|
| 82 |
-
]
|
| 83 |
-
examples.append({
|
| 84 |
-
"messages": messages,
|
| 85 |
-
"source": "codeact",
|
| 86 |
-
"dataset": "CodeAct"
|
| 87 |
-
})
|
| 88 |
-
count += 1
|
| 89 |
-
if count >= limit:
|
| 90 |
-
break
|
| 91 |
-
|
| 92 |
-
print(f" Extracted {len(examples)} examples from CodeAct")
|
| 93 |
-
return examples
|
| 94 |
-
|
| 95 |
-
def download_codecontests(output_path: Path, limit: int = 5000):
|
| 96 |
-
"""Download CodeContests (competition problems)."""
|
| 97 |
-
print("📥 Downloading CodeContests dataset...")
|
| 98 |
-
try:
|
| 99 |
-
dataset = load_dataset("m-a-p/CodeContests", split="train")
|
| 100 |
-
except Exception as e:
|
| 101 |
-
print(f"❌ Failed to load CodeContests: {e}")
|
| 102 |
-
return []
|
| 103 |
-
|
| 104 |
-
examples = []
|
| 105 |
-
count = 0
|
| 106 |
-
|
| 107 |
-
for item in dataset:
|
| 108 |
-
if item.get("problem") and item.get("solution"):
|
| 109 |
-
messages = [
|
| 110 |
-
{"role": "user", "content": f"Solve this problem:\n{item['problem']}"},
|
| 111 |
-
{"role": "assistant", "content": f"Here's a solution:\n```python\n{item['solution']}\n```"}
|
| 112 |
-
]
|
| 113 |
-
examples.append({
|
| 114 |
-
"messages": messages,
|
| 115 |
-
"source": "codecontests",
|
| 116 |
-
"dataset": "CodeContests"
|
| 117 |
-
})
|
| 118 |
-
count += 1
|
| 119 |
-
if count >= limit:
|
| 120 |
-
break
|
| 121 |
-
|
| 122 |
-
print(f" Extracted {len(examples)} examples from CodeContests")
|
| 123 |
-
return examples
|
| 124 |
-
|
| 125 |
-
def main():
|
| 126 |
-
parser = argparse.ArgumentParser()
|
| 127 |
-
parser.add_argument("--output", type=str, default="training-data/scaled/public_datasets.jsonl")
|
| 128 |
-
parser.add_argument("--limit-per-dataset", type=int, default=10000)
|
| 129 |
-
parser.add_argument("--skip-download", action="store_true", help="Use only existing datasets")
|
| 130 |
-
args = parser.parse_args()
|
| 131 |
-
|
| 132 |
-
output_path = Path(args.output)
|
| 133 |
-
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 134 |
-
|
| 135 |
-
all_examples = []
|
| 136 |
-
|
| 137 |
-
if not args.skip_download:
|
| 138 |
-
# OpenAssistant
|
| 139 |
-
all_examples.extend(download_openassistant(output_path, args.limit_per_dataset))
|
| 140 |
-
|
| 141 |
-
# CodeAct
|
| 142 |
-
all_examples.extend(download_codeact(output_path, args.limit_per_dataset))
|
| 143 |
-
|
| 144 |
-
# CodeContests
|
| 145 |
-
all_examples.extend(download_codecontests(output_path, min(5000, args.limit_per_dataset)))
|
| 146 |
-
else:
|
| 147 |
-
print("⚠️ Skipping downloads (--skip-download flag)")
|
| 148 |
-
|
| 149 |
-
# Write all examples
|
| 150 |
-
with open(output_path, 'w') as f:
|
| 151 |
-
for ex in all_examples:
|
| 152 |
-
f.write(json.dumps(ex) + "\n")
|
| 153 |
-
|
| 154 |
-
print(f"\n✨ Saved {len(all_examples)} examples from public datasets")
|
| 155 |
-
print(f" to: {output_path}")
|
| 156 |
-
|
| 157 |
-
# Show breakdown
|
| 158 |
-
sources = {}
|
| 159 |
-
for ex in all_examples:
|
| 160 |
-
src = ex.get("source", "unknown")
|
| 161 |
-
sources[src] = sources.get(src, 0) + 1
|
| 162 |
-
|
| 163 |
-
print("\n📊 Breakdown:")
|
| 164 |
-
for src, count in sources.items():
|
| 165 |
-
print(f" {src}: {count}")
|
| 166 |
-
|
| 167 |
-
print("\n⚠️ Note: These are raw integrations. May need format conversion to match tool-use patterns.")
|
| 168 |
-
|
| 169 |
-
if __name__ == "__main__":
|
| 170 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/{compare_models.py → eval/compare_models.py}
RENAMED
|
File without changes
|
scripts/{humaneval_eval.py → eval/humaneval_eval.py}
RENAMED
|
File without changes
|
scripts/{mbpp_eval.py → eval/mbpp_eval.py}
RENAMED
|
File without changes
|
scripts/{model_info.py → eval/model_info.py}
RENAMED
|
File without changes
|
scripts/{tool_use_evaluator.py → eval/tool_use_evaluator.py}
RENAMED
|
File without changes
|
scripts/extract_code_pairs.py
DELETED
|
@@ -1,215 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Extract code-comment pairs from the src/ directory.
|
| 4 |
-
Pairs: function/class code + its documentation comment.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import os
|
| 8 |
-
import re
|
| 9 |
-
import json
|
| 10 |
-
from pathlib import Path
|
| 11 |
-
from typing import List, Dict, Any
|
| 12 |
-
import argparse
|
| 13 |
-
|
| 14 |
-
def extract_jsdoc_comments(content: str) -> List[Dict[str, Any]]:
|
| 15 |
-
"""Extract JSDoc comments and associated code from JS/TS files."""
|
| 16 |
-
pairs = []
|
| 17 |
-
|
| 18 |
-
# Pattern to match JSDoc comment block followed by code
|
| 19 |
-
# Matches: /** ... */ followed by function/class/interface
|
| 20 |
-
pattern = re.compile(
|
| 21 |
-
r'/\*\*\s*(.*?)\s*\*/\s*' # JSDoc comment
|
| 22 |
-
r'(export\s+)?(async\s+)?(function|const|let|var|class|interface|type)\s+(\w+)',
|
| 23 |
-
re.DOTALL
|
| 24 |
-
)
|
| 25 |
-
|
| 26 |
-
for match in pattern.finditer(content):
|
| 27 |
-
comment_lines = match.group(1).strip().split('\n')
|
| 28 |
-
# Clean up comment markers
|
| 29 |
-
comment = []
|
| 30 |
-
for line in comment_lines:
|
| 31 |
-
line = line.strip()
|
| 32 |
-
if line.startswith('* '):
|
| 33 |
-
line = line[2:]
|
| 34 |
-
elif line.startswith('*'):
|
| 35 |
-
line = line[1:]
|
| 36 |
-
comment.append(line.strip())
|
| 37 |
-
comment_text = ' '.join(comment).strip()
|
| 38 |
-
|
| 39 |
-
code_start = match.end()
|
| 40 |
-
# Extract the function signature or class definition (up to opening brace or newline)
|
| 41 |
-
code_lines = []
|
| 42 |
-
lines = content[code_start:].split('\n')
|
| 43 |
-
for line in lines[:5]: # Take first few lines
|
| 44 |
-
code_lines.append(line)
|
| 45 |
-
if line.strip().endswith('{') or line.strip().endswith('>'):
|
| 46 |
-
break
|
| 47 |
-
code = '\n'.join(code_lines).strip()
|
| 48 |
-
|
| 49 |
-
if comment_text and code and len(code.split('\n')) >= 2:
|
| 50 |
-
pairs.append({
|
| 51 |
-
"code": code,
|
| 52 |
-
"comment": comment_text,
|
| 53 |
-
"type": match.group(3), # function/class/interface
|
| 54 |
-
"name": match.group(4)
|
| 55 |
-
})
|
| 56 |
-
|
| 57 |
-
return pairs
|
| 58 |
-
|
| 59 |
-
def extract_python_docstrings(content: str) -> List[Dict[str, Any]]:
|
| 60 |
-
"""Extract Python docstrings and associated code."""
|
| 61 |
-
pairs = []
|
| 62 |
-
|
| 63 |
-
# Pattern for triple-quoted docstring before function/class
|
| 64 |
-
pattern = re.compile(
|
| 65 |
-
r'''(?P<quote>''' + r'"""' + r'''|\'\'\')\s*(?P<doc>.*?)(?P=quote)\s*'''
|
| 66 |
-
r'(?:@\w+\s+)*def\s+(\w+)|class\s+(\w+)',
|
| 67 |
-
re.DOTALL
|
| 68 |
-
)
|
| 69 |
-
|
| 70 |
-
for match in pattern.finditer(content):
|
| 71 |
-
doc = match.group('doc').strip()
|
| 72 |
-
func_name = match.group(3) or match.group(4)
|
| 73 |
-
if func_name:
|
| 74 |
-
# Get the signature line
|
| 75 |
-
signature = content[match.end():].split('\n')[0].strip()
|
| 76 |
-
code = f"def {func_name}{signature}" if 'def' in signature else f"class {func_name}{signature}"
|
| 77 |
-
|
| 78 |
-
pairs.append({
|
| 79 |
-
"code": code,
|
| 80 |
-
"comment": doc,
|
| 81 |
-
"type": "function" if 'def' in signature else "class",
|
| 82 |
-
"name": func_name
|
| 83 |
-
})
|
| 84 |
-
|
| 85 |
-
return pairs
|
| 86 |
-
|
| 87 |
-
def extract_inline_comments(content: str, file_ext: str) -> List[Dict[str, Any]]:
|
| 88 |
-
"""Extract code block with preceding inline comment."""
|
| 89 |
-
pairs = []
|
| 90 |
-
|
| 91 |
-
lines = content.split('\n')
|
| 92 |
-
i = 0
|
| 93 |
-
while i < len(lines):
|
| 94 |
-
line = lines[i].rstrip()
|
| 95 |
-
# Check for // comment or # comment
|
| 96 |
-
if line.strip().startswith('//') or line.strip().startswith('#'):
|
| 97 |
-
comment = line.strip()[2:].strip()
|
| 98 |
-
# Look at next few lines for code
|
| 99 |
-
code_lines = []
|
| 100 |
-
j = i + 1
|
| 101 |
-
while j < len(lines) and len(code_lines) < 5:
|
| 102 |
-
next_line = lines[j].rstrip()
|
| 103 |
-
if next_line.strip() and not next_line.strip().startswith('//') and not next_line.strip().startswith('#'):
|
| 104 |
-
code_lines.append(next_line)
|
| 105 |
-
elif next_line.strip().startswith(('//', '#')):
|
| 106 |
-
break # Another comment block
|
| 107 |
-
j += 1
|
| 108 |
-
|
| 109 |
-
if comment and code_lines:
|
| 110 |
-
code = '\n'.join(code_lines)
|
| 111 |
-
# Only keep if comment is meaningful (>5 words or contains specific keywords)
|
| 112 |
-
if len(comment.split()) > 3 or any(kw in comment.lower() for kw in ['function', 'return', 'parameter', 'args', 'handle', 'process']):
|
| 113 |
-
pairs.append({
|
| 114 |
-
"code": code,
|
| 115 |
-
"comment": comment,
|
| 116 |
-
"type": "inline",
|
| 117 |
-
"name": None
|
| 118 |
-
})
|
| 119 |
-
i = j # Skip processed lines
|
| 120 |
-
else:
|
| 121 |
-
i += 1
|
| 122 |
-
else:
|
| 123 |
-
i += 1
|
| 124 |
-
|
| 125 |
-
return pairs
|
| 126 |
-
|
| 127 |
-
def process_file(file_path: Path) -> List[Dict[str, Any]]:
|
| 128 |
-
"""Process a single file and extract code-comment pairs."""
|
| 129 |
-
try:
|
| 130 |
-
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
| 131 |
-
content = f.read()
|
| 132 |
-
except Exception as e:
|
| 133 |
-
print(f"❌ Error reading {file_path}: {e}")
|
| 134 |
-
return []
|
| 135 |
-
|
| 136 |
-
pairs = []
|
| 137 |
-
|
| 138 |
-
# Extract by file type
|
| 139 |
-
if file_path.suffix in ['.js', '.ts', '.jsx', '.tsx']:
|
| 140 |
-
pairs.extend(extract_jsdoc_comments(content))
|
| 141 |
-
elif file_path.suffix == '.py':
|
| 142 |
-
pairs.extend(extract_python_docstrings(content))
|
| 143 |
-
|
| 144 |
-
# Inline comments for all types
|
| 145 |
-
pairs.extend(extract_inline_comments(content, file_path.suffix))
|
| 146 |
-
|
| 147 |
-
return pairs
|
| 148 |
-
|
| 149 |
-
def walk_source_files(src_dir: Path) -> List[Path]:
|
| 150 |
-
"""Walk src/ directory and return all relevant source files."""
|
| 151 |
-
extensions = ['.ts', '.tsx', '.js', '.jsx', '.py']
|
| 152 |
-
files = []
|
| 153 |
-
for ext in extensions:
|
| 154 |
-
files.extend(src_dir.rglob(f'*{ext}'))
|
| 155 |
-
return files
|
| 156 |
-
|
| 157 |
-
def main():
|
| 158 |
-
parser = argparse.ArgumentParser()
|
| 159 |
-
parser.add_argument("--src-dir", type=str, default="src")
|
| 160 |
-
parser.add_argument("--output", type=str, default="training-data/code-pairs/extended_pairs.json")
|
| 161 |
-
parser.add_argument("--limit", type=int, default=10000, help="Maximum pairs to extract")
|
| 162 |
-
args = parser.parse_args()
|
| 163 |
-
|
| 164 |
-
src_dir = Path(args.src_dir)
|
| 165 |
-
output_path = Path(args.output)
|
| 166 |
-
|
| 167 |
-
if not src_dir.exists():
|
| 168 |
-
print(f"❌ Source directory not found: {src_dir}")
|
| 169 |
-
return
|
| 170 |
-
|
| 171 |
-
print(f"🔍 Scanning {src_dir} for source files...")
|
| 172 |
-
files = walk_source_files(src_dir)
|
| 173 |
-
print(f" Found {len(files)} source files")
|
| 174 |
-
|
| 175 |
-
all_pairs = []
|
| 176 |
-
for file_path in files:
|
| 177 |
-
pairs = process_file(file_path)
|
| 178 |
-
if pairs:
|
| 179 |
-
all_pairs.extend(pairs)
|
| 180 |
-
print(f" {file_path.name}: {len(pairs)} pairs", end='\r')
|
| 181 |
-
|
| 182 |
-
if len(all_pairs) >= args.limit:
|
| 183 |
-
break
|
| 184 |
-
|
| 185 |
-
print(f"\n✨ Extracted {len(all_pairs)} code-comment pairs")
|
| 186 |
-
|
| 187 |
-
# Deduplicate (by comment+code hash)
|
| 188 |
-
seen = set()
|
| 189 |
-
unique_pairs = []
|
| 190 |
-
for pair in all_pairs:
|
| 191 |
-
key = (pair['comment'][:100], pair['code'][:100])
|
| 192 |
-
if key not in seen:
|
| 193 |
-
seen.add(key)
|
| 194 |
-
unique_pairs.append(pair)
|
| 195 |
-
|
| 196 |
-
print(f" After deduplication: {len(unique_pairs)} unique pairs")
|
| 197 |
-
|
| 198 |
-
# Save
|
| 199 |
-
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 200 |
-
with open(output_path, 'w') as f:
|
| 201 |
-
json.dump(unique_pairs, f, indent=2)
|
| 202 |
-
|
| 203 |
-
print(f"✅ Saved to: {output_path}")
|
| 204 |
-
|
| 205 |
-
# Stats
|
| 206 |
-
types = {}
|
| 207 |
-
for pair in unique_pairs:
|
| 208 |
-
t = pair.get('type', 'unknown')
|
| 209 |
-
types[t] = types.get(t, 0) + 1
|
| 210 |
-
print("\n📊 By type:")
|
| 211 |
-
for t, cnt in types.items():
|
| 212 |
-
print(f" {t}: {cnt}")
|
| 213 |
-
|
| 214 |
-
if __name__ == "__main__":
|
| 215 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/extract_patterns_from_git.py
DELETED
|
@@ -1,309 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Extract Code Patterns from Git History
|
| 4 |
-
|
| 5 |
-
Scans Git commit history to identify bug fixes and feature additions,
|
| 6 |
-
extracting "before → after" patterns for training data generation.
|
| 7 |
-
|
| 8 |
-
Usage:
|
| 9 |
-
python extract_patterns_from_git.py --repo-path . --output patterns.jsonl
|
| 10 |
-
python extract_patterns_from_git.py --repo-path . --output patterns.jsonl --since-date "2024-01-01"
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
import argparse
|
| 14 |
-
import hashlib
|
| 15 |
-
import json
|
| 16 |
-
import os
|
| 17 |
-
import subprocess
|
| 18 |
-
import sys
|
| 19 |
-
from datetime import datetime
|
| 20 |
-
from pathlib import Path
|
| 21 |
-
from typing import Optional
|
| 22 |
-
|
| 23 |
-
try:
|
| 24 |
-
from tqdm import tqdm
|
| 25 |
-
except ImportError:
|
| 26 |
-
tqdm = None
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
# Keywords that indicate bug fixes or improvements
|
| 30 |
-
BUG_FIX_KEYWORDS = [
|
| 31 |
-
"fix", "bug", "hotfix", "patch", "resolve", "correct", "repair",
|
| 32 |
-
"error", "crash", "fail", "issue", "problem", "broken"
|
| 33 |
-
]
|
| 34 |
-
|
| 35 |
-
FEATURE_KEYWORDS = [
|
| 36 |
-
"feat", "feature", "add", "new", "implement", "enhance", "improve",
|
| 37 |
-
"optimize", "refactor", "support", "introduce"
|
| 38 |
-
]
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
def is_text_file(filepath: str) -> bool:
|
| 42 |
-
"""Check if a file is likely a text file (not binary)."""
|
| 43 |
-
binary_extensions = {
|
| 44 |
-
'.pyc', '.so', '.dll', '.exe', '.bin', '.dat', '.pickle',
|
| 45 |
-
'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.ico', '.svg',
|
| 46 |
-
'.mp3', '.mp4', '.wav', '.avi', '.mov', '.pdf', '.zip',
|
| 47 |
-
'.tar', '.gz', '.rar', '.7z', '.whl', '.egg',
|
| 48 |
-
'.class', '.jar', '.war', '.ear',
|
| 49 |
-
'.db', '.sqlite', '.sqlite3',
|
| 50 |
-
'.ttf', '.otf', '.woff', '.woff2',
|
| 51 |
-
'.pem', '.key', '.crt', '.cer',
|
| 52 |
-
'.DS_Store', '.gitignore'
|
| 53 |
-
}
|
| 54 |
-
|
| 55 |
-
ext = Path(filepath).suffix.lower()
|
| 56 |
-
if ext in binary_extensions:
|
| 57 |
-
return False
|
| 58 |
-
|
| 59 |
-
# Try to read as text
|
| 60 |
-
try:
|
| 61 |
-
with open(filepath, 'rb') as f:
|
| 62 |
-
chunk = f.read(1024)
|
| 63 |
-
# Check for null bytes (common in binary files)
|
| 64 |
-
if b'\x00' in chunk:
|
| 65 |
-
return False
|
| 66 |
-
return True
|
| 67 |
-
except (OSError, IOError):
|
| 68 |
-
return False
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
def get_commit_messages(repo_path: str, since_date: Optional[str] = None) -> list[dict]:
|
| 72 |
-
"""Get commit information from git log."""
|
| 73 |
-
cmd = ["git", "-C", repo_path, "log", "--pretty=format:%H|%s|%an|%ad|%ae", "--date=iso"]
|
| 74 |
-
|
| 75 |
-
if since_date:
|
| 76 |
-
cmd.extend([f"--since={since_date}"])
|
| 77 |
-
|
| 78 |
-
try:
|
| 79 |
-
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
| 80 |
-
commits = []
|
| 81 |
-
|
| 82 |
-
for line in result.stdout.strip().split('\n'):
|
| 83 |
-
if not line:
|
| 84 |
-
continue
|
| 85 |
-
parts = line.split('|')
|
| 86 |
-
if len(parts) >= 5:
|
| 87 |
-
commits.append({
|
| 88 |
-
'hash': parts[0],
|
| 89 |
-
'message': parts[1],
|
| 90 |
-
'author': parts[2],
|
| 91 |
-
'date': parts[3],
|
| 92 |
-
'email': parts[4] if len(parts) > 4 else ''
|
| 93 |
-
})
|
| 94 |
-
|
| 95 |
-
return commits
|
| 96 |
-
except subprocess.CalledProcessError as e:
|
| 97 |
-
print(f"Error reading git log: {e}", file=sys.stderr)
|
| 98 |
-
return []
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def get_changed_files(repo_path: str, commit_hash: str) -> list[str]:
|
| 102 |
-
"""Get list of files changed in a commit."""
|
| 103 |
-
cmd = ["git", "-C", repo_path, "diff-tree", "--no-commit-id", "--name-only", "-r", commit_hash]
|
| 104 |
-
|
| 105 |
-
try:
|
| 106 |
-
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
| 107 |
-
files = []
|
| 108 |
-
for line in result.stdout.strip().split('\n'):
|
| 109 |
-
if line.strip():
|
| 110 |
-
files.append(line.strip())
|
| 111 |
-
return files
|
| 112 |
-
except subprocess.CalledProcessError:
|
| 113 |
-
return []
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
def get_file_diff(repo_path: str, commit_hash: str, filepath: str) -> tuple[Optional[str], Optional[str]]:
|
| 117 |
-
"""Get before and after content of a file in a commit."""
|
| 118 |
-
# Get the file content AFTER the commit
|
| 119 |
-
cmd_after = ["git", "-C", repo_path, "show", f"{commit_hash}:{filepath}"]
|
| 120 |
-
# Get the file content BEFORE the commit (parent)
|
| 121 |
-
cmd_before = ["git", "-C", repo_path, "show", f"{commit_hash}^:{filepath}"]
|
| 122 |
-
|
| 123 |
-
after_content = None
|
| 124 |
-
before_content = None
|
| 125 |
-
|
| 126 |
-
try:
|
| 127 |
-
result_after = subprocess.run(cmd_after, capture_output=True, text=True, check=True)
|
| 128 |
-
after_content = result_after.stdout
|
| 129 |
-
except subprocess.CalledProcessError:
|
| 130 |
-
# File might be new (no parent)
|
| 131 |
-
after_content = None
|
| 132 |
-
|
| 133 |
-
try:
|
| 134 |
-
result_before = subprocess.run(cmd_before, capture_output=True, text=True, check=True)
|
| 135 |
-
before_content = result_before.stdout
|
| 136 |
-
except subprocess.CalledProcessError:
|
| 137 |
-
# File was added in this commit
|
| 138 |
-
before_content = None
|
| 139 |
-
|
| 140 |
-
return before_content, after_content
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
def infer_problem_type(message: str) -> str:
|
| 144 |
-
"""Infer the problem type from commit message."""
|
| 145 |
-
msg_lower = message.lower()
|
| 146 |
-
|
| 147 |
-
# Check for bug fix indicators
|
| 148 |
-
for keyword in BUG_FIX_KEYWORDS:
|
| 149 |
-
if keyword in msg_lower:
|
| 150 |
-
return "bug_fix"
|
| 151 |
-
|
| 152 |
-
# Check for feature indicators
|
| 153 |
-
for keyword in FEATURE_KEYWORDS:
|
| 154 |
-
if keyword in msg_lower:
|
| 155 |
-
return "feature_addition"
|
| 156 |
-
|
| 157 |
-
return "unknown"
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
def compute_confidence(message: str, before: Optional[str], after: Optional[str]) -> float:
|
| 161 |
-
"""Compute confidence score for the extracted pattern."""
|
| 162 |
-
confidence = 0.5 # Base confidence
|
| 163 |
-
|
| 164 |
-
# Higher confidence if message contains clear keywords
|
| 165 |
-
msg_lower = message.lower()
|
| 166 |
-
if any(k in msg_lower for k in ["fix", "bug", "hotfix", "patch"]):
|
| 167 |
-
confidence += 0.2
|
| 168 |
-
if any(k in msg_lower for k in ["feat", "feature", "add", "implement"]):
|
| 169 |
-
confidence += 0.15
|
| 170 |
-
|
| 171 |
-
# Higher confidence if we have both before and after
|
| 172 |
-
if before and after:
|
| 173 |
-
confidence += 0.15
|
| 174 |
-
elif before or after:
|
| 175 |
-
confidence += 0.05
|
| 176 |
-
|
| 177 |
-
# Higher confidence for substantial changes
|
| 178 |
-
if before and after:
|
| 179 |
-
content_len = max(len(before), len(after))
|
| 180 |
-
if content_len > 100:
|
| 181 |
-
confidence += 0.1
|
| 182 |
-
if content_len > 500:
|
| 183 |
-
confidence += 0.1
|
| 184 |
-
|
| 185 |
-
return min(confidence, 1.0)
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
def generate_pattern_id(commit_hash: str, filepath: str) -> str:
|
| 189 |
-
"""Generate a unique pattern ID."""
|
| 190 |
-
content = f"{commit_hash}:{filepath}"
|
| 191 |
-
return hashlib.sha256(content.encode()).hexdigest()[:16]
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
def extract_patterns(
|
| 195 |
-
repo_path: str,
|
| 196 |
-
output_path: str,
|
| 197 |
-
since_date: Optional[str] = None
|
| 198 |
-
) -> int:
|
| 199 |
-
"""Extract patterns from git history and write to JSONL file."""
|
| 200 |
-
|
| 201 |
-
print(f"Scanning repository: {repo_path}")
|
| 202 |
-
|
| 203 |
-
# Get all commits
|
| 204 |
-
commits = get_commit_messages(repo_path, since_date)
|
| 205 |
-
print(f"Found {len(commits)} commits")
|
| 206 |
-
|
| 207 |
-
if not commits:
|
| 208 |
-
print("No commits found.", file=sys.stderr)
|
| 209 |
-
return 0
|
| 210 |
-
|
| 211 |
-
patterns_extracted = 0
|
| 212 |
-
|
| 213 |
-
# Process each commit with progress bar
|
| 214 |
-
iterator = tqdm(commits, desc="Extracting patterns") if tqdm else commits
|
| 215 |
-
|
| 216 |
-
with open(output_path, 'w', encoding='utf-8') as outf:
|
| 217 |
-
for commit in iterator:
|
| 218 |
-
commit_hash = commit['hash']
|
| 219 |
-
message = commit['message']
|
| 220 |
-
author = commit['author']
|
| 221 |
-
date = commit['date']
|
| 222 |
-
|
| 223 |
-
# Infer problem type
|
| 224 |
-
problem_type = infer_problem_type(message)
|
| 225 |
-
|
| 226 |
-
# Skip if not a bug fix or feature
|
| 227 |
-
if problem_type == "unknown":
|
| 228 |
-
continue
|
| 229 |
-
|
| 230 |
-
# Get changed files
|
| 231 |
-
changed_files = get_changed_files(repo_path, commit_hash)
|
| 232 |
-
|
| 233 |
-
for filepath in changed_files:
|
| 234 |
-
# Skip binary files
|
| 235 |
-
full_path = os.path.join(repo_path, filepath)
|
| 236 |
-
if not os.path.exists(full_path):
|
| 237 |
-
continue
|
| 238 |
-
|
| 239 |
-
if not is_text_file(filepath):
|
| 240 |
-
continue
|
| 241 |
-
|
| 242 |
-
# Get diff
|
| 243 |
-
before_content, after_content = get_file_diff(repo_path, commit_hash, filepath)
|
| 244 |
-
|
| 245 |
-
# Skip if no meaningful change
|
| 246 |
-
if before_content == after_content:
|
| 247 |
-
continue
|
| 248 |
-
if not before_content and not after_content:
|
| 249 |
-
continue
|
| 250 |
-
|
| 251 |
-
# Compute confidence
|
| 252 |
-
confidence = compute_confidence(message, before_content, after_content)
|
| 253 |
-
|
| 254 |
-
# Create pattern record
|
| 255 |
-
pattern = {
|
| 256 |
-
"pattern_id": generate_pattern_id(commit_hash, filepath),
|
| 257 |
-
"problem_type": problem_type,
|
| 258 |
-
"before_code": before_content or "",
|
| 259 |
-
"after_code": after_content or "",
|
| 260 |
-
"commit_msg": message,
|
| 261 |
-
"author": author,
|
| 262 |
-
"date": date,
|
| 263 |
-
"confidence": round(confidence, 2)
|
| 264 |
-
}
|
| 265 |
-
|
| 266 |
-
# Write as JSONL
|
| 267 |
-
outf.write(json.dumps(pattern, ensure_ascii=False) + '\n')
|
| 268 |
-
patterns_extracted += 1
|
| 269 |
-
|
| 270 |
-
print(f"\nExtracted {patterns_extracted} patterns to {output_path}")
|
| 271 |
-
return patterns_extracted
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
def main():
    """CLI entry point: parse arguments, validate the repository, run extraction.

    Exits with status 1 (after printing to stderr) when --repo-path does not
    point at a Git repository.
    """
    parser = argparse.ArgumentParser(
        description="Extract code patterns from Git history for training data"
    )
    parser.add_argument(
        "--repo-path",
        type=str,
        required=True,
        help="Path to the Git repository"
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output JSONL file path"
    )
    parser.add_argument(
        "--since-date",
        type=str,
        default=None,
        help="Only extract commits since this date (YYYY-MM-DD)"
    )

    args = parser.parse_args()

    # Validate repo path. Use os.path.exists rather than os.path.isdir:
    # in linked worktrees and submodules ".git" is a regular file pointing
    # at the real git directory, and isdir() would wrongly reject them.
    if not os.path.exists(os.path.join(args.repo_path, '.git')):
        print(f"Error: {args.repo_path} is not a Git repository", file=sys.stderr)
        sys.exit(1)

    # Run extraction
    extract_patterns(args.repo_path, args.output, args.since_date)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/extract_rtmp_tools.py
DELETED
|
@@ -1,174 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Extract training data from RTMP tools for Stack 2.9
|
| 4 |
-
Creates synthetic tool-use examples from the RTMP codebase
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import os
|
| 8 |
-
import json
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
|
| 11 |
-
RTMP_DIR = "/Users/walidsobhi/.openclaw/workspace/RTMP"
|
| 12 |
-
OUTPUT_DIR = "/Users/walidsobhi/.openclaw/workspace/stack-2.9/data/rtmp-tools"
|
| 13 |
-
|
| 14 |
-
def get_tool_description(tool_name: str) -> str:
    """Return a human-readable description for a tool directory name.

    Args:
        tool_name: Directory-style tool name, e.g. "BashTool".

    Returns:
        The known description for the tool, or the generic fallback
        "Tool: <tool_name>" when the name is not in the table.
    """
    descriptions = {
        "BashTool": "Execute shell commands in a sandboxed environment",
        "FileReadTool": "Read file contents from the filesystem",
        "FileWriteTool": "Write content to files",
        "FileEditTool": "Edit files using sed-style replacements",
        "GlobTool": "Find files matching glob patterns",
        "GrepTool": "Search for patterns in files",
        "TaskCreateTool": "Create tasks in the task list",
        "TaskListTool": "List all tasks in the task list",
        "TaskUpdateTool": "Update task status and details",
        "TaskGetTool": "Get details of a specific task",
        "WebSearchTool": "Search the web for information",
        "WebFetchTool": "Fetch and analyze web pages",
        "SkillTool": "Execute user-invocable skills",
        "McpTool": "Call MCP (Model Context Protocol) tools",
        "AgentTool": "Delegate tasks to sub-agents",
    }
    # BUG FIX: the original stripped the "Tool" suffix before the lookup
    # (descriptions.get(tool_name.replace("Tool", ""), ...)) while the dict
    # keys keep the suffix, so *every* lookup fell through to the fallback.
    # Look the name up as-is.
    return descriptions.get(tool_name, f"Tool: {tool_name}")
|
| 34 |
-
|
| 35 |
-
def extract_tool_examples():
    """Build synthetic tool-use training examples from the RTMP tools directory.

    Scans RTMP_DIR/tools for ``*Tool`` subdirectories and emits one generic
    Q&A example per tool, plus a concrete tool-call example for a handful of
    well-known tools. Returns a list of chat-format example dicts (empty if
    the tools directory is missing).
    """

    def _specific_example(name):
        # Concrete tool-call example for well-known tools, or None.
        # Ordered substring checks — first match wins, mirroring the
        # original elif chain.
        if "Bash" in name:
            return {
                "messages": [
                    {"role": "user", "content": "List all files in the current directory"},
                    {"role": "assistant", "tool_calls": [
                        {"name": "Bash", "arguments": {"command": "ls -la"}}
                    ]}
                ]
            }
        if "FileRead" in name:
            return {
                "messages": [
                    {"role": "user", "content": "Read the contents of config.json"},
                    {"role": "assistant", "tool_calls": [
                        {"name": "FileRead", "arguments": {"file_path": "config.json"}}
                    ]}
                ]
            }
        if "Glob" in name:
            return {
                "messages": [
                    {"role": "user", "content": "Find all TypeScript files in the project"},
                    {"role": "assistant", "tool_calls": [
                        {"name": "Glob", "arguments": {"pattern": "**/*.ts"}}
                    ]}
                ]
            }
        if "Grep" in name:
            return {
                "messages": [
                    {"role": "user", "content": "Find all occurrences of 'TODO' in the code"},
                    {"role": "assistant", "tool_calls": [
                        {"name": "Grep", "arguments": {"pattern": "TODO", "path": "."}}
                    ]}
                ]
            }
        if "TaskCreate" in name:
            return {
                "messages": [
                    {"role": "user", "content": "Create a task to fix the login bug"},
                    {"role": "assistant", "tool_calls": [
                        {"name": "TaskCreate", "arguments": {
                            "subject": "Fix login bug",
                            "description": "Investigate and fix the login issue"
                        }}
                    ]}
                ]
            }
        if "WebSearch" in name:
            return {
                "messages": [
                    {"role": "user", "content": "Search for latest Python 3.14 features"},
                    {"role": "assistant", "tool_calls": [
                        {"name": "WebSearch", "arguments": {"query": "Python 3.14 new features"}}
                    ]}
                ]
            }
        return None

    collected = []

    tools_dir = Path(RTMP_DIR) / "tools"
    if not tools_dir.exists():
        print(f"❌ Tools directory not found: {tools_dir}")
        return collected

    # Only subdirectories named like "<Name>Tool" count as tools.
    candidates = [d for d in tools_dir.iterdir() if d.is_dir() and d.name.endswith("Tool")]

    for tool_dir in candidates:
        name = tool_dir.name

        # Enumerate the implementation files (kept for parity with the
        # original; the result is not used further).
        _main_files = list(tool_dir.glob("*.tsx")) + list(tool_dir.glob("*.ts"))

        desc = get_tool_description(name)

        # Generic "what does this tool do" Q&A pair.
        answer = (
            f"The {name} allows you to {desc.lower()}.\n\n"
            f"This is one of the core tools in RTMP (Real-Time AI Assistant).\n\n"
            f"Tools like {name} enable the assistant to interact with the filesystem, "
            f"execute commands, manage tasks, and more."
        )
        collected.append({
            "messages": [
                {
                    "role": "user",
                    "content": f"How do I use the {name}? What does it do?"
                },
                {
                    "role": "assistant",
                    "content": answer
                }
            ]
        })

        # Optional concrete tool-call example for common tools.
        extra = _specific_example(name)
        if extra is not None:
            collected.append(extra)

    return collected
|
| 145 |
-
|
| 146 |
-
def main():
    """Entry point: extract tool examples, persist them as JSONL, show a preview."""
    banner = "=" * 60
    print(banner)
    print("Extracting RTMP Tool Patterns for Training")
    print(banner)

    # Make sure the destination directory exists before writing.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    examples = extract_tool_examples()

    print(f"\n✅ Extracted {len(examples)} tool usage examples")

    # Persist one JSON object per line (JSONL).
    output_file = os.path.join(OUTPUT_DIR, "tool_patterns.jsonl")
    with open(output_file, 'w') as f:
        f.writelines(json.dumps(ex) + '\n' for ex in examples)

    print(f"✅ Saved to: {output_file}")

    # Preview the first few user prompts.
    print("\n📋 Sample examples:")
    for idx, ex in enumerate(examples[:3], start=1):
        user_msg = ex["messages"][0]["content"]
        print(f" {idx}. User: {user_msg[:60]}...")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/extract_rtmp_tools.ts
DELETED
|
@@ -1,115 +0,0 @@
|
|
| 1 |
-
// Extract tool schemas from RTMP for training data
|
| 2 |
-
//
|
| 3 |
-
// This script extracts tool definitions from the RTMP codebase
|
| 4 |
-
// and adds them to stack-2.9's training data catalog.
|
| 5 |
-
|
| 6 |
-
import { readdir, readFile, writeFile } from 'fs/promises'
|
| 7 |
-
import { join, basename } from 'path'
|
| 8 |
-
|
| 9 |
-
const RTMP_TOOLS_DIR = '/Users/walidsobhi/.openclaw/workspace/RTMP/tools'
|
| 10 |
-
const STACK_CATALOG = '/Users/walidsobhi/.openclaw/workspace/stack-2.9/training-data/tools/catalog.json'
|
| 11 |
-
|
| 12 |
-
// Shape of one catalog entry describing an RTMP tool.
interface ToolSchema {
  // Directory name of the tool, e.g. "BashTool".
  tool: string
  // Short human-readable description (first /** */ comment of prompt.ts,
  // truncated to 200 characters), or a "<Name> tool" fallback.
  description: string
  // Whether the tool directory contains a prompt.ts file.
  hasPrompt: boolean
  // Whether the tool directory contains any .ts/.tsx files.
  hasImplementation: boolean
  // Input schema placeholder; this script always writes an empty object.
  inputSchema: Record<string, unknown>
}
|
| 19 |
-
|
| 20 |
-
/**
 * Scan RTMP_TOOLS_DIR and build a ToolSchema entry per tool directory.
 *
 * For each entry we try to read prompt.ts and use its first JSDoc block
 * (stripped of comment markers, trimmed, capped at 200 chars) as the
 * description, and we record whether any .ts/.tsx implementation files exist.
 * Non-directory entries are skipped. Never throws for individual tools —
 * per-tool read failures simply leave the corresponding flags false.
 */
async function extractToolSchemas(): Promise<ToolSchema[]> {
  const tools: ToolSchema[] = []
  const toolDirs = await readdir(RTMP_TOOLS_DIR)

  for (const toolDir of toolDirs) {
    const toolPath = join(RTMP_TOOLS_DIR, toolDir)
    // Probe with readdir: succeeds only for directories, so plain files
    // in the tools dir are filtered out here.
    const stat = await readdir(toolPath).then(() => true).catch(() => false)

    if (!stat) continue

    // Try to extract tool name and description from tool files
    let description = ''
    let hasPrompt = false
    let hasImplementation = false

    try {
      // Check for prompt.ts
      const promptPath = join(toolPath, 'prompt.ts')
      const promptContent = await readFile(promptPath, 'utf-8')
      hasPrompt = true

      // Extract first meaningful comment as description
      const comments = promptContent.match(/\/\*\*[\s\S]*?\*\//g)
      if (comments && comments.length > 0) {
        const comment = comments[0]
        // Strip the /** */ delimiters and per-line "* " prefixes,
        // then cap the description length.
        description = comment
          .replace(/\/\*\*|\*\//g, '')
          .replace(/^\s*\*\s?/gm, '')
          .trim()
          .slice(0, 200)
      }
    } catch {
      // No prompt.ts
    }

    try {
      // Check for implementation files
      const toolFiles = await readdir(toolPath)
      hasImplementation = toolFiles.some(f =>
        f.endsWith('.ts') || f.endsWith('.tsx')
      )
    } catch {
      // Ignore
    }

    // Format tool name (remove Tool suffix for cleaner names)
    const toolName = toolDir.replace(/Tool$/, '')

    tools.push({
      tool: toolDir,
      description: description || `${toolName} tool`,
      hasPrompt,
      hasImplementation,
      inputSchema: {}
    })
  }

  return tools
}
|
| 79 |
-
|
| 80 |
-
async function main() {
|
| 81 |
-
console.log('Extracting tool schemas from RTMP...')
|
| 82 |
-
|
| 83 |
-
const tools = await extractToolSchemas()
|
| 84 |
-
console.log(`Found ${tools.length} tools`)
|
| 85 |
-
|
| 86 |
-
// Read existing catalog
|
| 87 |
-
let existingTools: ToolSchema[] = []
|
| 88 |
-
try {
|
| 89 |
-
const existingContent = await readFile(STACK_CATALOG, 'utf-8')
|
| 90 |
-
existingTools = JSON.parse(existingContent)
|
| 91 |
-
} catch {
|
| 92 |
-
console.log('No existing catalog found')
|
| 93 |
-
}
|
| 94 |
-
|
| 95 |
-
// Merge with existing (avoid duplicates)
|
| 96 |
-
const existingNames = new Set(existingTools.map(t => t.tool))
|
| 97 |
-
const newTools = tools.filter(t => !existingNames.has(t.tool))
|
| 98 |
-
|
| 99 |
-
console.log(`Adding ${newTools.length} new tools`)
|
| 100 |
-
|
| 101 |
-
// Combine
|
| 102 |
-
const allTools = [...existingTools, ...newTools]
|
| 103 |
-
|
| 104 |
-
// Write updated catalog
|
| 105 |
-
await writeFile(STACK_CATALOG, JSON.stringify(allTools, null, 2))
|
| 106 |
-
console.log(`Updated catalog with ${allTools.length} tools`)
|
| 107 |
-
|
| 108 |
-
// Also print summary
|
| 109 |
-
console.log('\nNew tools added:')
|
| 110 |
-
for (const tool of newTools) {
|
| 111 |
-
console.log(` - ${tool.tool}`)
|
| 112 |
-
}
|
| 113 |
-
}
|
| 114 |
-
|
| 115 |
-
main().catch(console.error)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/extract_rtmp_tools_advanced.py
DELETED
|
@@ -1,199 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Extract advanced training data from RTMP tools
|
| 4 |
-
Includes actual patterns, best practices, and tool definitions
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import os
|
| 8 |
-
import json
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
|
| 11 |
-
RTMP_DIR = "/Users/walidsobhi/.openclaw/workspace/RTMP"
|
| 12 |
-
OUTPUT_DIR = "/Users/walidsobhi/.openclaw/workspace/stack-2.9/data/rtmp-tools"
|
| 13 |
-
|
| 14 |
-
def get_tool_definitions():
    """Return the hand-curated tool-use conversations used for training.

    Each entry is a chat transcript: a ``messages`` list pairing a user
    request with the matching assistant tool call. Some entries also carry
    a system prompt or a tool-result turn, and one shows two tool calls in
    a single assistant turn. The list is a fixed set of 13 examples.
    """
    return [
        # Bash: full round trip with a system prompt and a tool result.
        {
            "messages": [
                {"role": "system", "content": "You are a helpful AI coding assistant with access to various tools."},
                {"role": "user", "content": "List all files in the current directory including hidden files"},
                {"role": "assistant", "tool_calls": [
                    {"name": "Bash", "arguments": {"command": "ls -la"}}
                ]},
                {"role": "tool", "content": "total 128\ndrwxr-xr-x 1 user staff 4096 Apr 4 10:00 .\ndrwxr-xr-x 1 user staff 4096 Apr 4 10:00 ..\n-rw-r--r-- 1 user staff 1024 Apr 4 09:00 file.ts"}
            ]
        },
        # Glob: pattern-based file search.
        {
            "messages": [
                {"role": "user", "content": "Find all TypeScript files in the src directory"},
                {"role": "assistant", "tool_calls": [
                    {"name": "Glob", "arguments": {"pattern": "src/**/*.ts"}}
                ]}
            ]
        },
        # Grep: content search.
        {
            "messages": [
                {"role": "user", "content": "Search for all TODO comments in the codebase"},
                {"role": "assistant", "tool_calls": [
                    {"name": "Grep", "arguments": {"pattern": "TODO", "path": ".", "output_mode": "content"}}
                ]}
            ]
        },
        # FileRead.
        {
            "messages": [
                {"role": "user", "content": "Read the package.json file"},
                {"role": "assistant", "tool_calls": [
                    {"name": "FileRead", "arguments": {"file_path": "package.json"}}
                ]}
            ]
        },
        # TaskCreate with a full argument set.
        {
            "messages": [
                {"role": "user", "content": "Create a new task to implement the login feature"},
                {"role": "assistant", "tool_calls": [
                    {"name": "TaskCreate", "arguments": {
                        "subject": "Implement login feature",
                        "description": "Add user authentication with OAuth",
                        "activeForm": "Implementing login feature"
                    }}
                ]}
            ]
        },
        # WebSearch.
        {
            "messages": [
                {"role": "user", "content": "Search the web for how to use PyTorch MPS on Apple Silicon"},
                {"role": "assistant", "tool_calls": [
                    {"name": "WebSearch", "arguments": {"query": "PyTorch MPS Apple Silicon tutorial"}}
                ]}
            ]
        },
        # WebFetch.
        {
            "messages": [
                {"role": "user", "content": "Fetch the documentation from https://pytorch.org/docs"},
                {"role": "assistant", "tool_calls": [
                    {"name": "WebFetch", "arguments": {"url": "https://pytorch.org/docs", "prompt": "Extract the main features of PyTorch"}}
                ]}
            ]
        },
        # Task management examples.
        {
            "messages": [
                {"role": "user", "content": "Show me all open tasks"},
                {"role": "assistant", "tool_calls": [
                    {"name": "TaskList", "arguments": {}}
                ]}
            ]
        },
        {
            "messages": [
                {"role": "user", "content": "What's the status of the login feature task?"},
                {"role": "assistant", "tool_calls": [
                    {"name": "TaskGet", "arguments": {"taskId": "1"}}
                ]}
            ]
        },
        # File operations.
        {
            "messages": [
                {"role": "user", "content": "Write a simple Python script to a new file"},
                {"role": "assistant", "tool_calls": [
                    {"name": "FileWrite", "arguments": {
                        "file_path": "hello.py",
                        "content": "print('Hello, World!')\n"
                    }}
                ]}
            ]
        },
        {
            "messages": [
                {"role": "user", "content": "Edit the config file to enable debug mode"},
                {"role": "assistant", "tool_calls": [
                    {"name": "FileEdit", "arguments": {
                        "file_path": "config.json",
                        "old_string": "\"debug\": false",
                        "new_string": "\"debug\": true"
                    }}
                ]}
            ]
        },
        # Complex multi-step example: two tool calls in one assistant turn.
        {
            "messages": [
                {"role": "user", "content": "First find all React component files, then read one of them"},
                {"role": "assistant", "tool_calls": [
                    {"name": "Glob", "arguments": {"pattern": "**/*.{tsx,jsx}"}},
                    {"name": "TaskCreate", "arguments": {
                        "subject": "Read React components",
                        "description": "Read the found component files",
                        "activeForm": "Reading React components"
                    }}
                ]}
            ]
        },
        # Skill invocation.
        {
            "messages": [
                {"role": "user", "content": "Commit the changes with a message"},
                {"role": "assistant", "tool_calls": [
                    {"name": "Skill", "arguments": {"skill": "git-commit", "args": "-m 'Fix bug'"}}
                ]}
            ]
        },
    ]
|
| 159 |
-
|
| 160 |
-
def main():
    """Generate advanced tool examples, save them, and build the combined file.

    Writes advanced_tool_patterns.jsonl, then concatenates it with the
    previously generated tool_patterns.jsonl (if present) into
    combined_tools.jsonl, and prints progress along the way.
    """
    print("=" * 60)
    print("Extracting Advanced RTMP Tool Patterns")
    print("=" * 60)

    # Get tool examples
    tools = get_tool_definitions()

    print(f"\n✅ Created {len(tools)} advanced tool examples")

    # FIX: ensure the output directory exists (consistent with
    # extract_rtmp_tools.py) — previously this crashed with
    # FileNotFoundError when this script ran first.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Save to JSONL
    output_file = os.path.join(OUTPUT_DIR, "advanced_tool_patterns.jsonl")
    with open(output_file, 'w') as f:
        for ex in tools:
            f.write(json.dumps(ex) + '\n')

    print(f"✅ Saved to: {output_file}")

    # Combine with the previously extracted simple patterns, when present.
    prev_file = os.path.join(OUTPUT_DIR, "tool_patterns.jsonl")
    combined_file = os.path.join(OUTPUT_DIR, "combined_tools.jsonl")

    with open(combined_file, 'w') as out:
        # Previous simple patterns
        if os.path.exists(prev_file):
            with open(prev_file) as f:
                for line in f:
                    out.write(line)
        # Advanced patterns
        with open(output_file) as f:
            for line in f:
                out.write(line)

    # Report the combined example count (one JSONL line per example).
    # (Dropped the stray f-prefix: the string has no placeholders.)
    print("\n📦 Total combined examples:")
    with open(combined_file) as f:
        count = sum(1 for _ in f)
    print(f" {count} tool usage examples")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/generate_code_completion_data.py
DELETED
|
@@ -1,262 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Synthetic Code Completion Training Data Generator for Stack 2.9
|
| 4 |
-
Generates training examples for pure code completion without tools.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import json
|
| 8 |
-
import random
|
| 9 |
-
import argparse
|
| 10 |
-
from pathlib import Path
|
| 11 |
-
from typing import Dict, List
|
| 12 |
-
|
| 13 |
-
LANGUAGES = ["python", "javascript", "go", "rust", "typescript"]
|
| 14 |
-
DIFFICULTY_EASY = "easy"
|
| 15 |
-
DIFFICULTY_MEDIUM = "medium"
|
| 16 |
-
DIFFICULTY_HARD = "hard"
|
| 17 |
-
|
| 18 |
-
# Code templates organized by language -> difficulty -> templates
|
| 19 |
-
CODE_TEMPLATES = {
|
| 20 |
-
"python": {
|
| 21 |
-
DIFFICULTY_EASY: [
|
| 22 |
-
{"context": "def greet(name):", "completion": ' return f"Hello, {name}!"', "description": "Simple greeting function"},
|
| 23 |
-
{"context": "numbers = [1, 2, 3, 4, 5]\n\n", "completion": "for num in numbers:\n print(num)", "description": "Loop through list"},
|
| 24 |
-
{"context": "class Person:\n def __init__(self, name):", "completion": " self.name = name", "description": "Class init"},
|
| 25 |
-
{"context": "def add(a, b):\n ", "completion": " return a + b", "description": "Add function"},
|
| 26 |
-
{"context": "if x > 0:\n print('positive')\nelif x < 0:\n ", "completion": " print('negative')", "description": "Conditional"},
|
| 27 |
-
],
|
| 28 |
-
DIFFICULTY_MEDIUM: [
|
| 29 |
-
{"context": "def fibonacci(n):\n if n <= 1:\n return n\n ", "completion": " return fibonacci(n-1) + fibonacci(n-2)", "description": "Fibonacci"},
|
| 30 |
-
{"context": "class Calculator:\n def __init__(self):\n self.result = 0\n \n def add(self, x):\n ", "completion": " self.result += x\n return self.result", "description": "Calculator"},
|
| 31 |
-
{"context": "async def fetch_data(url):\n async with aiohttp.ClientSession() as session:\n async with session.get(url) as response:\n ", "completion": " return await response.json()", "description": "Async HTTP"},
|
| 32 |
-
{"context": "def validate_email(email):\n pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'\n ", "completion": " return re.match(pattern, email) is not None", "description": "Email validation"},
|
| 33 |
-
{"context": "@app.route('/users/<int:user_id>')\ndef get_user(user_id):\n user = User.query.get_or_404(user_id)\n ", "completion": " return jsonify(user.to_dict())", "description": "Flask route"},
|
| 34 |
-
],
|
| 35 |
-
DIFFICULTY_HARD: [
|
| 36 |
-
{"context": "class LRUCache:\n def __init__(self, capacity):\n self.capacity = capacity\n self.cache = OrderedDict()\n \n def get(self, key):\n if key not in self.cache:\n return -1\n ", "completion": " self.cache.move_to_end(key)\n return self.cache[key]", "description": "LRU Cache"},
|
| 37 |
-
{"context": "def merge_sort(arr):\n if len(arr) <= 1:\n return arr\n \n mid = len(arr) // 2\n left = merge_sort(arr[:mid])\n right = merge_sort(arr[mid:])\n ", "completion": " return merge(left, right)", "description": "Merge sort"},
|
| 38 |
-
{"context": "class BinaryTree:\n def __init__(self, value):\n self.value = value\n self.left = None\n self.right = None\n \n def inorder(self, node, result=None):\n if result is None:\n result = []\n if node:\n ", "completion": " self.inorder(node.left, result)\n result.append(node.value)\n self.inorder(node.right, result)\n return result", "description": "Binary tree inorder"},
|
| 39 |
-
{"context": "def bellman_ford(graph, source):\n dist = {v: float('inf') for v in graph}\n dist[source] = 0\n \n for _ in range(len(graph) - 1):\n for u, v, w in graph.edges:\n if dist[u] != float('inf') and dist[u] + w < dist[v]:\n ", "completion": " dist[v] = dist[u] + w\n return dist", "description": "Bellman-Ford"},
|
| 40 |
-
],
|
| 41 |
-
},
|
| 42 |
-
"javascript": {
|
| 43 |
-
DIFFICULTY_EASY: [
|
| 44 |
-
{"context": "const greet = (name) => {", "completion": ' return `Hello, ${name}!`;', "description": "Arrow greeting"},
|
| 45 |
-
{"context": "const numbers = [1, 2, 3, 4, 5];\n\n", "completion": "numbers.forEach(num => console.log(num));", "description": "forEach loop"},
|
| 46 |
-
{"context": "class Person {\n constructor(name) {", "completion": " this.name = name;", "description": "JS class constructor"},
|
| 47 |
-
{"context": "const add = (a, b) => {", "completion": " return a + b;", "description": "Add function"},
|
| 48 |
-
{"context": "if (x > 0) {\n console.log('positive');\n} else if (x < 0) {\n ", "completion": " console.log('negative');", "description": "Conditional"},
|
| 49 |
-
],
|
| 50 |
-
DIFFICULTY_MEDIUM: [
|
| 51 |
-
{"context": "const fetchData = async (url) => {\n try {\n const response = await fetch(url);\n ", "completion": " return await response.json();\n } catch (error) {\n console.error('Error:', error);\n }", "description": "Async fetch"},
|
| 52 |
-
{"context": "class EventEmitter {\n constructor() {\n this.events = {};\n }\n \n on(event, callback) {\n ", "completion": " if (!this.events[event]) this.events[event] = [];\n this.events[event].push(callback);", "description": "Event emitter"},
|
| 53 |
-
{"context": "const debounce = (func, delay) => {\n let timeoutId;\n return (...args) => {\n clearTimeout(timeoutId);\n ", "completion": " timeoutId = setTimeout(() => func.apply(this, args), delay);", "description": "Debounce"},
|
| 54 |
-
{"context": "const memoize = (fn) => {\n const cache = new Map();\n return (n) => {\n if (cache.has(n)) {\n return cache.get(n);\n }\n ", "completion": " const result = fn(n);\n cache.set(n, result);\n return result;", "description": "Memoize"},
|
| 55 |
-
],
|
| 56 |
-
DIFFICULTY_HARD: [
|
| 57 |
-
{"context": "class PromisePool {\n constructor(maxConcurrent) {\n this.maxConcurrent = maxConcurrent;\n this.running = 0;\n this.queue = [];\n }\n \n add(promiseFn) {\n return new Promise((resolve, reject) => {\n ", "completion": " this.queue.push({ promiseFn, resolve, reject });\n this.process();\n });", "description": "Promise pool"},
|
| 58 |
-
{"context": "const virtualDOM = {\n createElement(tag, props, ...children) {\n return {\n tag,\n props: props || {},\n children: children.flat(),\n };\n },\n render(vnode, container) {\n ", "completion": " const el = document.createElement(vnode.tag);\n Object.entries(vnode.props || {}).forEach(([key, value]) => el.setAttribute(key, value));\n vnode.children.forEach(child => {\n if (typeof child === 'string') el.appendChild(document.createTextNode(child));\n else this.render(child, el);\n });\n container.appendChild(el);", "description": "Virtual DOM"},
|
| 59 |
-
],
|
| 60 |
-
},
|
| 61 |
-
"go": {
|
| 62 |
-
DIFFICULTY_EASY: [
|
| 63 |
-
{"context": "func greet(name string) string {", "completion": ' return "Hello, " + name + "!"', "description": "Greet function"},
|
| 64 |
-
{"context": "func add(a, b int) int {", "completion": " return a + b", "description": "Add function"},
|
| 65 |
-
{"context": "type Person struct {\n Name string\n ", "completion": " Age int", "description": "Struct definition"},
|
| 66 |
-
{"context": "for i := 0; i < 10; i++ {\n ", "completion": " fmt.Println(i)", "description": "For loop"},
|
| 67 |
-
{"context": "if x > 0 {\n fmt.Println(\"positive\")\n} else {\n ", "completion": ' fmt.Println("non-positive")', "description": "If-else"},
|
| 68 |
-
],
|
| 69 |
-
DIFFICULTY_MEDIUM: [
|
| 70 |
-
{"context": "func (p Person) Greet() string {", "completion": ' return fmt.Sprintf("Hello, %s!", p.Name)', "description": "Method"},
|
| 71 |
-
{"context": "func worker(jobs <-chan int, results chan<- int) {\n for j := range jobs {\n ", "completion": " results <- j * 2", "description": "Worker goroutine"},
|
| 72 |
-
{"context": "type Handler interface {\n Handle(ctx context.Context, req Request) Response\n ", "completion": " Cleanup(ctx context.Context)", "description": "Interface"},
|
| 73 |
-
{"context": "func fetchData(url string) ([]byte, error) {\n resp, err := http.Get(url)\n if err != nil {\n return nil, err\n }\n defer resp.Body.Close()\n ", "completion": " return io.ReadAll(resp.Body)", "description": "HTTP GET"},
|
| 74 |
-
],
|
| 75 |
-
DIFFICULTY_HARD: [
|
| 76 |
-
{"context": "type TreeNode struct {\n Val int\n Left *TreeNode\n Right *TreeNode\n}\n\nfunc (root *TreeNode) InorderTraversal() []int {\n var result []int\n var inorder func(*TreeNode)\n inorder = func(node *TreeNode) {\n if node == nil {\n return\n }\n ", "completion": " inorder(node.Left)\n result = append(result, node.Val)\n inorder(node.Right)", "description": "Tree inorder"},
|
| 77 |
-
{"context": "func (c *Client) StreamProcess(ctx context.Context, req *Request, stream chan<- *Response) error {\n for {\n select {\n case <-ctx.Done():\n return ctx.Err()\n default:\n result, err := c.processOne(req)\n if err != nil {\n return err\n }\n ", "completion": " select {\n case stream <- result:\n case <-ctx.Done():\n return ctx.Err()\n }", "description": "Streaming"},
|
| 78 |
-
],
|
| 79 |
-
},
|
| 80 |
-
"rust": {
|
| 81 |
-
DIFFICULTY_EASY: [
|
| 82 |
-
{"context": "fn greet(name: &str) -> String {", "completion": ' format!("Hello, {}!", name)', "description": "Greet function"},
|
| 83 |
-
{"context": "fn add(a: i32, b: i32) -> i32 {", "completion": " a + b", "description": "Add function"},
|
| 84 |
-
{"context": "struct Person {\n name: String,\n ", "completion": " age: u32,", "description": "Struct"},
|
| 85 |
-
{"context": "let numbers = vec![1, 2, 3, 4, 5];\nfor num in &numbers {\n ", "completion": " println!(\"{}\", num);", "description": "For loop"},
|
| 86 |
-
{"context": "fn main() {\n let result = match value {\n Some(x) => x,\n ", "completion": " None => 0,", "description": "Match"},
|
| 87 |
-
],
|
| 88 |
-
DIFFICULTY_MEDIUM: [
|
| 89 |
-
{"context": "impl Person {\n fn new(name: String, age: u32) -> Self {", "completion": " Person { name, age }", "description": "Constructor"},
|
| 90 |
-
{"context": "fn fetch_data(url: &str) -> Result<String, Error> {\n let response = reqwest::blocking::get(url)?;\n ", "completion": " let body = response.text()?;\n Ok(body)", "description": "HTTP request"},
|
| 91 |
-
{"context": "fn process_items<T: Display>(items: Vec<T>) -> String {\n items\n .iter()\n .enumerate()\n .map(|(i, item)| format!(\"{}: {}\", i, item))\n ", "completion": " .collect::<Vec<_>>()\n .join(\", \")", "description": "Iterator chain"},
|
| 92 |
-
{"context": "fn spawn_worker(jobs: Arc<Mutex<Vec<Job>>>) {\n thread::spawn(move || {\n loop {\n let job = {\n let mut jobs = jobs.lock().unwrap();\n jobs.pop()\n };\n match job {\n Some(job) => job.execute(),\n ", "completion": " None => break,\n };\n }\n });", "description": "Worker thread"},
|
| 93 |
-
],
|
| 94 |
-
DIFFICULTY_HARD: [
|
| 95 |
-
{"context": "pub struct LRUCache<K, V> {\n capacity: usize,\n cache: LinkedHashMap<K, V>,\n}\n\nimpl<K: Eq + Hash + Clone, V: Clone> LRUCache<K, V> {\n pub fn get(&mut self, key: &K) -> Option<&V> {\n if self.cache.contains_key(key) {\n ", "completion": " self.cache.remove(key);\n let value = self.cache[key].clone();\n self.cache.insert(key.clone(), value);\n self.cache.get(key)\n } else {\n None\n }", "description": "LRU Cache"},
|
| 96 |
-
{"context": "pub trait Observer<T> {\n fn update(&self, event: &T);\n}\n\npub struct Subject<T> {\n observers: Vec<Box<dyn Observer<T>>>,\n}\n\nimpl<T> Subject<T> {\n pub fn notify(&self, event: &T) {\n for observer in &self.observers {\n ", "completion": " observer.update(event);", "description": "Observer pattern"},
|
| 97 |
-
],
|
| 98 |
-
},
|
| 99 |
-
}
|
| 100 |
-
|
| 101 |
-
# Prompt-style variants; each generated example is tagged with exactly one.
VARIANTS = ["basic", "explain", "debug", "optimize"]

# Per-variant system prompt plus the prefix prepended to the user message
# by create_completion_example().
VARIANT_PROMPTS = {
    "basic": {"system": "You are a helpful AI assistant that helps with code completion.", "user_prefix": "Complete the following code:\n\n"},
    "explain": {"system": "You are a helpful AI assistant that explains and completes code.", "user_prefix": "Explain what this code does and complete it:\n\n"},
    "debug": {"system": "You are a helpful AI assistant that finds bugs and suggests fixes.", "user_prefix": "There's a bug in this code. Fix and complete it:\n\n"},
    "optimize": {"system": "You are a helpful AI assistant that optimizes code for performance.", "user_prefix": "Optimize this code and complete it:\n\n"},
}
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
def create_completion_example(context, completion, language, difficulty, variant, description):
    """Build one chat-format training record for a code-completion task.

    The record pairs a fenced code ``context`` (user turn) with the same
    context plus its ``completion`` (assistant turn), using the system
    prompt and user prefix configured for ``variant`` in VARIANT_PROMPTS.
    The template fields are also echoed as top-level metadata keys.
    """
    prompts = VARIANT_PROMPTS[variant]
    user_text = prompts["user_prefix"] + f"```{language}\n{context}```"
    assistant_text = f"Here's the completed code:\n\n```{language}\n{context}{completion}\n```"
    return {
        "messages": [
            {"role": "system", "content": prompts["system"]},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ],
        "language": language,
        "difficulty": difficulty,
        "variant": variant,
        "description": description,
        "context": context,
        "completion": completion,
    }
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
def generate_examples_for_language(language, difficulty, num_examples, variants):
    """Generate num_examples records for one (language, difficulty) bucket.

    Templates from CODE_TEMPLATES are cycled round-robin so the bucket is
    covered evenly; the prompt variant is drawn at random per example.
    """
    pool = CODE_TEMPLATES[language][difficulty]
    out = []
    for index in range(num_examples):
        tpl = pool[index % len(pool)]
        out.append(
            create_completion_example(
                context=tpl["context"],
                completion=tpl["completion"],
                language=language,
                difficulty=difficulty,
                variant=random.choice(variants),
                description=tpl["description"],
            )
        )
    return out
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
def generate_dataset(num_examples=1000, languages=None, difficulties=None, variants=None, balance=True):
    """Generate the complete, shuffled dataset.

    Args:
        num_examples: total number of records to produce.
        languages: language keys into CODE_TEMPLATES (default: LANGUAGES).
        difficulties: difficulty keys (default: easy/medium/hard constants).
        variants: prompt variants passed through to example creation.
        balance: if True, spread num_examples evenly over every
            (language, difficulty) bucket; otherwise sample buckets randomly.

    Returns:
        A shuffled list of example dicts.
    """
    if languages is None:
        languages = LANGUAGES
    if difficulties is None:
        difficulties = [DIFFICULTY_EASY, DIFFICULTY_MEDIUM, DIFFICULTY_HARD]
    if variants is None:
        variants = VARIANTS

    examples = []

    if balance:
        # BUGFIX: the old code computed the remainder over all buckets but
        # then handed the "+1 extra" to *every* language whose difficulty
        # index was below it, so the generated total could differ from
        # num_examples.  Walk the buckets with a single global index instead:
        # the first `remainder` buckets get one extra example, guaranteeing
        # the totals sum to exactly num_examples.
        total_buckets = len(languages) * len(difficulties)
        base, remainder = divmod(num_examples, total_buckets)
        bucket_index = 0
        for lang in languages:
            for diff in difficulties:
                count = base + (1 if bucket_index < remainder else 0)
                bucket_index += 1
                examples.extend(generate_examples_for_language(lang, diff, count, variants))
    else:
        # Unbalanced mode: draw language, difficulty and template uniformly
        # at random for every example.
        for _ in range(num_examples):
            lang = random.choice(languages)
            diff = random.choice(difficulties)
            template = random.choice(CODE_TEMPLATES[lang][diff])
            examples.append(
                create_completion_example(
                    context=template["context"],
                    completion=template["completion"],
                    language=lang,
                    difficulty=diff,
                    variant=random.choice(variants),
                    description=template["description"],
                )
            )

    random.shuffle(examples)
    return examples
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
def save_jsonl(examples, output_path):
    """Write *examples* to *output_path* as JSONL (one object per line).

    Parent directories are created on demand; non-ASCII text is written
    verbatim (ensure_ascii=False).
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as handle:
        handle.writelines(json.dumps(record, ensure_ascii=False) + '\n' for record in examples)
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
def save_json(examples, output_path):
    """Write *examples* to *output_path* as one pretty-printed JSON array.

    Parent directories are created on demand; non-ASCII text is written
    verbatim (ensure_ascii=False).
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as handle:
        json.dump(examples, handle, ensure_ascii=False, indent=2)
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
def main():
    """CLI entry point: generate the dataset, save it, print statistics."""
    parser = argparse.ArgumentParser(description="Generate synthetic code completion training data")
    parser.add_argument("--num-examples", type=int, default=1000, help="Number of examples to generate")
    parser.add_argument("--output-dir", type=str, default="training-data/code-completion", help="Output directory")
    parser.add_argument("--output-format", choices=["jsonl", "json", "both"], default="jsonl", help="Output format")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()

    # Seed once, up front, so the whole run is reproducible.
    random.seed(args.seed)

    print(f"Generating {args.num_examples} code completion training examples...")
    print(f" Languages: {LANGUAGES}")
    print(f" Output directory: {args.output_dir}")

    examples = generate_dataset(
        num_examples=args.num_examples,
        languages=LANGUAGES,
        difficulties=[DIFFICULTY_EASY, DIFFICULTY_MEDIUM, DIFFICULTY_HARD],
        variants=VARIANTS
    )

    output_dir = Path(args.output_dir)

    # "both" writes the same examples in both formats.
    if args.output_format in ["jsonl", "both"]:
        jsonl_path = output_dir / "code_completion.jsonl"
        save_jsonl(examples, str(jsonl_path))
        print(f"Saved JSONL: {jsonl_path}")

    if args.output_format in ["json", "both"]:
        json_path = output_dir / "code_completion.json"
        save_json(examples, str(json_path))
        print(f"Saved JSON: {json_path}")

    # Statistics
    print(f"\nStatistics:")
    print(f" Total examples: {len(examples)}")

    # Tally per-language and per-difficulty counts in one pass.
    lang_counts = {}
    diff_counts = {}
    for ex in examples:
        lang_counts[ex["language"]] = lang_counts.get(ex["language"], 0) + 1
        diff_counts[ex["difficulty"]] = diff_counts.get(ex["difficulty"], 0) + 1

    print(f" By language:")
    for lang, count in sorted(lang_counts.items(), key=lambda x: x[1], reverse=True):
        print(f" - {lang}: {count}")

    print(f" By difficulty:")
    for diff, count in sorted(diff_counts.items(), key=lambda x: x[1], reverse=True):
        print(f" - {diff}: {count}")

    print(f"\nGeneration complete!")
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
# Script entry point.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/generate_from_rtmp.ts
DELETED
|
@@ -1,114 +0,0 @@
|
|
| 1 |
-
// Generate synthetic training data from RTMP codebase
//
// Extracts code examples and patterns from RTMP to create training data
// for stack-2.9.

// NOTE(review): `basename` is imported but never used in this file.
import { readdir, readFile, writeFile, mkdir } from 'fs/promises'
import { join, basename } from 'path'

// NOTE(review): hard-coded absolute paths — runs only on the author's machine;
// consider deriving these from env vars or CLI arguments.
const RTMP_DIR = '/Users/walidsobhi/.openclaw/workspace/RTMP'
const OUTPUT_DIR = '/Users/walidsobhi/.openclaw/workspace/stack-2.9/training-data/src-derived'

// Chat-format training record: an ordered list of role/content messages.
interface TrainingExample {
  messages: Array<{
    role: string
    content: string
  }>
}

// System prompt prepended to every generated conversation.
const SYSTEM_PROMPT = `You are Stack, an AI coding assistant based on Claude Code. You help with programming tasks, answer questions, use tools when needed, and provide code examples.`
|
| 20 |
-
|
| 21 |
-
// Build synthetic Q&A conversations from the RTMP repository layout:
// one "How do I use tool X?" exchange per tool prompt file (first 10),
// and one per slash command (first 5).  The assistant answers are filled
// from templates — they are placeholders, not real documentation.
async function extractCodeExamples(): Promise<TrainingExample[]> {
  const examples: TrainingExample[] = []

  // Extract from RTMP tools prompts - these are good instruction examples
  const toolsDir = join(RTMP_DIR, 'tools')
  // Missing tools directory degrades to an empty list rather than throwing.
  const toolDirs = await readdir(toolsDir).catch(() => [])

  for (const toolDir of toolDirs.slice(0, 10)) { // Limit to 10 tools
    const promptPath = join(toolsDir, toolDir, 'prompt.ts')
    try {
      // NOTE(review): read only acts as an existence check — `content` is
      // never incorporated into the generated answer.
      const content = await readFile(promptPath, 'utf-8')

      // Extract useful code patterns
      const toolName = toolDir.replace('Tool', '')

      // Create example from tool usage
      examples.push({
        messages: [
          { role: 'system', content: SYSTEM_PROMPT },
          {
            role: 'user',
            content: `How do I use the ${toolName} tool?`
          },
          {
            role: 'assistant',
            content: `The ${toolName} tool allows you to ${getToolDescription(toolName)}. Here's how to use it:\n\n\`\`\`\n// Example usage\n// See the tool source for complete documentation\n\`\`\`\n\nKey features:\n- Feature 1\n- Feature 2`
          }
        ]
      })
    } catch {
      // Skip if no prompt
    }
  }

  // Extract from RTMP commands
  const commandsDir = join(RTMP_DIR, 'commands')
  try {
    const commandDirs = await readdir(commandsDir)
    for (const cmd of commandDirs.slice(0, 5)) {
      examples.push({
        messages: [
          { role: 'system', content: SYSTEM_PROMPT },
          {
            role: 'user',
            content: `How do I use the /${cmd} command?`
          },
          {
            role: 'assistant',
            content: `The /${cmd} command provides ${cmd} functionality. Use it by typing /${cmd} in your prompt.`
          }
        ]
      })
    }
  } catch {
    // Ignore
  }

  return examples
}
|
| 80 |
-
|
| 81 |
-
function getToolDescription(toolName: string): string {
|
| 82 |
-
const descriptions: Record<string, string> = {
|
| 83 |
-
'Bash': 'execute shell commands and get output',
|
| 84 |
-
'FileRead': 'read files from the filesystem',
|
| 85 |
-
'FileWrite': 'write content to files',
|
| 86 |
-
'FileEdit': 'make targeted edits to files',
|
| 87 |
-
'Glob': 'find files matching patterns',
|
| 88 |
-
'Grep': 'search for text in files',
|
| 89 |
-
'LSP': 'get language server features like autocomplete',
|
| 90 |
-
'MCP': 'use Model Context Protocol servers',
|
| 91 |
-
'Task': 'create and manage task lists',
|
| 92 |
-
'Todo': 'track tasks and todo items'
|
| 93 |
-
}
|
| 94 |
-
return descriptions[toolName] || 'perform its designated function'
|
| 95 |
-
}
|
| 96 |
-
|
| 97 |
-
// Orchestrates generation: ensure the output directory exists, build the
// examples, and persist them as JSONL (one JSON object per line).
async function main() {
  console.log('Generating synthetic training data from RTMP...')

  // Ensure output directory exists
  await mkdir(OUTPUT_DIR, { recursive: true }).catch(() => {})

  const examples = await extractCodeExamples()
  console.log(`Generated ${examples.length} training examples`)

  // Write to JSONL
  const outputPath = join(OUTPUT_DIR, 'rtmp_examples.jsonl')
  const content = examples.map(e => JSON.stringify(e)).join('\n')
  await writeFile(outputPath, content)

  console.log(`Written to ${outputPath}`)
}

// Top-level entry: report any failure on stderr instead of an unhandled rejection.
main().catch(console.error)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/generate_random_synthetic.py
DELETED
|
@@ -1,141 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Generate random synthetic tool-use examples.
|
| 4 |
-
Uses tool catalog to create syntactically valid random conversations.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import argparse
import json
import random
from pathlib import Path
from typing import Any, Dict, List
|
| 11 |
-
|
| 12 |
-
def load_tool_catalog(path: str):
    """Parse and return the JSON tool catalog stored at *path*."""
    with open(path, 'r') as handle:
        return json.load(handle)
|
| 15 |
-
|
| 16 |
-
def random_value_for_type(param_name: str) -> Any:
|
| 17 |
-
"""Generate a plausible random value based on parameter name."""
|
| 18 |
-
if 'file' in param_name or 'path' in param_name:
|
| 19 |
-
return random.choice(['src/main.py', 'README.md', 'package.json', 'config.yaml', 'tests/test.js'])
|
| 20 |
-
elif 'command' in param_name or 'cmd' in param_name:
|
| 21 |
-
return random.choice(['npm test', 'pytest', 'git status', 'ls -la', 'make build'])
|
| 22 |
-
elif 'pattern' in param_name or 'glob' in param_name:
|
| 23 |
-
return random.choice(['**/*.py', '**/*.js', '**/*.md'])
|
| 24 |
-
elif 'query' in param_name or 'search' in param_name:
|
| 25 |
-
return random.choice(['TODO', 'FIXME', 'function main'])
|
| 26 |
-
elif 'url' in param_name or 'uri' in param_name:
|
| 27 |
-
return random.choice(['https://api.example.com', 'mcp://server/resource'])
|
| 28 |
-
elif 'status' in param_name:
|
| 29 |
-
return random.choice(['pending', 'in_progress', 'completed'])
|
| 30 |
-
elif 'id' in param_name or 'task_id' in param_name:
|
| 31 |
-
return random.randint(100, 999)
|
| 32 |
-
elif 'name' in param_name:
|
| 33 |
-
return random.choice(['agent1', 'myteam', 'task123'])
|
| 34 |
-
elif 'content' in param_name or 'text' in param_name:
|
| 35 |
-
return 'Lorem ipsum dolor sit amet...'
|
| 36 |
-
elif 'directory' in param_name or 'dir' in param_name:
|
| 37 |
-
return random.choice(['.', 'src', 'tests', 'lib'])
|
| 38 |
-
elif 'branch' in param_name:
|
| 39 |
-
return random.choice(['main', 'develop', 'feature/new'])
|
| 40 |
-
else:
|
| 41 |
-
return f"value_{random.randint(1,100)}"
|
| 42 |
-
|
| 43 |
-
def generate_random_example(tools: List[Dict], tool_count: int = 1) -> Dict[str, Any]:
    """Build one synthetic conversation exercising *tool_count* random tools.

    After a generic user prompt, each selected tool contributes a
    (tool_use, tool_result, acknowledgement) exchange.  Parameter names and
    values are invented — the example is syntactically plausible, not
    semantically grounded.
    """
    selected = random.sample(tools, min(tool_count, len(tools)))

    # Open with a generic, content-free user request.
    opening = random.choice([
        "Help me with something",
        "Do a task",
        "I need assistance",
        "Can you handle this?",
        "Execute this",
        "Run this operation"
    ])
    conversation = [{"role": "user", "content": opening}]

    for index, tool in enumerate(selected):
        name = tool.get("tool") or tool.get("name", "UnknownTool")

        # Invent 1-3 plausible parameters; no strict schema is available,
        # and duplicate draws simply overwrite the earlier value.
        invented = {}
        for _ in range(random.randint(1, 3)):
            key = random.choice(['file_path', 'command', 'pattern', 'query', 'url', 'id', 'name', 'directory'])
            invented[key] = random_value_for_type(key)

        # Assistant invokes the tool...
        conversation.append({
            "role": "assistant",
            "content": f"Using {name}...",
            "tool_use": {
                "name": name,
                "input": invented
            }
        })

        # ...the tool "responds"...
        conversation.append({
            "role": "user",
            "content": "",
            "tool_result": {
                "tool_use_id": f"tool_{index+1}",
                "content": f"Operation completed successfully. Affected items: {random.randint(1,10)}"
            }
        })

        # ...and the assistant acknowledges.
        conversation.append({
            "role": "assistant",
            "content": random.choice(["Done.", "Completed.", "All set."])
        })

    return {
        "messages": conversation,
        "source": "random_synthetic",
        "tools_used": [t.get("tool") for t in selected]
    }
|
| 104 |
-
|
| 105 |
-
def main():
    """CLI entry point: load the tool catalog and emit random examples as JSONL."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--catalog", type=str, default="training-data/tools/catalog.json")
    parser.add_argument("--output", type=str, default="training-data/scaled/random_synthetic.jsonl")
    parser.add_argument("--count", type=int, default=10000)
    parser.add_argument("--tools-per-example", type=int, default=1)
    args = parser.parse_args()

    catalog_path = Path(args.catalog)
    output_path = Path(args.output)

    # Bail out (with a message, exit code 0) when the catalog is missing.
    if not catalog_path.exists():
        print(f"❌ Catalog not found: {catalog_path}")
        return

    tools = load_tool_catalog(catalog_path)
    print(f"🔧 Loaded {len(tools)} tools from catalog")

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Stream examples straight to disk; progress line is rewritten in place
    # every 1000 examples via the trailing carriage return.
    with open(output_path, 'w') as f:
        for i in range(args.count):
            example = generate_random_example(tools, args.tools_per_example)
            f.write(json.dumps(example) + "\n")
            if (i+1) % 1000 == 0:
                print(f" Generated {i+1}/{args.count}...", end='\r')

    print(f"\n✨ Generated {args.count} random synthetic examples")
    print(f" Saved to: {output_path}")

    # Show sample
    with open(output_path, 'r') as f:
        sample = json.loads(f.readline())
    print(f"\n📝 Sample: {len(sample['messages'])} messages, tools: {sample.get('tools_used')}")
|
| 139 |
-
|
| 140 |
-
# Script entry point.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/generate_synthetic.py
DELETED
|
@@ -1,256 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
"""
Generate synthetic training examples using templates.
No external APIs - pure template expansion and variation.
"""

import json
import random
import string  # NOTE(review): unused in the visible code — candidate for removal
from pathlib import Path
from typing import Dict, List, Any
import argparse

# Load tool catalog
| 15 |
-
def load_tools(catalog_path: str) -> List[Dict[str, Any]]:
    """Read the JSON tool catalog at *catalog_path* and return its entries."""
    with open(catalog_path, 'r') as handle:
        return json.load(handle)
|
| 18 |
-
|
| 19 |
-
# Template definitions for each tool
|
| 20 |
-
def get_tool_templates(tool_name: str) -> List[Dict[str, Any]]:
    """Return the scenario templates registered for *tool_name*.

    Each scenario maps a user phrasing to the tool parameters it implies and
    a canned result string; ``{placeholder}`` tokens are substituted later by
    generate_variations().  Tools without a hand-written entry receive a
    single generic scenario.
    """
    generic_scenarios = [
        {
            "user": "Use {tool} with {params}",
            "params": {"arg": "value"},
            "result": "Operation completed"
        }
    ]

    scenario_catalog = {
        "FileReadTool": [
            {
                "user": "Read the file {file_path}",
                "params": {"file_path": "{file_path}"},
                "result": "Contents of {file_path}:\n{file_content}"
            },
            {
                "user": "Show me what's in {file_path}",
                "params": {"file_path": "{file_path}"},
                "result": "Here's {file_path}:\n{file_content}"
            },
            {
                "user": "Can you open {file_path}?",
                "params": {"file_path": "{file_path}"},
                "result": "Opening {file_path}...\n{file_content}"
            }
        ],
        "FileWriteTool": [
            {
                "user": "Create a new file {file_path} with content: {content}",
                "params": {"file_path": "{file_path}", "content": "{content}"},
                "result": "File {file_path} created successfully"
            },
            {
                "user": "Write this to {file_path}: {content}",
                "params": {"file_path": "{file_path}", "content": "{content}"},
                "result": "Wrote to {file_path}"
            }
        ],
        "GlobTool": [
            {
                "user": "Find all {pattern} files",
                "params": {"pattern": "{pattern}"},
                "result": "Found {count} files:\n{files}"
            },
            {
                "user": "List files matching {pattern}",
                "params": {"pattern": "{pattern}"},
                "result": "Matches for {pattern}:\n{files}"
            }
        ],
        "GrepTool": [
            {
                "user": "Search for {pattern} in {directory}",
                "params": {"pattern": "{pattern}", "directory": "{directory}"},
                "result": "Found {count} matches:\n{matches}"
            },
            {
                "user": "Find all occurrences of {pattern}",
                "params": {"pattern": "{pattern}"},
                "result": "Search results:\n{matches}"
            }
        ],
        "BashTool": [
            {
                "user": "Run: {command}",
                "params": {"command": "{command}"},
                "result": "$ {command}\n{output}"
            },
            {
                "user": "Execute {command}",
                "params": {"command": "{command}"},
                "result": "Output:\n{output}"
            }
        ]
    }

    # Fall back to the generic scenario for tools with no curated entry.
    return scenario_catalog.get(tool_name, generic_scenarios)
|
| 98 |
-
|
| 99 |
-
def generate_variations(template: Dict[str, str], count: int, tool_name: str) -> List[Dict[str, Any]]:
    """Expand one scenario template into *count* concrete conversations.

    For each variation: draw fresh parameter values, substitute them into the
    user prompt, tool input and tool result, then wrap everything in the
    standard four-message tool-use conversation shape.
    """
    variations = []
    for _ in range(count):
        # Fresh parameter values per variation.
        values = generate_params(tool_name, template.get("params", {}))

        # Substitute placeholders in prompt, tool input, and result text.
        prompt = fill_template(template.get("user", ""), values)
        tool_input = fill_params(template.get("params", {}), values)
        outcome = fill_template(template.get("result", ""), values)

        variations.append({
            "messages": [
                {"role": "user", "content": prompt},
                {
                    "role": "assistant",
                    "content": "I'll help with that.",
                    "tool_use": {
                        "name": tool_name,
                        "input": tool_input
                    }
                },
                {
                    "role": "user",
                    "content": "",
                    "tool_result": {
                        "tool_use_id": "tool_1",
                        "content": outcome
                    }
                },
                {"role": "assistant", "content": "Done!"}
            ],
            "source": "synthetic_template",
            "tool": tool_name
        })

    return variations
|
| 141 |
-
|
| 142 |
-
def generate_params(tool_name: str, template_params: Dict[str, str]) -> Dict[str, Any]:
|
| 143 |
-
"""Generate realistic parameter values based on tool and template."""
|
| 144 |
-
params = {}
|
| 145 |
-
|
| 146 |
-
for key, placeholder in template_params.items():
|
| 147 |
-
if placeholder == "{file_path}":
|
| 148 |
-
params[key] = random.choice([
|
| 149 |
-
"src/main.py", "package.json", "README.md", "src/utils.js",
|
| 150 |
-
"tests/test_api.py", "config.yaml", "Dockerfile", "requirements.txt"
|
| 151 |
-
])
|
| 152 |
-
elif placeholder == "{pattern}":
|
| 153 |
-
params[key] = random.choice([
|
| 154 |
-
"**/*.py", "**/*.js", "**/*.ts", "*.md", "src/**/*.test.js"
|
| 155 |
-
])
|
| 156 |
-
elif placeholder == "{command}":
|
| 157 |
-
params[key] = random.choice([
|
| 158 |
-
"ls -la", "pwd", "git status", "npm run build",
|
| 159 |
-
"python -m pytest", "make test", "docker ps"
|
| 160 |
-
])
|
| 161 |
-
elif placeholder == "{content}":
|
| 162 |
-
params[key] = random.choice([
|
| 163 |
-
"console.log('Hello, World!');",
|
| 164 |
-
"def hello():\n return 'Hello'",
|
| 165 |
-
"# TODO: implement\npass",
|
| 166 |
-
"import React from 'react';"
|
| 167 |
-
])
|
| 168 |
-
elif placeholder == "{directory}":
|
| 169 |
-
params[key] = random.choice([
|
| 170 |
-
".", "src", "tests", "lib", "app/components"
|
| 171 |
-
])
|
| 172 |
-
else:
|
| 173 |
-
# Generic placeholder
|
| 174 |
-
params[key] = f"generated_value_{random.randint(1,1000)}"
|
| 175 |
-
|
| 176 |
-
return params
|
| 177 |
-
|
| 178 |
-
def fill_template(template: str, params: Dict[str, Any]) -> str:
    """Substitute every ``{key}`` occurrence in *template* with ``str(value)``."""
    text = template
    for name, value in params.items():
        text = text.replace("{" + name + "}", str(value))
    return text
|
| 185 |
-
|
| 186 |
-
def fill_params(template_params: Dict[str, str], params: Dict[str, Any]) -> Dict[str, Any]:
|
| 187 |
-
"""Fill parameter templates."""
|
| 188 |
-
filled = {}
|
| 189 |
-
for key, placeholder in template_params.items():
|
| 190 |
-
if placeholder in [f"{{{k}}}" for k in params.keys()]:
|
| 191 |
-
# Find matching param key
|
| 192 |
-
param_key = placeholder.strip("{}")
|
| 193 |
-
filled[key] = params.get(param_key, placeholder)
|
| 194 |
-
else:
|
| 195 |
-
filled[key] = params.get(key, placeholder)
|
| 196 |
-
return filled
|
| 197 |
-
|
| 198 |
-
def main():
|
| 199 |
-
parser = argparse.ArgumentParser()
|
| 200 |
-
parser.add_argument("--catalog", type=str, default="training-data/tools/catalog.json")
|
| 201 |
-
parser.add_argument("--output", type=str, default="training-data/scaled/template_synthetic.jsonl")
|
| 202 |
-
parser.add_argument("--examples-per-tool", type=int, default=500)
|
| 203 |
-
parser.add_argument("--tools-limit", type=int, default=None, help="Limit number of tools to process")
|
| 204 |
-
args = parser.parse_args()
|
| 205 |
-
|
| 206 |
-
catalog_path = Path(args.catalog)
|
| 207 |
-
output_path = Path(args.output)
|
| 208 |
-
|
| 209 |
-
if not catalog_path.exists():
|
| 210 |
-
print(f"❌ Tool catalog not found: {catalog_path}")
|
| 211 |
-
return
|
| 212 |
-
|
| 213 |
-
tools = load_tools(catalog_path)
|
| 214 |
-
if args.tools_limit:
|
| 215 |
-
tools = tools[:args.tools_limit]
|
| 216 |
-
|
| 217 |
-
print(f"🔧 Generating synthetic examples for {len(tools)} tools")
|
| 218 |
-
print(f" Target: {args.examples_per_tool} examples per tool")
|
| 219 |
-
print(f" Total expected: ~{len(tools) * args.examples_per_tool} examples")
|
| 220 |
-
|
| 221 |
-
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 222 |
-
|
| 223 |
-
total_examples = 0
|
| 224 |
-
with open(output_path, 'w') as f:
|
| 225 |
-
for tool in tools:
|
| 226 |
-
tool_name = tool.get("tool") or tool.get("name", "Unknown")
|
| 227 |
-
templates = get_tool_templates(tool_name)
|
| 228 |
-
|
| 229 |
-
if not templates:
|
| 230 |
-
print(f"⚠️ No templates for {tool_name}, skipping")
|
| 231 |
-
continue
|
| 232 |
-
|
| 233 |
-
# Generate examples for each template
|
| 234 |
-
examples_per_template = args.examples_per_tool // len(templates)
|
| 235 |
-
|
| 236 |
-
for template in templates:
|
| 237 |
-
examples = generate_variations(template, examples_per_template, tool_name)
|
| 238 |
-
for ex in examples:
|
| 239 |
-
f.write(json.dumps(ex) + "\n")
|
| 240 |
-
total_examples += 1
|
| 241 |
-
|
| 242 |
-
print(f"✅ {tool_name}: {examples_per_template * len(templates)} examples")
|
| 243 |
-
|
| 244 |
-
print(f"\n✨ Generated {total_examples} synthetic examples")
|
| 245 |
-
print(f" Saved to: {output_path}")
|
| 246 |
-
|
| 247 |
-
# Create a sample
|
| 248 |
-
print("\n📝 Sample example:")
|
| 249 |
-
with open(output_path, 'r') as f:
|
| 250 |
-
first_line = f.readline()
|
| 251 |
-
if first_line:
|
| 252 |
-
sample = json.loads(first_line)
|
| 253 |
-
print(f" User: {sample['messages'][0]['content'][:80]}...")
|
| 254 |
-
|
| 255 |
-
if __name__ == "__main__":
|
| 256 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/generate_synthetic_v2.py
DELETED
|
@@ -1,316 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Generate high-quality synthetic training data using tool-specific templates.
|
| 4 |
-
Each tool gets realistic scenarios with proper parameters.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import json
|
| 8 |
-
import random
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
from typing import Dict, List, Any
|
| 11 |
-
import argparse
|
| 12 |
-
|
| 13 |
-
# Comprehensive templates for all tools
|
| 14 |
-
TOOL_TEMPLATES = {
|
| 15 |
-
"AgentTool": [
|
| 16 |
-
{"user": "Create an agent to help with testing", "params": {"name": "test_agent", "goal": "Write unit tests"}, "result": "Agent 'test_agent' created"},
|
| 17 |
-
{"user": "Spawn a teammate to handle frontend tasks", "params": {"name": "frontend_dev", "skills": ["react", "typescript"]}, "result": "Teammate 'frontend_dev' added to team"}
|
| 18 |
-
],
|
| 19 |
-
"AskUserQuestionTool": [
|
| 20 |
-
{"user": "Ask the user which framework they prefer", "params": {"question": "Which framework do you want to use: React, Vue, or Angular?"}, "result": "User responded: React"},
|
| 21 |
-
{"user": "I need clarification on the requirements", "params": {"question": "Should the API be REST or GraphQL?"}, "result": "User answered: REST"}
|
| 22 |
-
],
|
| 23 |
-
"BashTool": [
|
| 24 |
-
{"user": "Run tests", "params": {"command": "npm test"}, "result": "PASS src/index.test.js\nTests: 12 passed, 0 failed"},
|
| 25 |
-
{"user": "Check git status", "params": {"command": "git status"}, "result": "On branch main\nModified: src/index.js"},
|
| 26 |
-
{"user": "Install dependencies", "params": {"command": "pip install -r requirements.txt"}, "result": "Successfully installed Flask==2.0.0"}
|
| 27 |
-
],
|
| 28 |
-
"BriefTool": [
|
| 29 |
-
{"user": "Give me a brief on this project", "params": {"topic": "project_overview"}, "result": "This is a voice-enabled AI coding assistant built on Qwen2.5-Coder-32B"},
|
| 30 |
-
{"user": "Brief me on the architecture", "params": {"topic": "architecture"}, "result": "Stack 2.9 uses vLLM for inference with LoRA fine-tuning"}
|
| 31 |
-
],
|
| 32 |
-
"ConfigTool": [
|
| 33 |
-
{"user": "Show configuration", "params": {"section": "model"}, "result": "model: Qwen2.5-Coder-32B\ncontext: 32768"},
|
| 34 |
-
{"user": "Get settings", "params": {"key": "max_tokens"}, "result": "max_tokens: 4000"}
|
| 35 |
-
],
|
| 36 |
-
"EnterPlanModeTool": [
|
| 37 |
-
{"user": "Enter plan mode", "params": {"goal": "Refactor authentication module"}, "result": "Plan mode activated for: Refactor authentication module"},
|
| 38 |
-
],
|
| 39 |
-
"EnterWorktreeTool": [
|
| 40 |
-
{"user": "Create a worktree for feature branch", "params": {"branch": "feature/new-ui"}, "result": "Worktree created at .worktrees/feature_new_ui"},
|
| 41 |
-
],
|
| 42 |
-
"ExitWorktreeTool": [
|
| 43 |
-
{"user": "Exit current worktree", "params": {}, "result": "Exited worktree, returning to main"},
|
| 44 |
-
],
|
| 45 |
-
"FileEditTool": [
|
| 46 |
-
{"user": "Fix syntax error in main.py line 10", "params": {"file_path": "src/main.py", "old_string": "prin('hello')", "new_string": "print('hello')"}, "result": "File edited successfully"},
|
| 47 |
-
{"user": "Add import statement to app.py", "params": {"file_path": "app.py", "insert_after": "import os", "new_string": "import sys"}, "result": "Import added"},
|
| 48 |
-
],
|
| 49 |
-
"FileReadTool": [
|
| 50 |
-
{"user": "Read package.json", "params": {"file_path": "package.json"}, "result": "{\n \"name\": \"myapp\",\n \"version\": \"1.0.0\"\n}"},
|
| 51 |
-
{"user": "Show me README.md", "params": {"file_path": "README.md"}, "result": "# My Project\n\nDescription here..."},
|
| 52 |
-
],
|
| 53 |
-
"FileWriteTool": [
|
| 54 |
-
{"user": "Create a new file utils.py", "params": {"file_path": "src/utils.py", "content": "def helper():\n return 'help'"}, "result": "File src/utils.py created"},
|
| 55 |
-
],
|
| 56 |
-
"GlobTool": [
|
| 57 |
-
{"user": "Find all Python files", "params": {"pattern": "**/*.py"}, "result": "Found 15 files:\nsrc/main.py\nsrc/utils.py\ntests/test_main.py"},
|
| 58 |
-
{"user": "List test files", "params": {"pattern": "**/*.test.js"}, "result": "Found 3 files:\ntests/unit.test.js\ntests/integration.test.js"},
|
| 59 |
-
],
|
| 60 |
-
"GrepTool": [
|
| 61 |
-
{"user": "Search for 'TODO' comments", "params": {"pattern": "TODO"}, "result": "src/main.py:15:# TODO: implement error handling\nsrc/utils.py:42:# TODO: add validation"},
|
| 62 |
-
{"user": "Find all console.log statements", "params": {"pattern": "console.log"}, "result": "src/index.js:10:console.log('debug')\nsrc/app.js:25:console.log('start')"},
|
| 63 |
-
],
|
| 64 |
-
"LSPTool": [
|
| 65 |
-
{"user": "Get definition of function calculateTotal", "params": {"file_path": "src/math.js", "line": 10, "character": 15}, "result": "Definition at src/math.js:20-30\nfunction calculateTotal(items) {...}"},
|
| 66 |
-
{"user": "Find references of MyClass", "params": {"file_path": "src/MyClass.ts", "line": 5, "character": 10}, "result": "References:\n- src/main.ts:15\n- tests/MyClass.test.ts:8"},
|
| 67 |
-
],
|
| 68 |
-
"ListMcpResourcesTool": [
|
| 69 |
-
{"user": "List available MCP resources", "params": {}, "result": "Resources:\n- server1.file_system\n- server2.database\n- server3.api"},
|
| 70 |
-
],
|
| 71 |
-
"MCPTool": [
|
| 72 |
-
{"user": "Connect to GitHub MCP server", "params": {"server_name": "github"}, "result": "Connected to GitHub MCP server"},
|
| 73 |
-
],
|
| 74 |
-
"NotebookEditTool": [
|
| 75 |
-
{"user": "Add markdown cell to notebook", "params": {"notebook_path": "analysis.ipynb", "cell_index": 0, "cell_type": "markdown", "content": "# Analysis"}, "result": "Cell added"},
|
| 76 |
-
],
|
| 77 |
-
"ReadMcpResourceTool": [
|
| 78 |
-
{"user": "Read resource file from MCP", "params": {"uri": "mcp://server1/file.txt"}, "result": "File content here..."},
|
| 79 |
-
],
|
| 80 |
-
"RemoteTriggerTool": [
|
| 81 |
-
{"user": "Trigger deployment on staging", "params": {"target": "staging-server", "action": "deploy"}, "result": "Deployment triggered, build ID: 12345"},
|
| 82 |
-
],
|
| 83 |
-
"SendMessageTool": [
|
| 84 |
-
{"user": "Message the design team about the mockups", "params": {"to": "design-team", "subject": "Mockups ready", "body": "Please review the new mockups in Figma"}, "result": "Message sent to design-team"},
|
| 85 |
-
],
|
| 86 |
-
"SkillTool": [
|
| 87 |
-
{"user": "Run code review skill", "params": {"skill": "code-review", "inputs": {"code": "function foo() { return 1; }"}}, "result": "Review: Use strict equality, add JSDoc"},
|
| 88 |
-
],
|
| 89 |
-
"TaskCreateTool": [
|
| 90 |
-
{"user": "Create task: Fix login bug", "params": {"title": "Fix login bug", "description": "Users can't log in with valid credentials"}, "result": "Task #123 created"},
|
| 91 |
-
],
|
| 92 |
-
"TaskGetTool": [
|
| 93 |
-
{"user": "Get details of task 123", "params": {"task_id": 123}, "result": "Task #123: Fix login bug\nStatus: in progress\nAssignee: @dev"},
|
| 94 |
-
],
|
| 95 |
-
"TaskListTool": [
|
| 96 |
-
{"user": "List all tasks", "params": {"status": "in_progress"}, "result": "Tasks:\n#123 Fix login bug\n#124 Update docs"},
|
| 97 |
-
],
|
| 98 |
-
"TaskStopTool": [
|
| 99 |
-
{"user": "Stop task 123", "params": {"task_id": 123}, "result": "Task #123 stopped"},
|
| 100 |
-
],
|
| 101 |
-
"TaskUpdateTool": [
|
| 102 |
-
{"user": "Mark task 123 as complete", "params": {"task_id": 123, "status": "completed"}, "result": "Task #123 marked complete"},
|
| 103 |
-
],
|
| 104 |
-
"TeamCreateTool": [
|
| 105 |
-
{"user": "Create a team for backend devs", "params": {"team_name": "backend", "members": ["@alice", "@bob"]}, "result": "Team 'backend' created with 2 members"},
|
| 106 |
-
],
|
| 107 |
-
"TeamDeleteTool": [
|
| 108 |
-
{"user": "Delete the temp team", "params": {"team_name": "temp"}, "result": "Team 'temp' deleted"},
|
| 109 |
-
],
|
| 110 |
-
"TodoWriteTool": [
|
| 111 |
-
{"user": "Add todo: update documentation", "params": {"text": "Update API documentation"}, "result": "Todo added"},
|
| 112 |
-
],
|
| 113 |
-
"ToolSearchTool": [
|
| 114 |
-
{"user": "Search for file search tools", "params": {"query": "find files"}, "result": "Found: GlobTool, GrepTool"},
|
| 115 |
-
],
|
| 116 |
-
"WebFetchTool": [
|
| 117 |
-
{"user": "Fetch the OpenRouter API docs", "params": {"url": "https://openrouter.ai/docs"}, "result": "Fetched 15KB from openrouter.ai/docs"},
|
| 118 |
-
],
|
| 119 |
-
"WebSearchTool": [
|
| 120 |
-
{"user": "Search for 'Node.js best practices 2024'", "params": {"query": "Node.js best practices 2024"}, "result": "Top results:\n1. Node.js Design Patterns\n2. 2024 Node.js Best Practices Guide"},
|
| 121 |
-
]
|
| 122 |
-
}
|
| 123 |
-
|
| 124 |
-
# Realistic value pools
|
| 125 |
-
FILE_PATHS = ["src/main.py", "src/utils.js", "README.md", "package.json", "config.yaml",
|
| 126 |
-
"Dockerfile", "requirements.txt", "tests/test_api.py", "src/components/Button.tsx",
|
| 127 |
-
"lib/helpers.py", "app/models.py", "src/index.js", "Makefile"]
|
| 128 |
-
COMMANDS = ["npm test", "pytest", "make build", "git status", "ls -la",
|
| 129 |
-
"python -m pip install -r requirements.txt", "docker ps", "npm run lint"]
|
| 130 |
-
PATTERNS = ["**/*.py", "**/*.js", "**/*.ts", "*.md", "**/*.test.js", "**/__tests__/**/*.py"]
|
| 131 |
-
QUESTIONS = ["Which framework should we use?", "Is this a bug or a feature?",
|
| 132 |
-
"What's the priority of this task?", "Should we refactor or rewrite?"]
|
| 133 |
-
|
| 134 |
-
def fill_placeholders(text: str, params: Dict[str, Any]) -> str:
|
| 135 |
-
"""Replace all {key} placeholders in text with values from params."""
|
| 136 |
-
for key, value in params.items():
|
| 137 |
-
placeholder = f"{{{key}}}"
|
| 138 |
-
if placeholder in text:
|
| 139 |
-
text = text.replace(placeholder, str(value))
|
| 140 |
-
return text
|
| 141 |
-
|
| 142 |
-
def generate_variations(template: Dict[str, Any], count: int, tool_name: str) -> List[Dict[str, Any]]:
|
| 143 |
-
"""Generate multiple realistic variations of a template."""
|
| 144 |
-
examples = []
|
| 145 |
-
|
| 146 |
-
for i in range(count):
|
| 147 |
-
params = {}
|
| 148 |
-
user_text = template["user"]
|
| 149 |
-
result_text = template["result"]
|
| 150 |
-
|
| 151 |
-
# Build params by scanning template for placeholders
|
| 152 |
-
template_str = user_text + json.dumps(template.get("params", {})) + result_text
|
| 153 |
-
|
| 154 |
-
# Determine what placeholders exist
|
| 155 |
-
for key, default_val in template.get("params", {}).items():
|
| 156 |
-
if isinstance(default_val, str) and ("{" + key + "}") in template_str:
|
| 157 |
-
# This is a placeholder - generate dynamic value
|
| 158 |
-
if key == "file_path":
|
| 159 |
-
params[key] = random.choice(FILE_PATHS)
|
| 160 |
-
elif key == "command":
|
| 161 |
-
params[key] = random.choice(COMMANDS)
|
| 162 |
-
elif key == "pattern":
|
| 163 |
-
params[key] = random.choice(PATTERNS)
|
| 164 |
-
elif key == "question":
|
| 165 |
-
params[key] = random.choice(QUESTIONS)
|
| 166 |
-
elif key == "topic":
|
| 167 |
-
params[key] = random.choice(["project_overview", "architecture", "team", "timeline"])
|
| 168 |
-
elif key == "branch":
|
| 169 |
-
params[key] = random.choice(["feature/new-ui", "bugfix/login", "hotfix/security"])
|
| 170 |
-
elif key == "name":
|
| 171 |
-
params[key] = random.choice(["test_agent", "code_reviewer", "deployment_bot", "test_suite"])
|
| 172 |
-
elif key == "goal":
|
| 173 |
-
params[key] = random.choice(["Write unit tests", "Refactor legacy code", "Add documentation"])
|
| 174 |
-
elif key == "to":
|
| 175 |
-
params[key] = random.choice(["team-backend", "design-team", "product-team"])
|
| 176 |
-
elif key == "subject":
|
| 177 |
-
params[key] = random.choice(["Review needed", "Update available", "Deployment status"])
|
| 178 |
-
elif key == "body":
|
| 179 |
-
params[key] = "Please review the attached documents."
|
| 180 |
-
elif key == "server_name":
|
| 181 |
-
params[key] = random.choice(["github", "jira", "slack", "postgres"])
|
| 182 |
-
elif key == "action":
|
| 183 |
-
params[key] = random.choice(["deploy", "restart", "backup", "migrate"])
|
| 184 |
-
elif key == "target":
|
| 185 |
-
params[key] = random.choice(["staging", "production", "dev"])
|
| 186 |
-
elif key == "skill":
|
| 187 |
-
params[key] = random.choice(["code-review", "security-scan", "performance-test"])
|
| 188 |
-
elif key == "query":
|
| 189 |
-
params[key] = random.choice(["find files", "search code", "list todos"])
|
| 190 |
-
elif key == "url":
|
| 191 |
-
params[key] = random.choice(["https://api.example.com/docs", "https://github.com/repo"])
|
| 192 |
-
elif key == "task_id":
|
| 193 |
-
params[key] = random.randint(100, 999)
|
| 194 |
-
elif key == "title":
|
| 195 |
-
params[key] = random.choice(["Fix bug", "Add feature", "Update docs", "Refactor code"])
|
| 196 |
-
elif key == "description":
|
| 197 |
-
params[key] = "Detailed description of the task..."
|
| 198 |
-
elif key == "team_name":
|
| 199 |
-
params[key] = random.choice(["backend", "frontend", "devops", "qa"])
|
| 200 |
-
elif key == "members":
|
| 201 |
-
params[key] = ["@user1", "@user2"]
|
| 202 |
-
elif key == "status":
|
| 203 |
-
params[key] = random.choice(["in_progress", "completed", "todo"])
|
| 204 |
-
elif key == "text":
|
| 205 |
-
params[key] = "Sample todo item"
|
| 206 |
-
elif key == "cell_index":
|
| 207 |
-
params[key] = random.randint(0, 10)
|
| 208 |
-
elif key == "cell_type":
|
| 209 |
-
params[key] = random.choice(["code", "markdown"])
|
| 210 |
-
elif key == "content":
|
| 211 |
-
params[key] = random.choice(["print('hello')", "# TODO", "import React", "def main():\n pass"])
|
| 212 |
-
elif key == "uri":
|
| 213 |
-
params[key] = "mcp://server/resource"
|
| 214 |
-
elif key in ["line", "character"] and "file_path" in params:
|
| 215 |
-
params[key] = random.randint(1, 100) if key == "line" else random.randint(1, 50)
|
| 216 |
-
else:
|
| 217 |
-
# Generic placeholder
|
| 218 |
-
params[key] = f"value_{random.randint(1, 100)}"
|
| 219 |
-
else:
|
| 220 |
-
# Not a placeholder, use the static value as-is
|
| 221 |
-
params[key] = default_val
|
| 222 |
-
|
| 223 |
-
# Fill user prompt with params
|
| 224 |
-
user_prompt = fill_placeholders(user_text, params)
|
| 225 |
-
user_prompt = user_prompt.replace("{tool}", tool_name)
|
| 226 |
-
|
| 227 |
-
# Build tool input from params
|
| 228 |
-
tool_input = {}
|
| 229 |
-
for key, template_val in template.get("params", {}).items():
|
| 230 |
-
if key in params:
|
| 231 |
-
tool_input[key] = params[key]
|
| 232 |
-
else:
|
| 233 |
-
tool_input[key] = template_val
|
| 234 |
-
|
| 235 |
-
# Fill result
|
| 236 |
-
result = fill_placeholders(result_text, params)
|
| 237 |
-
|
| 238 |
-
# Build conversation
|
| 239 |
-
messages = [
|
| 240 |
-
{"role": "user", "content": user_prompt},
|
| 241 |
-
{
|
| 242 |
-
"role": "assistant",
|
| 243 |
-
"content": random.choice([
|
| 244 |
-
"I'll help with that.",
|
| 245 |
-
"Sure, let me do that.",
|
| 246 |
-
"Processing your request..."
|
| 247 |
-
]),
|
| 248 |
-
"tool_use": {
|
| 249 |
-
"name": tool_name,
|
| 250 |
-
"input": tool_input
|
| 251 |
-
}
|
| 252 |
-
},
|
| 253 |
-
{
|
| 254 |
-
"role": "user",
|
| 255 |
-
"content": "",
|
| 256 |
-
"tool_result": {
|
| 257 |
-
"tool_use_id": "tool_1",
|
| 258 |
-
"content": result
|
| 259 |
-
}
|
| 260 |
-
},
|
| 261 |
-
{"role": "assistant", "content": random.choice(["Done!", "Completed.", "All set!"])}
|
| 262 |
-
]
|
| 263 |
-
|
| 264 |
-
examples.append({
|
| 265 |
-
"messages": messages,
|
| 266 |
-
"source": "synthetic_template",
|
| 267 |
-
"tool": tool_name
|
| 268 |
-
})
|
| 269 |
-
|
| 270 |
-
return examples
|
| 271 |
-
|
| 272 |
-
def main():
|
| 273 |
-
parser = argparse.ArgumentParser()
|
| 274 |
-
parser.add_argument("--output", type=str, default="training-data/scaled/template_synthetic.jsonl")
|
| 275 |
-
parser.add_argument("--examples-per-tool", type=int, default=500)
|
| 276 |
-
parser.add_argument("--tools-limit", type=int, default=None)
|
| 277 |
-
args = parser.parse_args()
|
| 278 |
-
|
| 279 |
-
output_path = Path(args.output)
|
| 280 |
-
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 281 |
-
|
| 282 |
-
tools = list(TOOL_TEMPLATES.keys())
|
| 283 |
-
if args.tools_limit:
|
| 284 |
-
tools = tools[:args.tools_limit]
|
| 285 |
-
|
| 286 |
-
print(f"🔧 Generating synthetic examples for {len(tools)} tools")
|
| 287 |
-
print(f" Target: {args.examples_per_tool} per tool")
|
| 288 |
-
print(f" Total expected: ~{len(tools) * args.examples_per_tool}")
|
| 289 |
-
|
| 290 |
-
total_examples = 0
|
| 291 |
-
with open(output_path, 'w') as f:
|
| 292 |
-
for tool_name in tools:
|
| 293 |
-
templates = TOOL_TEMPLATES[tool_name]
|
| 294 |
-
ex_per_template = max(1, args.examples_per_tool // len(templates))
|
| 295 |
-
|
| 296 |
-
for template in templates:
|
| 297 |
-
examples = generate_variations(template, ex_per_template, tool_name)
|
| 298 |
-
for ex in examples:
|
| 299 |
-
f.write(json.dumps(ex) + "\n")
|
| 300 |
-
total_examples += 1
|
| 301 |
-
|
| 302 |
-
print(f"✅ {tool_name}: {ex_per_template * len(templates)} examples")
|
| 303 |
-
|
| 304 |
-
print(f"\n✨ Generated {total_examples} synthetic examples")
|
| 305 |
-
print(f" Saved to: {output_path}")
|
| 306 |
-
|
| 307 |
-
# Show sample
|
| 308 |
-
print("\n📝 Sample example:")
|
| 309 |
-
with open(output_path, 'r') as f:
|
| 310 |
-
sample = json.loads(f.readline())
|
| 311 |
-
print(f" Tool: {sample['tool']}")
|
| 312 |
-
print(f" User: {sample['messages'][0]['content'][:60]}...")
|
| 313 |
-
print(f" Assistant uses: {sample['messages'][1]['tool_use']['name']}")
|
| 314 |
-
|
| 315 |
-
if __name__ == "__main__":
|
| 316 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/generate_tool_data.py
DELETED
|
@@ -1,615 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Synthetic Tool-Calling Training Data Generator for Stack 2.9
|
| 4 |
-
Generates training examples in Qwen2.5-Coder format with tool_calls.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import json
|
| 8 |
-
import random
|
| 9 |
-
import argparse
|
| 10 |
-
from pathlib import Path
|
| 11 |
-
from typing import Dict, List, Any
|
| 12 |
-
from datetime import datetime
|
| 13 |
-
|
| 14 |
-
# ============================================================================
|
| 15 |
-
# Tool Definitions (Qwen2.5-Coder format)
|
| 16 |
-
# ============================================================================
|
| 17 |
-
|
| 18 |
-
TOOL_DEFINITIONS = [
|
| 19 |
-
{
|
| 20 |
-
"type": "function",
|
| 21 |
-
"function": {
|
| 22 |
-
"name": "Bash",
|
| 23 |
-
"description": "Execute bash commands in the terminal. Use for running shell commands, scripts, git operations, package managers, and system commands.",
|
| 24 |
-
"parameters": {
|
| 25 |
-
"type": "object",
|
| 26 |
-
"properties": {
|
| 27 |
-
"command": {
|
| 28 |
-
"type": "string",
|
| 29 |
-
"description": "The bash command to execute"
|
| 30 |
-
},
|
| 31 |
-
"timeout": {
|
| 32 |
-
"type": "integer",
|
| 33 |
-
"description": "Timeout in seconds (default: 30)"
|
| 34 |
-
}
|
| 35 |
-
},
|
| 36 |
-
"required": ["command"]
|
| 37 |
-
}
|
| 38 |
-
}
|
| 39 |
-
},
|
| 40 |
-
{
|
| 41 |
-
"type": "function",
|
| 42 |
-
"function": {
|
| 43 |
-
"name": "FileRead",
|
| 44 |
-
"description": "Read the contents of a file from the filesystem. Use for viewing source code, configuration files, documentation, or any text-based files.",
|
| 45 |
-
"parameters": {
|
| 46 |
-
"type": "object",
|
| 47 |
-
"properties": {
|
| 48 |
-
"path": {
|
| 49 |
-
"type": "string",
|
| 50 |
-
"description": "Path to the file to read"
|
| 51 |
-
},
|
| 52 |
-
"offset": {
|
| 53 |
-
"type": "integer",
|
| 54 |
-
"description": "Line number to start reading from (1-indexed)"
|
| 55 |
-
},
|
| 56 |
-
"limit": {
|
| 57 |
-
"type": "integer",
|
| 58 |
-
"description": "Maximum number of lines to read"
|
| 59 |
-
}
|
| 60 |
-
},
|
| 61 |
-
"required": ["path"]
|
| 62 |
-
}
|
| 63 |
-
}
|
| 64 |
-
},
|
| 65 |
-
{
|
| 66 |
-
"type": "function",
|
| 67 |
-
"function": {
|
| 68 |
-
"name": "FileWrite",
|
| 69 |
-
"description": "Create or overwrite a file with content. Use for creating new files, updating existing files, or writing code, configuration, or documentation.",
|
| 70 |
-
"parameters": {
|
| 71 |
-
"type": "object",
|
| 72 |
-
"properties": {
|
| 73 |
-
"path": {
|
| 74 |
-
"type": "string",
|
| 75 |
-
"description": "Path where the file should be created or written"
|
| 76 |
-
},
|
| 77 |
-
"content": {
|
| 78 |
-
"type": "string",
|
| 79 |
-
"description": "The content to write to the file"
|
| 80 |
-
},
|
| 81 |
-
"append": {
|
| 82 |
-
"type": "boolean",
|
| 83 |
-
"description": "Append to existing file instead of overwriting (default: false)"
|
| 84 |
-
}
|
| 85 |
-
},
|
| 86 |
-
"required": ["path", "content"]
|
| 87 |
-
}
|
| 88 |
-
}
|
| 89 |
-
},
|
| 90 |
-
{
|
| 91 |
-
"type": "function",
|
| 92 |
-
"function": {
|
| 93 |
-
"name": "WebSearch",
|
| 94 |
-
"description": "Search the web for information. Use for finding documentation, looking up error messages, researching libraries, or getting up-to-date information.",
|
| 95 |
-
"parameters": {
|
| 96 |
-
"type": "object",
|
| 97 |
-
"properties": {
|
| 98 |
-
"query": {
|
| 99 |
-
"type": "string",
|
| 100 |
-
"description": "The search query to look up on the web"
|
| 101 |
-
},
|
| 102 |
-
"count": {
|
| 103 |
-
"type": "integer",
|
| 104 |
-
"description": "Number of results to return (default: 5)"
|
| 105 |
-
}
|
| 106 |
-
},
|
| 107 |
-
"required": ["query"]
|
| 108 |
-
}
|
| 109 |
-
}
|
| 110 |
-
},
|
| 111 |
-
{
|
| 112 |
-
"type": "function",
|
| 113 |
-
"function": {
|
| 114 |
-
"name": "Grep",
|
| 115 |
-
"description": "Search for patterns in files. Use for finding specific code, function definitions, imports, TODO comments, error patterns, or any text across the codebase.",
|
| 116 |
-
"parameters": {
|
| 117 |
-
"type": "object",
|
| 118 |
-
"properties": {
|
| 119 |
-
"pattern": {
|
| 120 |
-
"type": "string",
|
| 121 |
-
"description": "The search pattern or regex to match"
|
| 122 |
-
},
|
| 123 |
-
"path": {
|
| 124 |
-
"type": "string",
|
| 125 |
-
"description": "Directory or file path to search in (default: current directory)"
|
| 126 |
-
},
|
| 127 |
-
"recursive": {
|
| 128 |
-
"type": "boolean",
|
| 129 |
-
"description": "Search recursively in subdirectories (default: true)"
|
| 130 |
-
},
|
| 131 |
-
"file_pattern": {
|
| 132 |
-
"type": "string",
|
| 133 |
-
"description": "File pattern to filter results (e.g., '*.py', '*.js')"
|
| 134 |
-
}
|
| 135 |
-
},
|
| 136 |
-
"required": ["pattern"]
|
| 137 |
-
}
|
| 138 |
-
}
|
| 139 |
-
}
|
| 140 |
-
]
|
| 141 |
-
|
| 142 |
-
# ============================================================================
|
| 143 |
-
# Template Data for Generation
|
| 144 |
-
# ============================================================================
|
| 145 |
-
|
| 146 |
-
FILE_PATHS = [
|
| 147 |
-
"src/main.py", "src/utils.py", "src/config.py", "src/models.py",
|
| 148 |
-
"src/api.py", "src/handlers.py", "src/middleware.py",
|
| 149 |
-
"tests/test_main.py", "tests/test_utils.py", "tests/conftest.py",
|
| 150 |
-
"README.md", "LICENSE", "package.json", "requirements.txt",
|
| 151 |
-
"config.yaml", "config.json", ".env.example",
|
| 152 |
-
"src/components/Button.tsx", "src/components/Header.jsx",
|
| 153 |
-
"src/styles.css", "src/index.js", "src/app.js",
|
| 154 |
-
"docs/API.md", "docs/ARCHITECTURE.md", "docs/CONTRIBUTING.md",
|
| 155 |
-
"scripts/setup.sh", "scripts/deploy.py", "Makefile"
|
| 156 |
-
]
|
| 157 |
-
|
| 158 |
-
CODE_SNIPPETS = {
|
| 159 |
-
"python": [
|
| 160 |
-
"def hello():\n print('Hello, World!')",
|
| 161 |
-
"class MyClass:\n def __init__(self):\n self.value = 42",
|
| 162 |
-
"import os\nos.path.join('a', 'b')",
|
| 163 |
-
"async def fetch_data():\n async with aiohttp.ClientSession() as session:\n return await session.get(url)",
|
| 164 |
-
],
|
| 165 |
-
"javascript": [
|
| 166 |
-
"const fetch = require('node-fetch');\nconst data = await fetch(url);",
|
| 167 |
-
"function handleClick() {\n setCount(count + 1);\n}",
|
| 168 |
-
"export default function App() {\n return <div>Hello</div>;\n}",
|
| 169 |
-
"const [state, setState] = useState(null);",
|
| 170 |
-
],
|
| 171 |
-
"bash": [
|
| 172 |
-
"npm install",
|
| 173 |
-
"git status",
|
| 174 |
-
"pytest -v",
|
| 175 |
-
"python -m pytest tests/",
|
| 176 |
-
"make build",
|
| 177 |
-
"docker build -t myapp .",
|
| 178 |
-
"ls -la",
|
| 179 |
-
"curl -X GET https://api.example.com",
|
| 180 |
-
]
|
| 181 |
-
}
|
| 182 |
-
|
| 183 |
-
WEB_SEARCH_QUERIES = [
|
| 184 |
-
"python async await best practices",
|
| 185 |
-
"javascript array methods map filter reduce",
|
| 186 |
-
"TypeScript generics tutorial",
|
| 187 |
-
"React hooks useEffect dependency array",
|
| 188 |
-
"Node.js error handling best practices",
|
| 189 |
-
"Docker vs Kubernetes differences",
|
| 190 |
-
"Git rebase vs merge",
|
| 191 |
-
"SQL join types explained",
|
| 192 |
-
"REST API design principles",
|
| 193 |
-
"Python list comprehension examples",
|
| 194 |
-
"JavaScript promise async await",
|
| 195 |
-
"CSS flexbox vs grid",
|
| 196 |
-
"MongoDB vs PostgreSQL",
|
| 197 |
-
"AWS Lambda cold start optimization",
|
| 198 |
-
"Docker compose tutorial",
|
| 199 |
-
]
|
| 200 |
-
|
| 201 |
-
# (search pattern, file glob, human-readable description) triples used by
# vary_grep_pattern() to randomize Grep scenarios. The description is
# informational only and is discarded during augmentation.
GREP_PATTERNS = [
    ("TODO", "*.py", "Find TODO comments in Python files"),
    ("function ", "*.js", "Find function definitions in JS"),
    ("import ", "*.py", "Find imports in Python"),
    ("class ", "*.ts", "Find class definitions in TypeScript"),
    ("def ", "*.py", "Find function definitions in Python"),
    ("const ", "*.js", "Find const declarations in JS"),
    ("interface ", "*.ts", "Find interface definitions"),
    ("#pragma", "*.c", "Find pragma directives in C"),
    ("SELECT ", "*.sql", "Find SQL queries"),
    ("TODO", "*.md", "Find TODOs in documentation"),
]
|
| 213 |
-
|
| 214 |
-
# ============================================================================
|
| 215 |
-
# Scenario Templates
|
| 216 |
-
# ============================================================================
|
| 217 |
-
|
| 218 |
-
# Hand-written seed scenarios per tool. Each scenario is a complete
# user -> tool_call -> tool result -> assistant response exchange, and is
# turned into one training example by generate_example(). Fixed here: the
# second Grep scenario's response cited src/handlers.js:15 for
# handleSubmit while its own result said :25, and the "make build" result
# contained the typo "obuild successful!".
SCENARIOS = {
    "FileRead": [
        {
            "user": "Read the file at src/main.py and show me what's in it",
            "tool_call": {"name": "FileRead", "arguments": {"path": "src/main.py"}},
            "result": "Successfully read file: src/main.py\n```python\ndef main():\n print('Hello from Stack 2.9!')\n\nif __name__ == '__main__':\n main()\n```",
            "response": "Here's the contents of `src/main.py`:\n\n```python\ndef main():\n print('Hello from Stack 2.9!')\n\nif __name__ == '__main__':\n main()\n```\n\nThis is a simple Python script that prints a greeting when executed directly."
        },
        {
            "user": "Can you show me the README.md file?",
            "tool_call": {"name": "FileRead", "arguments": {"path": "README.md"}},
            "result": "Successfully read file: README.md\n```markdown\n# My Project\n\nA sample project for Stack 2.9.\n\n## Installation\n\npip install -r requirements.txt\n```",
            "response": "Here's the README.md:\n\n```markdown\n# My Project\n\nA sample project for Stack 2.9.\n\n## Installation\n\npip install -r requirements.txt\n```"
        },
        {
            "user": "What's in the config.yaml file?",
            "tool_call": {"name": "FileRead", "arguments": {"path": "config.yaml"}},
            "result": "Successfully read file: config.yaml\n```yaml\ndatabase:\n host: localhost\n port: 5432\n name: myapp\n\napp:\n port: 3000\n debug: true\n```",
            "response": "The config.yaml contains:\n\n```yaml\ndatabase:\n host: localhost\n port: 5432\n name: myapp\n\napp:\n port: 3000\n debug: true\n```\n\nIt has database connection settings and app configuration."
        },
        {
            "user": "Check the package.json to see the dependencies",
            "tool_call": {"name": "FileRead", "arguments": {"path": "package.json"}},
            "result": "Successfully read file: package.json\n```json\n{\n \"name\": \"my-app\",\n \"version\": \"1.0.0\",\n \"dependencies\": {\n \"express\": \"^4.18.0\",\n \"react\": \"^18.2.0\"\n }\n}\n```",
            "response": "The package.json shows these dependencies:\n\n```json\n{\n \"name\": \"my-app\",\n \"version\": \"1.0.0\",\n \"dependencies\": {\n \"express\": \"^4.18.0\",\n \"react\": \"^18.2.0\"\n }\n}\n```"
        },
    ],
    "FileWrite": [
        {
            "user": "Create a new file called hello.py with a hello world function",
            "tool_call": {"name": "FileWrite", "arguments": {"path": "hello.py", "content": "def hello():\n print('Hello, World!')\n\nif __name__ == '__main__':\n hello()\n"}},
            "result": "Successfully wrote file: hello.py",
            "response": "Created `hello.py` with a hello world function:\n\n```python\ndef hello():\n print('Hello, World!')\n\nif __name__ == '__main__':\n hello()\n```\n\nYou can run it with `python hello.py`."
        },
        {
            "user": "Write a simple React component to src/components/Button.jsx",
            "tool_call": {"name": "FileWrite", "arguments": {"path": "src/components/Button.jsx", "content": "import React from 'react';\n\nexport default function Button({ onClick, children }) {\n return (\n <button onClick={onClick} className=\"btn\">\n {children}\n </button>\n );\n}\n"}},
            "result": "Successfully wrote file: src/components/Button.jsx",
            "response": "Created `src/components/Button.jsx`:\n\n```jsx\nimport React from 'react';\n\nexport default function Button({ onClick, children }) {\n return (\n <button onClick={onClick} className=\"btn\">\n {children}\n </button>\n );\n}\n```"
        },
        {
            "user": "Create a config.json file with some settings",
            "tool_call": {"name": "FileWrite", "arguments": {"path": "config.json", "content": "{\n \"app\": {\n \"name\": \"MyApp\",\n \"version\": \"1.0.0\"\n },\n \"server\": {\n \"port\": 8080,\n \"host\": \"localhost\"\n }\n}\n"}},
            "result": "Successfully wrote file: config.json",
            "response": "Created `config.json`:\n\n```json\n{\n \"app\": {\n \"name\": \"MyApp\",\n \"version\": \"1.0.0\"\n },\n \"server\": {\n \"port\": 8080,\n \"host\": \"localhost\"\n }\n}\n```"
        },
    ],
    "Bash": [
        {
            "user": "Run npm install to install dependencies",
            "tool_call": {"name": "Bash", "arguments": {"command": "npm install"}},
            "result": "added 245 packages in 3.5s",
            "response": "Dependencies installed successfully! Added 245 packages."
        },
        {
            "user": "Check git status",
            "tool_call": {"name": "Bash", "arguments": {"command": "git status"}},
            "result": "On branch main\nYour branch is up to date with 'origin/main'.\n\nChanges not staged for commit:\n modified: README.md\n modified: src/main.py\n\nno changes added to commit (but untracked files present)",
            "response": "Git status shows:\n- Modified: README.md, src/main.py (not staged)\n- Untracked files present"
        },
        {
            "user": "Run the tests with pytest",
            "tool_call": {"name": "Bash", "arguments": {"command": "pytest -v"}},
            "result": "tests/test_main.py::test_hello PASSED\ntests/test_utils.py::test_add PASSED\n\n2 passed in 0.5s",
            "response": "All tests passed!\n- `tests/test_main.py::test_hello` ✓\n- `tests/test_utils.py::test_add` ✓"
        },
        {
            "user": "List all files in the current directory",
            "tool_call": {"name": "Bash", "arguments": {"command": "ls -la"}},
            "result": "total 64\ndrwxr-xr-x 12 user staff 384 Apr 6 00:58 .\ndrwxr-xr-x 9 user staff 288 Apr 6 00:50 ..\n-rw-r--r-- 1 user staff 220 Apr 6 00:50 README.md\ndrwxr-xr-x 3 user staff 96 Apr 6 00:51 src\ndrwxr-xr-x 3 user staff 96 Apr 6 00:51 tests\n...",
            "response": "Files in current directory:\n- README.md\n- src/\n- tests/\n- And more (permissions, dates shown)"
        },
        {
            "user": "Build the project with make",
            "tool_call": {"name": "Bash", "arguments": {"command": "make build"}},
            # Fixed typo: was "obuild successful!"
            "result": "Building...\ngcc -o main main.c\nBuild successful!",
            "response": "Build completed successfully!"
        },
    ],
    "WebSearch": [
        {
            "user": "How do I use async/await in Python?",
            "tool_call": {"name": "WebSearch", "arguments": {"query": "python async await best practices", "count": 5}},
            "result": "Search results for 'python async await best practices':\n1. Real Python - Async IO in Python: A Complete Walkthrough\n2. Python Docs - asyncio - Asynchronous I/O\n3. Towards Data Science - Mastering Python Async/Await\n4. Stack Overflow - Common async/await mistakes\n5. Medium - Async/Await Patterns in Python",
            "response": "Here are some resources on Python async/await:\n\n1. **Real Python** - Async IO in Python: A Complete Walkthrough\n2. **Python Docs** - asyncio - Asynchronous I/O\n3. **Towards Data Science** - Mastering Python Async/Await\n\nKey points:\n- Use `async def` to define async functions\n- Use `await` to call other async functions\n- `asyncio.run()` to execute the main coroutine"
        },
        {
            "user": "What's the difference between Docker and Kubernetes?",
            "tool_call": {"name": "WebSearch", "arguments": {"query": "Docker vs Kubernetes differences", "count": 5}},
            "result": "Search results for 'Docker vs Kubernetes differences':\n1. Docker Blog - Containers 101\n2. Kubernetes.io - What is Kubernetes\n3. Red Hat - Docker vs Kubernetes explained\n4. DevOps.com - Container orchestration comparison\n5. Container Journal - When to use each",
            "response": "**Docker** is a container platform - it packages applications into containers.\n\n**Kubernetes** is a container orchestrator - it manages multiple containers across machines.\n\nThink of Docker as individual shipping containers, and Kubernetes as the port management system that coordinates all of them."
        },
        {
            "user": "How do I use React hooks properly?",
            "tool_call": {"name": "WebSearch", "arguments": {"query": "React hooks useEffect dependency array", "count": 5}},
            "result": "Search results for 'React hooks useEffect dependency array':\n1. React Docs - Synchronizing with Effects\n2. Kent C. Dodds - A Complete Guide to useEffect\n3. Robin Wieruch - React useEffect Hook Tutorial\n4. Stack Overflow - Common useEffect mistakes\n5. LogRocket - useEffect dependency array deep dive",
            "response": "**React Hooks Guide:**\n\n1. **useState** - For state management\n2. **useEffect** - For side effects, takes dependency array\n3. **useContext** - For global state\n\nThe dependency array controls when the effect runs. Empty `[]` = once on mount. Include variables to re-run when they change."
        },
    ],
    "Grep": [
        {
            "user": "Find all TODO comments in the Python files",
            "tool_call": {"name": "Grep", "arguments": {"pattern": "TODO", "path": ".", "recursive": True, "file_pattern": "*.py"}},
            "result": "Found 3 matches:\nsrc/main.py:14: # TODO: Add error handling\nsrc/utils.py:22: # TODO: Implement caching\ntests/test_main.py:8: # TODO: Add more tests",
            "response": "Found 3 TODO comments in Python files:\n- `src/main.py:14` - Add error handling\n- `src/utils.py:22` - Implement caching\n- `tests/test_main.py:8` - Add more tests"
        },
        {
            "user": "Search for function definitions in the JavaScript files",
            "tool_call": {"name": "Grep", "arguments": {"pattern": "function ", "path": "src", "recursive": True, "file_pattern": "*.js"}},
            "result": "Found 5 matches:\nsrc/index.js:3: function setup() {\nsrc/handlers.js:12: function handleClick(event) {\nsrc/handlers.js:25: function handleSubmit(form) {\nsrc/utils.js:8: function formatDate(date) {\nsrc/utils.js:15: function parseJSON(str) {",
            # Fixed: response previously said handlers.js:15 for handleSubmit,
            # contradicting the result above (which says :25).
            "response": "Found 5 function definitions in src/:\n- `src/index.js:3` - setup()\n- `src/handlers.js:12` - handleClick(event)\n- `src/handlers.js:25` - handleSubmit(form)\n- `src/utils.js:8` - formatDate(date)\n- `src/utils.js:15` - parseJSON(str)"
        },
        {
            "user": "Find all imports in the Python codebase",
            "tool_call": {"name": "Grep", "arguments": {"pattern": "import ", "path": ".", "recursive": True, "file_pattern": "*.py"}},
            "result": "Found 12 matches:\nsrc/main.py:1: import os\nsrc/main.py:2: import sys\nsrc/main.py:3: import json\nsrc/utils.py:1: from typing import Dict\nsrc/api.py:5: import requests\n...",
            "response": "Found 12 import statements across Python files. Main imports:\n- `os`, `sys`, `json` in src/main.py\n- `typing.Dict` in src/utils.py\n- `requests` in src/api.py"
        },
        {
            "user": "Look for class definitions in TypeScript files",
            "tool_call": {"name": "Grep", "arguments": {"pattern": "class ", "path": "src", "recursive": True, "file_pattern": "*.ts"}},
            "result": "Found 2 matches:\nsrc/models/User.ts:3: class User {\nsrc/models/Config.ts:8: class Config {",
            "response": "Found 2 class definitions:\n- `src/models/User.ts:3` - User class\n- `src/models/Config.ts:8` - Config class"
        },
    ],
}
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
# ============================================================================
|
| 347 |
-
# Data Augmentation Helpers
|
| 348 |
-
# ============================================================================
|
| 349 |
-
|
| 350 |
-
def vary_file_paths(template: Dict) -> Dict:
    """Return a deep copy of *template* with its file path randomized.

    Finds the first FILE_PATHS entry mentioned in the user message, then
    substitutes one randomly chosen replacement path consistently into both
    the user message and the tool-call "path" argument. The input template
    is never mutated.

    Bug fixed: the original drew two independent random paths, so the user
    text and the tool-call argument usually referenced different files.
    """
    import copy
    result = copy.deepcopy(template)

    if "path" in result["tool_call"]["arguments"]:
        for old_path in FILE_PATHS:
            if old_path in result["user"]:
                # Single draw keeps the user text and arguments consistent.
                new_path = random.choice(FILE_PATHS)
                result["user"] = result["user"].replace(old_path, new_path)
                result["tool_call"]["arguments"]["path"] = new_path
                break

    return result
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
def vary_content(template: Dict) -> Dict:
    """Return a deep copy of *template* with randomized file content.

    If the tool call carries a "content" argument, it is replaced by a
    random snippet from CODE_SNIPPETS in a randomly chosen language.
    The input template is never mutated.
    """
    import copy
    clone = copy.deepcopy(template)

    arguments = clone["tool_call"]["arguments"]
    if "content" in arguments:
        # Pick a language first, then a snippet within it.
        language = random.choice(["python", "javascript"])
        arguments["content"] = random.choice(CODE_SNIPPETS[language])

    return clone
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
def vary_bash_command(template: Dict) -> Dict:
    """Return a deep copy of *template* with its bash command randomized.

    The replacement is drawn from the command family whose keyword appears
    in the original command's first token (checked in order: npm, git,
    pytest, ls, make); anything else falls back to harmless generic
    commands. The input template is never mutated.
    """
    import copy
    clone = copy.deepcopy(template)

    arguments = clone["tool_call"]["arguments"]
    if "command" in arguments:
        current = arguments["command"]
        first_token = current.split()[0] if current else ""

        # (keyword, candidate replacements) checked in original order.
        families = (
            ("npm", ["npm install", "npm run build", "npm test", "npm start"]),
            ("git", ["git status", "git log --oneline -5", "git diff", "git branch -a"]),
            ("pytest", ["pytest -v", "pytest tests/", "pytest -xvs", "pytest --cov"]),
            ("ls", ["ls -la", "ls -1", "ls -lah"]),
            ("make", ["make build", "make clean", "make test", "make install"]),
        )

        candidates = ["echo 'hello'", "pwd", "whoami", "date"]
        for keyword, pool in families:
            if keyword in first_token:
                candidates = pool
                break

        arguments["command"] = random.choice(candidates)

    return clone
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
def vary_search_query(template: Dict) -> Dict:
    """Return a deep copy of *template* with a random web-search query.

    If the tool call carries a "query" argument, it is replaced by a random
    entry from WEB_SEARCH_QUERIES. The input template is never mutated.
    """
    import copy
    clone = copy.deepcopy(template)

    arguments = clone["tool_call"]["arguments"]
    if "query" in arguments:
        arguments["query"] = random.choice(WEB_SEARCH_QUERIES)

    return clone
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
def vary_grep_pattern(template: Dict) -> Dict:
    """Return a deep copy of *template* with a random grep pattern applied.

    Draws one (pattern, file_pattern, description) triple from GREP_PATTERNS
    and applies the first two to the tool-call arguments; the description is
    discarded. The input template is never mutated.

    Robustness fix: like the other vary_* helpers, this now only modifies
    templates that actually carry a "pattern" argument, so non-Grep
    templates pass through unchanged instead of gaining unexpected keys.
    """
    import copy
    result = copy.deepcopy(template)

    arguments = result["tool_call"]["arguments"]
    if "pattern" in arguments:
        pattern, file_pattern, _ = random.choice(GREP_PATTERNS)
        arguments["pattern"] = pattern
        arguments["file_pattern"] = file_pattern

    return result
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
# ============================================================================
|
| 430 |
-
# Main Generation Functions
|
| 431 |
-
# ============================================================================
|
| 432 |
-
|
| 433 |
-
def create_tool_call_message(tool_call: Dict, tool_call_id: str) -> Dict:
    """Build an assistant message carrying a single tool call (Qwen format).

    The tool arguments are JSON-encoded into a string, as the chat format
    expects; "content" is None because the assistant emits only the call.
    """
    call_entry = {
        "id": tool_call_id,
        "type": "function",
        "function": {
            "name": tool_call["name"],
            "arguments": json.dumps(tool_call["arguments"]),
        },
    }
    return {"role": "assistant", "content": None, "tool_calls": [call_entry]}
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
def create_tool_message(role: str, tool_call_id: str, tool_name: str, result: str) -> Dict:
    """Build a tool-result message that links back to its originating call.

    *role* is typically "tool"; *result* is the raw tool output string.
    """
    message = {"role": role}
    message["content"] = result
    message["tool_call_id"] = tool_call_id
    message["name"] = tool_name
    return message
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
def generate_example(scenario: Dict, system_prompt: str = None) -> Dict:
    """Generate a single training example in Qwen2.5-Coder format.

    Builds the five-message exchange (system, user, assistant tool call,
    tool result, assistant answer) from a SCENARIOS-style dict and attaches
    the shared TOOL_DEFINITIONS.

    Bug fixed: the tool_call_id was formatted as f"call_${...}" — a
    JavaScript-template leftover that put a literal '$' in every id.
    """
    if system_prompt is None:
        system_prompt = "You are a helpful AI assistant that can use tools to help users solve problems. When you need to perform actions like reading files, running commands, searching the web, or searching code, use the appropriate tool."

    tool_call_id = f"call_{random.randint(1000, 9999)}"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": scenario["user"]},
        create_tool_call_message(scenario["tool_call"], tool_call_id),
        create_tool_message("tool", tool_call_id, scenario["tool_call"]["name"], scenario["result"]),
        {"role": "assistant", "content": scenario["response"]}
    ]

    return {
        "messages": messages,
        "tools": TOOL_DEFINITIONS
    }
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
def augment_scenario(scenario: Dict, tool_name: str) -> Dict:
    """Apply the tool-appropriate random augmentation(s) to *scenario*.

    Dispatches on *tool_name*: FileRead varies paths; FileWrite varies both
    paths and content; Bash, WebSearch and Grep vary their respective
    arguments. Unknown tool names return a shallow copy unchanged (the
    vary_* helpers deep-copy internally, so the input is never mutated).

    Cleanup: removed a function-local ``import random`` that was never used.
    """
    augmented = scenario.copy()

    if tool_name == "FileRead":
        augmented = vary_file_paths(augmented)
    elif tool_name == "FileWrite":
        augmented = vary_file_paths(augmented)
        augmented = vary_content(augmented)
    elif tool_name == "Bash":
        augmented = vary_bash_command(augmented)
    elif tool_name == "WebSearch":
        augmented = vary_search_query(augmented)
    elif tool_name == "Grep":
        augmented = vary_grep_pattern(augmented)

    return augmented
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
def generate_dataset(num_examples: int = 1000, output_path: str = None) -> List[Dict]:
    """Generate the complete dataset of tool-calling training examples.

    Examples are balanced across the tools in SCENARIOS (the remainder is
    spread over the first tools). Each tool cycles through its base
    scenarios; once they are exhausted, augmented variants are produced.
    The final list is shuffled. *output_path* is accepted for interface
    compatibility but not used here.
    """
    tool_names = list(SCENARIOS.keys())

    # Balanced split: each tool gets the quotient, the first `extra` tools
    # absorb the remainder.
    per_tool, extra = divmod(num_examples, len(tool_names))

    dataset: List[Dict] = []
    for index, name in enumerate(tool_names):
        quota = per_tool + (1 if index < extra else 0)
        bases = SCENARIOS[name]

        for k in range(quota):
            chosen = bases[k % len(bases)]
            # Past the first full pass over the bases, vary for diversity.
            if k >= len(bases):
                chosen = augment_scenario(chosen, name)
            dataset.append(generate_example(chosen))

    # Shuffle so tools are interleaved during training.
    random.shuffle(dataset)

    return dataset
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
def save_jsonl(examples: List[Dict], output_path: str):
    """Write *examples* to *output_path* as JSON Lines (one object per line).

    Parent directories are created as needed; non-ASCII text is written
    verbatim (ensure_ascii=False) in UTF-8.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    with open(target, 'w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(item, ensure_ascii=False) + '\n' for item in examples
        )
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
def save_json(examples: List[Dict], output_path: str):
    """Write *examples* to *output_path* as one pretty-printed JSON array.

    Parent directories are created as needed; non-ASCII text is written
    verbatim (ensure_ascii=False) in UTF-8, indented by two spaces.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    with open(target, 'w', encoding='utf-8') as handle:
        json.dump(examples, handle, ensure_ascii=False, indent=2)
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
def main():
    """CLI entry point: generate the synthetic dataset, save it in the
    requested format(s), and print summary statistics plus a sample."""
    parser = argparse.ArgumentParser(description="Generate synthetic tool-calling training data")
    parser.add_argument("--num-examples", type=int, default=1000, help="Number of examples to generate")
    parser.add_argument("--output-dir", type=str, default="training-data", help="Output directory")
    parser.add_argument("--output-format", choices=["jsonl", "json", "both"], default="jsonl", help="Output format")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()

    # Set seed for reproducibility
    random.seed(args.seed)

    print(f"🎯 Generating {args.num_examples} tool-calling training examples...")
    print(f" Output directory: {args.output_dir}")
    print(f" Format: {args.output_format}")
    print()

    # Generate dataset
    examples = generate_dataset(args.num_examples)

    output_dir = Path(args.output_dir)

    # Save based on format ("both" writes both files)
    if args.output_format in ["jsonl", "both"]:
        jsonl_path = output_dir / "tool_examples.jsonl"
        save_jsonl(examples, str(jsonl_path))
        print(f"✅ Saved JSONL: {jsonl_path}")

    if args.output_format in ["json", "both"]:
        json_path = output_dir / "tool_examples.json"
        save_json(examples, str(json_path))
        print(f"✅ Saved JSON: {json_path}")

    # Statistics
    print(f"\n📊 Statistics:")
    print(f" Total examples: {len(examples)}")

    # Count examples per tool by scanning each example's messages for the
    # assistant tool_calls entry and reading the first call's function name.
    tool_counts = {}
    for ex in examples:
        for msg in ex["messages"]:
            if msg.get("tool_calls"):
                tool_name = msg["tool_calls"][0]["function"]["name"]
                tool_counts[tool_name] = tool_counts.get(tool_name, 0) + 1

    print(f" Examples by tool:")
    for tool, count in sorted(tool_counts.items(), key=lambda x: x[1], reverse=True):
        print(f" - {tool}: {count}")

    # Show sample (messages[1] is the user turn; [0] is the system prompt)
    print(f"\n📝 Sample example (first in dataset):")
    sample = examples[0]
    print(f" Tools defined: {len(sample['tools'])}")
    print(f" Messages: {len(sample['messages'])}")
    print(f" First user message: {sample['messages'][1]['content'][:60]}...")

    print(f"\n✨ Generation complete!")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/generate_tool_use_tests.py
DELETED
|
@@ -1,163 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Tool-Use Evaluation Framework for Stack 2.9.
|
| 4 |
-
Generates test cases and evaluates model's tool selection accuracy.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import json
|
| 8 |
-
import random
|
| 9 |
-
import re
|
| 10 |
-
from pathlib import Path
|
| 11 |
-
from typing import Dict, List, Any
|
| 12 |
-
import argparse
|
| 13 |
-
|
| 14 |
-
def load_tool_catalog(path: str) -> List[Dict]:
    """Load the tool catalog (a JSON list of tool specs) from *path*.

    Improvement: opens the file with an explicit UTF-8 encoding so the
    result does not depend on the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
|
| 17 |
-
|
| 18 |
-
def generate_test_case(tool: Dict[str, Any]) -> Dict[str, Any]:
    """Build one randomized tool-selection test case for *tool*.

    Picks a natural-language prompt template for the tool, fills in its
    {placeholder}s with plausible random values, and records the tool name
    and parameters the model is expected to produce when given the prompt.
    """
    tool_name = tool["tool"]

    # Per-tool prompt templates (simplified); tools without an entry use
    # the generic fallback below.
    user_prompts = {
        "FileReadTool": [
            "Read {file_path}",
            "Show me the contents of {file_path}",
            "What's in {file_path}?",
            "Open {file_path}"
        ],
        "FileWriteTool": [
            "Create a new file {file_path} with content: {content}",
            "Write this to {file_path}: {content}",
            "Save the following as {file_path}: {content}"
        ],
        "GlobTool": [
            "Find all {pattern} files",
            "List files matching {pattern}",
            "Show me every {pattern}",
            "Search for files like {pattern}"
        ],
        "GrepTool": [
            "Search for {pattern} in {directory}",
            "Find all {pattern}",
            "Grep for {pattern}",
            "Locate {pattern} in the codebase"
        ],
        "BashTool": [
            "Run: {command}",
            "Execute {command}",
            "Please run {command}",
            "Can you execute {command}?"
        ]
        # ... others use default fallback
    }
    templates = user_prompts.get(tool_name, [
        "Use {tool} to do something",
        "Execute {tool}",
        "Call {tool}"
    ])

    chosen_template = random.choice(templates)

    # Candidate values for the known placeholder kinds.
    value_pools = {
        'file_path': [
            "src/main.py", "README.md", "package.json",
            "config.yaml", "tests/test_api.py", "src/index.js"
        ],
        'pattern': [
            "**/*.py", "**/*.js", "**/*.md", "**/*.test.*",
            "src/**/*.ts", "lib/**/*.py"
        ],
        'command': [
            "npm test", "pytest", "git status", "ls -la",
            "make build", "python -m pip install -e ."
        ],
        'query': ["TODO", "FIXME", "BUG", "HACK"],
        'directory': [".", "src", "tests", "lib", "app"],
    }

    # Fill each {placeholder} found in the chosen template.
    params = {}
    for ph in re.findall(r'{(.*?)}', chosen_template):
        if ph in value_pools:
            params[ph] = random.choice(value_pools[ph])
        elif ph == 'content':
            params[ph] = "console.log('test');"
        elif ph == 'tool':
            params[ph] = tool_name
        else:
            params[ph] = f"value_{random.randint(1,100)}"

    prompt = chosen_template.format(**params)

    # 'tool' is a substitution-only placeholder, not a real tool parameter.
    expected_params = {k: v for k, v in params.items() if k != 'tool'}

    return {
        "test_id": f"{tool_name}_{random.randint(1000,9999)}",
        "prompt": prompt,
        "expected_tool": tool_name,
        "expected_params": expected_params,
        "tool_description": tool.get("description", ""),
        "difficulty": random.choice(["easy", "medium", "hard"])
    }
|
| 113 |
-
|
| 114 |
-
def generate_test_suite(catalog: List[Dict], tests_per_tool: int = 10) -> List[Dict]:
    """Generate the full test suite: *tests_per_tool* cases per catalog tool.

    Cases are emitted in catalog order, all cases for one tool before the
    next, matching the original nested-loop behavior.
    """
    return [
        generate_test_case(entry)
        for entry in catalog
        for _ in range(tests_per_tool)
    ]
|
| 122 |
-
|
| 123 |
-
def main():
    """CLI entry point: load the tool catalog, generate a test suite, write JSON.

    Prints a per-tool distribution summary (top 10) after saving.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--catalog", type=str, default="training-data/tools/catalog.json")
    arg_parser.add_argument("--output", type=str, default="stack-2.9-eval/tool_use/test_cases.json")
    arg_parser.add_argument("--tests-per-tool", type=int, default=10)
    opts = arg_parser.parse_args()

    catalog_file = Path(opts.catalog)
    result_file = Path(opts.output)

    # Bail out early if there is nothing to generate from.
    if not catalog_file.exists():
        print(f"❌ Catalog not found: {catalog_file}")
        return

    tools = load_tool_catalog(catalog_file)
    print(f"🔧 Generating test cases for {len(tools)} tools")

    suite = generate_test_suite(tools, opts.tests_per_tool)

    result_file.parent.mkdir(parents=True, exist_ok=True)
    with open(result_file, 'w') as sink:
        json.dump(suite, sink, indent=2)

    print(f"\n✨ Generated {len(suite)} test cases")
    print(f"   Saved to: {result_file}")

    # Per-tool distribution summary, highest counts first.
    counts = {}
    for case in suite:
        counts[case["expected_tool"]] = counts.get(case["expected_tool"], 0) + 1

    print("\n📊 Test cases per tool (top 10):")
    for name, n in sorted(counts.items(), key=lambda item: item[1], reverse=True)[:10]:
        print(f"   {name}: {n}")

    print("\n✅ Test suite ready!")
    print("   To evaluate: run tool_use_evaluator.py with a trained model")

if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/mine_sessions.py
DELETED
|
@@ -1,233 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Mine OpenClaw/Claude sessions for training data.
|
| 4 |
-
Extracts conversations with tool use into JSONL format.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import os
|
| 8 |
-
import json
|
| 9 |
-
import glob
|
| 10 |
-
from pathlib import Path
|
| 11 |
-
from typing import Dict, List, Any
|
| 12 |
-
import argparse
|
| 13 |
-
|
| 14 |
-
def find_session_logs() -> List[Path]:
    """Scan well-known directories for session logs (JSON/JSONL/Markdown files).

    Missing directories are skipped silently; returns whatever was found
    (possibly an empty list).
    """
    home = Path.home()
    cwd = Path.cwd()
    candidate_dirs = [
        # OpenClaw sessions
        home / ".openclaw" / "sessions",
        home / ".openclaw" / "history",
        # Claude Code sessions
        home / ".claude" / "sessions",
        home / ".anthropic" / "sessions",
        # Generic fallbacks
        home / "Documents" / "claude_sessions",
        cwd / "sessions",
        cwd / ".sessions",
    ]

    found: List[Path] = []
    for directory in candidate_dirs:
        if not directory.exists():
            continue
        for pattern in ("*.json", "*.jsonl", "*.md"):
            found.extend(directory.glob(pattern))
    return found
|
| 37 |
-
|
| 38 |
-
def parse_json_conversation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Convert one parsed JSON document into zero or more training examples.

    Recognizes three layouts, checked in this order:
      * ``messages``     — OpenAI chat format, used as-is
      * ``conversation`` — custom format with a conversation array
      * ``turns``        — turn-based format, normalized into message dicts
    Only conversations accepted by ``is_valid_conversation`` are returned.
    """
    examples: List[Dict[str, Any]] = []

    if "messages" in data:
        candidate = data["messages"]
        source = "openai_format"
    elif "conversation" in data:
        candidate = data["conversation"]
        source = "custom"
    elif "turns" in data:
        # Normalize each turn; turns lacking role/content are dropped.
        candidate = [
            {
                "role": turn["role"],
                "content": turn["content"],
                "tool_use": turn.get("tool_use"),
                "tool_result": turn.get("tool_result"),
            }
            for turn in data["turns"]
            if "role" in turn and "content" in turn
        ]
        source = "turn_based"
    else:
        return examples

    if is_valid_conversation(candidate):
        examples.append({"messages": candidate, "source": source})
    return examples
|
| 71 |
-
|
| 72 |
-
def is_valid_conversation(messages: List[Dict[str, Any]]) -> bool:
    """Return True for a list of >= 2 messages containing both a user and an assistant turn."""
    if not isinstance(messages, list) or len(messages) < 2:
        return False
    seen_roles = {m.get("role") for m in messages if "role" in m}
    return "user" in seen_roles and "assistant" in seen_roles
|
| 83 |
-
|
| 84 |
-
def parse_markdown_conversation(text: str) -> List[Dict[str, Any]]:
    """Parse Markdown chat logs (Claude Code style) into training examples.

    Recognized role markers at the start of a line:
      ``**User:**`` / ``User:``            -> user turn
      ``**Assistant:**`` / ``Assistant:``  -> assistant turn
      ``**Tool:**`` / ``Tool Use:``        -> assistant turn (tool-use block)

    Bug fix vs. the original: for bold markers such as ``**User:** hi`` the
    old code took ``line.split(":", 1)[1]``, which cuts *inside* the marker
    and leaves a stray ``**`` at the start of the content. The full marker
    prefix is now stripped before the remainder is used.
    """
    user_markers = ("**User:**", "User:")
    assistant_markers = ("**Assistant:**", "Assistant:")
    tool_markers = ("**Tool:**", "Tool Use:")

    def _rest_after(line: str, markers) -> List[str]:
        # Content following the first matching marker on this line, if any.
        for marker in markers:
            if line.startswith(marker):
                rest = line[len(marker):].strip()
                return [rest] if rest else []
        return []

    messages: List[Dict[str, Any]] = []
    current_role = None
    current_content: List[str] = []

    def _flush():
        # Close out the message being accumulated, if one is open.
        if current_role:
            messages.append({
                "role": current_role,
                "content": "\n".join(current_content).strip()
            })

    for raw_line in text.split("\n"):
        line = raw_line.rstrip()

        if line.startswith(user_markers):
            _flush()
            current_role = "user"
            current_content = _rest_after(line, user_markers)
        elif line.startswith(assistant_markers):
            _flush()
            current_role = "assistant"
            current_content = _rest_after(line, assistant_markers)
        elif line.startswith(tool_markers):
            _flush()
            # Tool-use blocks are attributed to the assistant; tool name and
            # parameters could be parsed here in the future.
            current_role = "assistant"
            current_content = []
        else:
            if current_role:
                current_content.append(line)

    # Don't forget the trailing message.
    if current_role and current_content:
        messages.append({
            "role": current_role,
            "content": "\n".join(current_content).strip()
        })

    examples: List[Dict[str, Any]] = []
    if is_valid_conversation(messages):
        examples.append({"messages": messages, "source": "markdown"})
    return examples
|
| 143 |
-
|
| 144 |
-
def save_examples(examples: List[Dict[str, Any]], output_path: Path):
    """Append each example as one JSON line to *output_path*, creating parent dirs."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    lines = [json.dumps(example) + "\n" for example in examples]
    with open(output_path, 'a') as sink:
        sink.writelines(lines)
|
| 151 |
-
|
| 152 |
-
def main():
    """Locate session logs, parse each into training examples, append to JSONL.

    Supported inputs:
      * ``.jsonl`` — one JSON document per line
      * ``.json``  — a single conversation object, or a top-level array of them
      * anything else — treated as Markdown/text chat logs

    Bug fix vs. the original: a ``.json`` file whose top level is an *array*
    of conversations was silently skipped — the array fallback lived in a
    ``json.JSONDecodeError`` handler that could never fire, because
    ``json.loads`` parses arrays successfully on the first attempt. Arrays
    are now handled on the success path, and the bare ``except:`` is gone.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", type=str, default="training-data/scaled/sessions.jsonl")
    parser.add_argument("--dry-run", action="store_true", help="Just list files, don't parse")
    args = parser.parse_args()

    output_path = Path(args.output)

    print(f"🔍 Searching for session logs...")
    log_files = find_session_logs()

    if not log_files:
        print("⚠️ No session logs found in standard locations.")
        print("   Expected locations: ~/.openclaw/sessions, ~/.claude/sessions, ~/.anthropic/sessions")
        return

    print(f"📁 Found {len(log_files)} log files")

    if args.dry_run:
        for f in log_files[:10]:
            print(f"  - {f}")
        if len(log_files) > 10:
            print(f"  ... and {len(log_files)-10} more")
        return

    total_examples = 0
    for log_file in log_files:
        try:
            with open(log_file, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            examples = []

            if log_file.suffix == '.jsonl':
                # One JSON document per line; lines that don't parse are skipped.
                for line in content.split('\n'):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    examples.extend(parse_json_conversation(data))
            elif log_file.suffix == '.json':
                try:
                    data = json.loads(content)
                except json.JSONDecodeError:
                    data = None
                if isinstance(data, list):
                    # Top-level array of conversation objects.
                    for item in data:
                        if isinstance(item, dict):
                            examples.extend(parse_json_conversation(item))
                elif isinstance(data, dict):
                    examples.extend(parse_json_conversation(data))
            else:
                # Markdown or plain-text transcript.
                examples.extend(parse_markdown_conversation(content))

            if examples:
                save_examples(examples, output_path)
                total_examples += len(examples)
                print(f"✅ {log_file.name}: {len(examples)} examples")

        except Exception as e:
            # Per-file isolation: one unreadable log must not abort the run.
            print(f"❌ Error processing {log_file}: {e}")

    print(f"\n✨ Extracted {total_examples} examples from session logs")
    print(f"   Saved to: {output_path}")

    if total_examples == 0:
        print("\n⚠️ No valid conversations found. Consider:")
        print("   1. Check if you have session logs in non-standard locations")
        print("   2. Your logs may be in a different format")
        print("   3. You may need to export conversations from your tools")

if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/quality_validate.py
DELETED
|
@@ -1,158 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Quality validation for Stack 2.9 training dataset.
|
| 4 |
-
Checks: message structure, tool format, schema compliance.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import json
|
| 8 |
-
from pathlib import Path
|
| 9 |
-
from typing import Dict, List, Any
|
| 10 |
-
import argparse
|
| 11 |
-
from collections import Counter
|
| 12 |
-
|
| 13 |
-
def load_tool_catalog(path: str) -> Dict[str, Any]:
    """Load the tool catalog JSON (a list of tool entries) keyed by tool name."""
    with open(path, 'r') as handle:
        entries = json.load(handle)
    return {entry["tool"]: entry for entry in entries}
|
| 16 |
-
|
| 17 |
-
def validate_example(example: Dict[str, Any], tool_catalog: Dict[str, Any]) -> List[str]:
    """Validate one training example against the message schema and tool catalog.

    Returns a list of human-readable error strings; an empty list means valid.

    Checks performed:
      * top-level ``messages`` field exists and is a list of >= 2 messages
      * every message role is one of system/user/assistant
      * assistant ``tool_use`` blocks carry ``name`` (known in the catalog)
        and ``input``
      * user ``tool_result`` blocks carry ``tool_use_id`` and ``content``
      * message content, when present, is a non-empty string (user messages
        carrying a ``tool_result`` are exempt from the non-empty check)

    Bug fix vs. the original: a non-``None`` non-string content value (e.g.
    an int) raised ``AttributeError`` on ``content.strip()`` instead of
    being reported; the two content checks are now mutually exclusive.
    """
    errors: List[str] = []

    if "messages" not in example:
        errors.append("Missing 'messages' field")
        return errors

    messages = example["messages"]
    if not isinstance(messages, list) or len(messages) < 2:
        errors.append("Invalid messages: must be list with at least 2 messages")
        return errors

    # Roles must come from the closed chat-role set.
    roles = [msg.get("role") for msg in messages]
    valid_roles = {"system", "user", "assistant"}
    if not all(r in valid_roles for r in roles):
        errors.append(f"Invalid roles: {roles}")

    # Structural checks on tool_use / tool_result blocks.
    for msg in messages:
        if msg.get("role") == "assistant" and "tool_use" in msg:
            tool_use = msg["tool_use"]
            if "name" not in tool_use:
                errors.append("Tool use missing 'name'")
            else:
                tool_name = tool_use["name"]
                if tool_name not in tool_catalog:
                    errors.append(f"Unknown tool: {tool_name}")
                if "input" not in tool_use:
                    errors.append(f"Tool use missing 'input' for {tool_name}")

        if msg.get("role") == "user" and "tool_result" in msg:
            tool_result = msg["tool_result"]
            if "tool_use_id" not in tool_result:
                errors.append("Tool result missing 'tool_use_id'")
            if "content" not in tool_result:
                errors.append("Tool result missing 'content'")

    # Content checks (unused enumerate index removed vs. the original).
    for msg in messages:
        role = msg.get("role")
        content = msg.get("content")
        if role == "user" and "tool_result" in msg:
            continue  # tool-result user messages may have empty content
        if content is None:
            continue
        if not isinstance(content, str):
            # FIX: don't fall through to .strip() on a non-string value.
            errors.append(f"Message content must be string, got {type(content)}")
        elif len(content.strip()) == 0:
            errors.append(f"Empty content in {role} message")

    return errors
|
| 68 |
-
|
| 69 |
-
def main():
    """Validate a JSONL training set, print a summary, and write a quality report.

    Bug fixes vs. the original:
      * ``datetime`` was only imported inside the ``__main__`` guard, so
        calling ``main()`` from another module raised ``NameError`` when
        building the report; it is now imported locally.
      * an empty input file raised ``ZeroDivisionError`` in the summary
        print; zero totals are now handled explicitly.
    """
    import datetime  # local import: keeps main() usable when this module is imported

    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=str, default="training-data/final/train.jsonl")
    parser.add_argument("--catalog", type=str, default="training-data/tools/catalog.json")
    parser.add_argument("--output-report", type=str, default="training-data/final/quality_report.json")
    args = parser.parse_args()

    input_path = Path(args.input)
    catalog_path = Path(args.catalog)

    if not input_path.exists():
        print(f"❌ Input not found: {input_path}")
        return

    if not catalog_path.exists():
        # Validation still runs; only tool-name checks are skipped.
        print(f"⚠️ Catalog not found: {catalog_path}, skipping tool validation")
        tool_catalog = {}
    else:
        tool_catalog = load_tool_catalog(catalog_path)
        print(f"✅ Loaded tool catalog with {len(tool_catalog)} tools")

    print(f"🔍 Validating {input_path}...")

    total_examples = 0
    valid_examples = 0
    error_distribution = Counter()
    tool_usage = Counter()

    with open(input_path, 'r') as f:
        for line in f:
            total_examples += 1
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                error_distribution["JSON decode error"] += 1
                continue

            errors = validate_example(example, tool_catalog)
            if errors:
                error_distribution.update(errors)
            else:
                valid_examples += 1

            # Track tool usage regardless of validation outcome.
            for msg in example.get("messages", []):
                if "tool_use" in msg:
                    tool_usage[msg["tool_use"]["name"]] += 1

    # FIX: guard against an empty input file (total_examples == 0).
    validity_rate = valid_examples / total_examples if total_examples > 0 else 0

    print(f"\n📊 Validation Results:")
    print(f"   Total examples: {total_examples}")
    print(f"   Valid: {valid_examples} ({validity_rate*100:.1f}%)")
    print(f"   Invalid: {total_examples - valid_examples}")

    if error_distribution:
        print("\n   Error breakdown:")
        for err, count in error_distribution.most_common(10):
            print(f"   - {err}: {count}")

    print("\n   Tool usage (top 10):")
    for tool, count in tool_usage.most_common(10):
        print(f"   - {tool}: {count}")

    report = {
        "total_examples": total_examples,
        "valid_examples": valid_examples,
        "invalid_examples": total_examples - valid_examples,
        "validity_rate": validity_rate,
        "error_distribution": dict(error_distribution),
        "tool_usage": dict(tool_usage),
        "generated_at": datetime.datetime.now().isoformat()
    }

    output_report = Path(args.output_report)
    output_report.parent.mkdir(parents=True, exist_ok=True)
    with open(output_report, 'w') as f:
        json.dump(report, f, indent=2)

    print(f"\n✅ Report saved: {output_report}")

    if validity_rate < 0.9:
        print("\n⚠️ Quality below 90%. Consider filtering invalid examples before training.")
    else:
        print("\n✅ Dataset quality looks good for training!")

if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/training-data-extractor.js
DELETED
|
@@ -1,1098 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env node
|
| 2 |
-
/**
|
| 3 |
-
* Stack 2.9 - Enhanced Training Data Extractor
|
| 4 |
-
* Extracts training examples from OpenClaw codebase
|
| 5 |
-
*
|
| 6 |
-
* Features:
|
| 7 |
-
* 1. Parse code patterns: function+comment pairs, error messages, test files
|
| 8 |
-
* 2. Real conversation parsing (JSON, JSONL, Markdown formats)
|
| 9 |
-
* 3. Synthetic examples (50+ per tool)
|
| 10 |
-
* 4. JSONL output
|
| 11 |
-
*/
|
| 12 |
-
|
| 13 |
-
import fs from 'fs';
|
| 14 |
-
import path from 'path';
|
| 15 |
-
import { fileURLToPath } from 'url';
|
| 16 |
-
import os from 'os';
|
| 17 |
-
|
| 18 |
-
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Repository-relative input/output locations used by every extractor below.
const SRC_DIR = path.join(__dirname, 'src');
const OUTPUT_DIR = path.join(__dirname, 'training-data');
const SYNTHETIC_DIR = path.join(OUTPUT_DIR, 'synthetic');
const TOOLS_SCHEMA_DIR = path.join(OUTPUT_DIR, 'tools');
const CODE_PAIRS_DIR = path.join(OUTPUT_DIR, 'code-pairs');
const CONVERSATIONS_DIR = path.join(OUTPUT_DIR, 'conversations');

// Make sure every output directory exists before any extractor runs.
[OUTPUT_DIR, SYNTHETIC_DIR, TOOLS_SCHEMA_DIR, CODE_PAIRS_DIR, CONVERSATIONS_DIR]
  .forEach((dir) => fs.mkdirSync(dir, { recursive: true }));
|
| 33 |
-
|
| 34 |
-
// ============================================================================
|
| 35 |
-
// 1. EXTRACT TOOL SCHEMAS FROM src/tools/
|
| 36 |
-
// ============================================================================
|
| 37 |
-
|
| 38 |
-
/**
 * Scan src/tools/<name>/ directories and build a machine-readable catalog of
 * tool schemas: the description comes from the first JSDoc block in the
 * tool's prompt.ts, the input schema from its `interface ...Input` fields.
 * Writes training-data/tools/catalog.json and returns the schema list.
 *
 * Bug fix vs. the original: the implementation file was chosen with
 * `path.join(a) || path.join(b)` — `path.join` always returns a non-empty
 * (truthy) string, so the `.ts` fallback was never used and `<name>.ts`
 * implementations were invisible. We now pick whichever of `.tsx` / `.ts`
 * actually exists on disk.
 */
function extractToolSchemas() {
  const toolsDir = path.join(SRC_DIR, 'tools');
  if (!fs.existsSync(toolsDir)) {
    console.log('⚠️ Tools directory not found, skipping...');
    return [];
  }

  const schemas = [];
  const toolDirs = fs.readdirSync(toolsDir).filter(name =>
    fs.statSync(path.join(toolsDir, name)).isDirectory()
  );

  for (const toolDir of toolDirs) {
    const toolPath = path.join(toolsDir, toolDir);
    const promptFile = path.join(toolPath, 'prompt.ts');
    const tsxFile = path.join(toolPath, toolDir + '.tsx');
    const tsFile = path.join(toolPath, toolDir + '.ts');
    // FIX: prefer the .tsx implementation, fall back to .ts if it exists.
    const toolFile = fs.existsSync(tsxFile) ? tsxFile : tsFile;

    if (!fs.existsSync(promptFile) && !fs.existsSync(toolFile)) continue;

    try {
      const promptContent = fs.existsSync(promptFile) ? fs.readFileSync(promptFile, 'utf-8') : '';
      const toolContent = fs.existsSync(toolFile) ? fs.readFileSync(toolFile, 'utf-8') : '';

      // Tool description: first JSDoc block in the prompt file, de-starred
      // and truncated to 300 chars.
      let description = '';
      const descMatch = promptContent.match(/\/\*\*([\s\S]*?)\*\//);
      if (descMatch) {
        description = descMatch[1]
          .replace(/^\s*\* ?/gm, '')
          .replace(/^\s*\*/g, '')
          .replace(/\*\/$/, '')
          .trim()
          .substring(0, 300);
      }

      // Input schema: fields of the first `interface ...Input...` block.
      const inputSchema = {};
      const interfaceMatch = toolContent.match(/interface\s+(\w+Input\w*)\s*\{([\s\S]*?)\}/);
      if (interfaceMatch) {
        const fields = interfaceMatch[2].match(/(\w+)(\??):\s*([^;]+);/g) || [];
        for (const field of fields) {
          const m = field.match(/(\w+)(\??):\s*([^;]+);/);
          if (m) {
            inputSchema[m[1]] = { type: m[3].trim(), optional: m[2] === '?' };
          }
        }
      }

      schemas.push({
        tool: toolDir,
        description,
        hasPrompt: !!promptContent,
        hasImplementation: !!toolContent,
        inputSchema
      });
    } catch (e) {
      // Per-tool isolation: one malformed tool must not abort the scan.
      console.log(`⚠️ Error parsing ${toolDir}: ${e.message}`);
    }
  }

  // Persist the catalog for downstream generators.
  fs.writeFileSync(
    path.join(TOOLS_SCHEMA_DIR, 'catalog.json'),
    JSON.stringify(schemas, null, 2)
  );

  console.log(`✅ Extracted ${schemas.length} tool schemas`);
  return schemas;
}
|
| 108 |
-
|
| 109 |
-
// ============================================================================
|
| 110 |
-
// 2. EXTRACT CODE-COMMENT PAIRS FROM src/
|
| 111 |
-
// ============================================================================
|
| 112 |
-
|
| 113 |
-
/**
 * Walk src/ and harvest three kinds of training pairs:
 *   1. functions with a JSDoc comment (signature + body excerpt + comment)
 *   2. thrown/logged error messages, categorized with a fix suggestion
 *   3. classes whose bodies contain try/catch error handling
 * Results are written to training-data/code-pairs/pairs.json and returned.
 *
 * Bug fix vs. the original: the class-extraction loop read `match[1]` /
 * `match[2]` — the function-regex result, which is `null` once that
 * while-loop has ended — instead of `classMatch[...]`. The resulting
 * TypeError was silently swallowed by the surrounding try/catch, so any
 * file containing a class lost ALL of its extracted pairs. The loop now
 * reads `classMatch`. Unused duplicate top-level regex constants were
 * also removed.
 */
function extractCodeCommentPairs() {
  console.log('🔍 Extracting code-comment pairs...');
  const pairs = [];

  function processFile(filePath) {
    try {
      const content = fs.readFileSync(filePath, 'utf-8');
      const relativePath = path.relative(SRC_DIR, filePath);

      // Skip test files and mock files for now
      if (filePath.includes('__tests__') || filePath.includes('mocks')) return;

      // --- 1. JSDoc + function pairs -----------------------------------
      const funcRegex = /\/\*\*([\s\S]*?)\*\/\s*\n\s*(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)\s*(?::\s*([^;\n]+))?[^{]*\{([\s\S]*?)\n\}/g;
      let match;
      while ((match = funcRegex.exec(content)) !== null) {
        const jsdoc = match[1].replace(/^\s*\*\s*/gm, '').trim();
        const funcName = match[2];
        const params = match[3].trim();
        const returnType = match[4]?.trim() || 'void';
        const body = match[5].trim();

        // Only include if meaningful (not too short, has actual logic)
        if (body.length > 50 && jsdoc.length > 10) {
          pairs.push({
            type: 'function',
            name: funcName,
            path: relativePath,
            code: `function ${funcName}(${params})${returnType ? `: ${returnType}` : ''} { ... }`,
            fullBody: body.substring(0, 500),
            comment: jsdoc.substring(0, 300),
            commentType: 'jsdoc'
          });
        }
      }

      // --- 2. Error messages -------------------------------------------
      const errorRegex = /(?:throw\s+new\s+Error|logger\.error|console\.error)\s*\(\s*[`"']([^`'"]+)[`'"]/g;
      let errorMatch;
      while ((errorMatch = errorRegex.exec(content)) !== null) {
        const errorMsg = errorMatch[1];
        // Categorize by message keywords.
        let category = 'general';
        if (errorMsg.includes('not found') || errorMsg.includes('No such')) category = 'not_found';
        else if (errorMsg.includes('permission') || errorMsg.includes('denied')) category = 'permission';
        else if (errorMsg.includes('invalid') || errorMsg.includes('malformed')) category = 'validation';
        else if (errorMsg.includes('timeout')) category = 'timeout';
        else if (errorMsg.includes('already')) category = 'conflict';

        pairs.push({
          type: 'error_message',
          path: relativePath,
          message: errorMsg,
          category,
          fixSuggestion: generateFixSuggestion(errorMsg, category)
        });
      }

      // --- 3. Classes with try/catch error handling --------------------
      const classRegex = /class\s+(\w+)\s*\{([^}]*(?:\{[^}]*\}[^}]*)*)\}/g;
      let classMatch;
      while ((classMatch = classRegex.exec(content)) !== null) {
        const className = classMatch[1]; // FIX: was match[1]
        const classBody = classMatch[2]; // FIX: was match[2]
        if (classBody.includes('try') && classBody.includes('catch')) {
          pairs.push({
            type: 'error_handling_class',
            name: className,
            path: relativePath,
            pattern: 'try-catch',
            example: classBody.substring(0, 400)
          });
        }
      }
    } catch (e) {
      // Skip files that can't be read
    }
  }

  function walkDir(dir, extensions = ['.ts', '.tsx']) {
    if (!fs.existsSync(dir)) return;

    const entries = fs.readdirSync(dir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory() && !entry.name.startsWith('.') && entry.name !== 'node_modules') {
        walkDir(fullPath, extensions);
      } else if (entry.isFile() && extensions.some(ext => entry.name.endsWith(ext))) {
        processFile(fullPath);
      }
    }
  }

  walkDir(SRC_DIR);

  // Save code-comment pairs
  fs.writeFileSync(
    path.join(CODE_PAIRS_DIR, 'pairs.json'),
    JSON.stringify(pairs, null, 2)
  );

  console.log(`✅ Extracted ${pairs.length} code-comment pairs`);
  return pairs;
}
|
| 228 |
-
|
| 229 |
-
/**
 * Map an error category to a generic remediation hint for training data.
 * Unknown categories fall back to the generic suggestion. The `message`
 * argument is accepted for interface compatibility but not consulted here.
 */
function generateFixSuggestion(message, category) {
  const hints = {
    not_found: 'Check if the resource exists or provide the correct path',
    permission: 'Ensure you have the necessary permissions for this operation',
    validation: 'Verify the input format and required fields',
    timeout: 'Increase timeout duration or check network connectivity',
    conflict: 'Check if the resource already exists or needs to be deleted first',
    general: 'Review the error message and correct the underlying issue'
  };
  const hint = hints[category];
  return hint || hints.general;
}
|
| 240 |
-
|
| 241 |
-
// ============================================================================
|
| 242 |
-
// 3. PARSE TEST FILES FOR TEST-GENERATION EXAMPLES
|
| 243 |
-
// ============================================================================
|
| 244 |
-
|
| 245 |
-
function extractTestExamples() {
|
| 246 |
-
console.log('🧪 Extracting test examples...');
|
| 247 |
-
const testExamples = [];
|
| 248 |
-
|
| 249 |
-
const testPattern = /describe\s*\(\s*['"]([^'"]+)['"](?:\s*,\s*)?\(\s*\)\s*=>\s*\{([^}]*(?:\{[^}]*\}[^}]*)*)\}\s*\)/g;
|
| 250 |
-
const itPattern = /it\s*\(\s*['"]([^'"]+)['"](?:\s*,\s*)?(?:async\s+)?\(\s*\)\s*(?:=>\s*)?\{([\s\S]*?)\n\s*\}/g;
|
| 251 |
-
const expectPattern = /expect\s*\(([^)]+)\)\.(to[^;(]+)\s*\(([^)]+)\)/g;
|
| 252 |
-
|
| 253 |
-
function processTestFile(filePath) {
|
| 254 |
-
try {
|
| 255 |
-
const content = fs.readFileSync(filePath, 'utf-8');
|
| 256 |
-
const relativePath = path.relative(SRC_DIR, filePath);
|
| 257 |
-
|
| 258 |
-
let match;
|
| 259 |
-
while ((match = testPattern.exec(content)) !== null) {
|
| 260 |
-
const testSuite = match[1];
|
| 261 |
-
const testBody = match[2];
|
| 262 |
-
|
| 263 |
-
// Extract individual it() blocks
|
| 264 |
-
const itRegex = /it\s*\(\s*['"]([^'"]+)['"](?:\s*,\s*)?(?:async\s+)?\(\s*\)\s*(?:=>\s*)?\{([\s\S]*?)\n\s*\}/g;
|
| 265 |
-
let itMatch;
|
| 266 |
-
|
| 267 |
-
while ((itMatch = itRegex.exec(testBody)) !== null) {
|
| 268 |
-
const testName = itMatch[1];
|
| 269 |
-
const testCode = itMatch[2].trim();
|
| 270 |
-
|
| 271 |
-
// Extract assertions
|
| 272 |
-
const assertions = [];
|
| 273 |
-
const expectRegex = /expect\s*\(([^)]+)\)\.(\w+)\s*\(([^)]*)\)/g;
|
| 274 |
-
let expectMatch;
|
| 275 |
-
|
| 276 |
-
while ((expectMatch = expectRegex.exec(testCode)) !== null) {
|
| 277 |
-
assertions.push({
|
| 278 |
-
actual: expectMatch[1],
|
| 279 |
-
matcher: expectMatch[2],
|
| 280 |
-
expected: expectMatch[3]
|
| 281 |
-
});
|
| 282 |
-
}
|
| 283 |
-
|
| 284 |
-
if (assertions.length > 0) {
|
| 285 |
-
testExamples.push({
|
| 286 |
-
type: 'test_example',
|
| 287 |
-
suite: testSuite,
|
| 288 |
-
name: testName,
|
| 289 |
-
path: relativePath,
|
| 290 |
-
code: testCode.substring(0, 400),
|
| 291 |
-
assertions,
|
| 292 |
-
isAsync: testCode.includes('await')
|
| 293 |
-
});
|
| 294 |
-
}
|
| 295 |
-
}
|
| 296 |
-
}
|
| 297 |
-
} catch (e) {
|
| 298 |
-
// Skip files that can't be read
|
| 299 |
-
}
|
| 300 |
-
}
|
| 301 |
-
|
| 302 |
-
function walkDir(dir) {
|
| 303 |
-
if (!fs.existsSync(dir)) return;
|
| 304 |
-
|
| 305 |
-
const entries = fs.readdirSync(dir, { withFileTypes: true });
|
| 306 |
-
for (const entry of entries) {
|
| 307 |
-
const fullPath = path.join(dir, entry.name);
|
| 308 |
-
if (entry.isDirectory()) {
|
| 309 |
-
walkDir(fullPath);
|
| 310 |
-
} else if (entry.isFile() && (entry.name.endsWith('.test.ts') || entry.name.endsWith('.test.tsx'))) {
|
| 311 |
-
processTestFile(fullPath);
|
| 312 |
-
}
|
| 313 |
-
}
|
| 314 |
-
}
|
| 315 |
-
|
| 316 |
-
// Look for test files in __tests__ directories
|
| 317 |
-
walkDir(SRC_DIR);
|
| 318 |
-
|
| 319 |
-
// Save test examples
|
| 320 |
-
fs.writeFileSync(
|
| 321 |
-
path.join(CODE_PAIRS_DIR, 'test-examples.json'),
|
| 322 |
-
JSON.stringify(testExamples, null, 2)
|
| 323 |
-
);
|
| 324 |
-
|
| 325 |
-
console.log(`✅ Extracted ${testExamples.length} test examples`);
|
| 326 |
-
return testExamples;
|
| 327 |
-
}
|
| 328 |
-
|
| 329 |
-
// ============================================================================
|
| 330 |
-
// 4. PARSE REAL CONVERSATIONS FROM SESSION LOGS
|
| 331 |
-
// ============================================================================
|
| 332 |
-
|
| 333 |
-
function parseConversations() {
|
| 334 |
-
console.log('💬 Parsing conversations from session logs...');
|
| 335 |
-
const conversations = [];
|
| 336 |
-
|
| 337 |
-
// Common session log locations
|
| 338 |
-
const sessionLogPaths = [
|
| 339 |
-
path.join(os.homedir(), '.claude', 'sessions'),
|
| 340 |
-
path.join(os.homedir(), '.openclaw', 'sessions'),
|
| 341 |
-
path.join(os.homedir(), '.claude', 'conversations'),
|
| 342 |
-
path.join(os.homedir(), '.openclaw', 'conversations'),
|
| 343 |
-
path.join(os.homedir(), '.config', 'claude', 'sessions')
|
| 344 |
-
];
|
| 345 |
-
|
| 346 |
-
function parseJsonFormat(content, source) {
|
| 347 |
-
try {
|
| 348 |
-
const data = JSON.parse(content);
|
| 349 |
-
if (data.messages && Array.isArray(data.messages)) {
|
| 350 |
-
return {
|
| 351 |
-
format: 'json',
|
| 352 |
-
source,
|
| 353 |
-
messages: data.messages,
|
| 354 |
-
metadata: data.metadata || {}
|
| 355 |
-
};
|
| 356 |
-
}
|
| 357 |
-
if (data.conversation && data.conversation.messages) {
|
| 358 |
-
return {
|
| 359 |
-
format: 'json',
|
| 360 |
-
source,
|
| 361 |
-
messages: data.conversation.messages,
|
| 362 |
-
metadata: data.metadata || {}
|
| 363 |
-
};
|
| 364 |
-
}
|
| 365 |
-
} catch (e) {}
|
| 366 |
-
return null;
|
| 367 |
-
}
|
| 368 |
-
|
| 369 |
-
function parseJsonlFormat(content, source) {
|
| 370 |
-
const lines = content.trim().split('\n');
|
| 371 |
-
const conversations = [];
|
| 372 |
-
|
| 373 |
-
for (const line of lines) {
|
| 374 |
-
try {
|
| 375 |
-
const obj = JSON.parse(line);
|
| 376 |
-
if (obj.messages || obj.conversation) {
|
| 377 |
-
conversations.push({
|
| 378 |
-
format: 'jsonl',
|
| 379 |
-
source,
|
| 380 |
-
messages: obj.messages || obj.conversation?.messages || [],
|
| 381 |
-
metadata: obj.metadata || {}
|
| 382 |
-
});
|
| 383 |
-
}
|
| 384 |
-
} catch (e) {}
|
| 385 |
-
}
|
| 386 |
-
|
| 387 |
-
return conversations;
|
| 388 |
-
}
|
| 389 |
-
|
| 390 |
-
function parseMarkdownFormat(content, source) {
|
| 391 |
-
const messages = [];
|
| 392 |
-
const blocks = content.split(/(?=^##?\s+(?:User|Assistant|System|Human|AI))/m);
|
| 393 |
-
|
| 394 |
-
let currentRole = null;
|
| 395 |
-
let currentContent = [];
|
| 396 |
-
|
| 397 |
-
for (const block of blocks) {
|
| 398 |
-
const roleMatch = block.match(/^##?\s+(User|Assistant|System|Human|AI|Assistant \(tool\))/im);
|
| 399 |
-
if (roleMatch) {
|
| 400 |
-
if (currentRole && currentContent.length > 0) {
|
| 401 |
-
messages.push({
|
| 402 |
-
role: currentRole,
|
| 403 |
-
content: currentContent.join('\n').trim()
|
| 404 |
-
});
|
| 405 |
-
}
|
| 406 |
-
currentRole = roleMatch[1].toLowerCase().replace('assistant (tool)', 'tool');
|
| 407 |
-
currentContent = [block.replace(/^##?\s+.*$/m, '').trim()];
|
| 408 |
-
} else if (currentRole) {
|
| 409 |
-
currentContent.push(block.trim());
|
| 410 |
-
}
|
| 411 |
-
}
|
| 412 |
-
|
| 413 |
-
if (currentRole && currentContent.length > 0) {
|
| 414 |
-
messages.push({
|
| 415 |
-
role: currentRole,
|
| 416 |
-
content: currentContent.join('\n').trim()
|
| 417 |
-
});
|
| 418 |
-
}
|
| 419 |
-
|
| 420 |
-
if (messages.length > 0) {
|
| 421 |
-
return {
|
| 422 |
-
format: 'markdown',
|
| 423 |
-
source,
|
| 424 |
-
messages,
|
| 425 |
-
metadata: {}
|
| 426 |
-
};
|
| 427 |
-
}
|
| 428 |
-
return null;
|
| 429 |
-
}
|
| 430 |
-
|
| 431 |
-
function processLogFile(filePath) {
|
| 432 |
-
try {
|
| 433 |
-
const content = fs.readFileSync(filePath, 'utf-8');
|
| 434 |
-
const source = path.relative(os.homedir(), filePath);
|
| 435 |
-
|
| 436 |
-
// Try JSON format
|
| 437 |
-
if (filePath.endsWith('.json')) {
|
| 438 |
-
const parsed = parseJsonFormat(content, source);
|
| 439 |
-
if (parsed) {
|
| 440 |
-
conversations.push(parsed);
|
| 441 |
-
return;
|
| 442 |
-
}
|
| 443 |
-
}
|
| 444 |
-
|
| 445 |
-
// Try JSONL format
|
| 446 |
-
if (filePath.endsWith('.jsonl')) {
|
| 447 |
-
const parsed = parseJsonlFormat(content, source);
|
| 448 |
-
conversations.push(...parsed);
|
| 449 |
-
return;
|
| 450 |
-
}
|
| 451 |
-
|
| 452 |
-
// Try Markdown format
|
| 453 |
-
if (filePath.endsWith('.md') || filePath.endsWith('.mdx')) {
|
| 454 |
-
const parsed = parseMarkdownFormat(content, source);
|
| 455 |
-
if (parsed) {
|
| 456 |
-
conversations.push(parsed);
|
| 457 |
-
}
|
| 458 |
-
}
|
| 459 |
-
} catch (e) {
|
| 460 |
-
// Skip files that can't be read
|
| 461 |
-
}
|
| 462 |
-
}
|
| 463 |
-
|
| 464 |
-
function walkDir(dir) {
|
| 465 |
-
if (!fs.existsSync(dir)) return;
|
| 466 |
-
|
| 467 |
-
try {
|
| 468 |
-
const entries = fs.readdirSync(dir, { withFileTypes: true });
|
| 469 |
-
for (const entry of entries) {
|
| 470 |
-
const fullPath = path.join(dir, entry.name);
|
| 471 |
-
if (entry.isDirectory()) {
|
| 472 |
-
walkDir(fullPath);
|
| 473 |
-
} else if (entry.isFile() &&
|
| 474 |
-
(entry.name.endsWith('.json') ||
|
| 475 |
-
entry.name.endsWith('.jsonl') ||
|
| 476 |
-
entry.name.endsWith('.md') ||
|
| 477 |
-
entry.name.endsWith('.mdx'))) {
|
| 478 |
-
processLogFile(fullPath);
|
| 479 |
-
}
|
| 480 |
-
}
|
| 481 |
-
} catch (e) {
|
| 482 |
-
// Skip directories that can't be accessed
|
| 483 |
-
}
|
| 484 |
-
}
|
| 485 |
-
|
| 486 |
-
// Try all known session log locations
|
| 487 |
-
for (const logPath of sessionLogPaths) {
|
| 488 |
-
walkDir(logPath);
|
| 489 |
-
}
|
| 490 |
-
|
| 491 |
-
// Save parsed conversations
|
| 492 |
-
fs.writeFileSync(
|
| 493 |
-
path.join(CONVERSATIONS_DIR, 'parsed.json'),
|
| 494 |
-
JSON.stringify(conversations, null, 2)
|
| 495 |
-
);
|
| 496 |
-
|
| 497 |
-
console.log(`✅ Parsed ${conversations.length} conversations`);
|
| 498 |
-
return conversations;
|
| 499 |
-
}
|
| 500 |
-
|
| 501 |
-
// ============================================================================
|
| 502 |
-
// 5. GENERATE SYNTHETIC EXAMPLES (50+ per tool)
|
| 503 |
-
// ============================================================================
|
| 504 |
-
|
| 505 |
-
function generateSyntheticExamples(tools) {
|
| 506 |
-
console.log('🎭 Generating synthetic examples...');
|
| 507 |
-
const examples = [];
|
| 508 |
-
|
| 509 |
-
// Base system prompts
|
| 510 |
-
const systemPrompts = [
|
| 511 |
-
'You are Stack, an AI coding assistant. You help with programming tasks, answer questions, and use tools when needed.',
|
| 512 |
-
'You are Stack, an expert programmer specializing in TypeScript, React, and Node.js.',
|
| 513 |
-
'You are Stack, a helpful AI assistant that can read and write files, run commands, and search code.',
|
| 514 |
-
'You are Stack, an AI coding assistant with access to file system tools, search tools, and more.'
|
| 515 |
-
];
|
| 516 |
-
|
| 517 |
-
// =========================================================================
|
| 518 |
-
// EXAMPLES: Basic coding questions and answers
|
| 519 |
-
// =========================================================================
|
| 520 |
-
|
| 521 |
-
const basicQAPairs = [
|
| 522 |
-
{
|
| 523 |
-
q: 'How do I read a file in Node.js?',
|
| 524 |
-
a: 'You can use the `fs` module. Here are the main approaches:\n\n1. Synchronous (simple but blocks):\n```js\nconst fs = require(\'fs\');\nconst content = fs.readFileSync(\'file.txt\', \'utf8\');\n```\n\n2. Async with callbacks:\n```js\nfs.readFile(\'file.txt\', \'utf8\', (err, data) => {\n if (err) throw err;\n console.log(data);\n});\n```\n\n3. Promise-based (recommended):\n```js\nconst fs = require(\'fs\').promises;\nconst data = await fs.readFile(\'file.txt\', \'utf8\');\n```'
|
| 525 |
-
},
|
| 526 |
-
{
|
| 527 |
-
q: 'How do I write to a file in Node.js?',
|
| 528 |
-
a: 'Use the `fs` module:\n\n1. Sync (blocks):\n```js\nfs.writeFileSync(\'output.txt\', \'Hello!\', \'utf8\');\n```\n\n2. Async with promises:\n```js\nawait fs.writeFile(\'output.txt\', \'Hello!\', \'utf8\');\n```\n\n3. Append to existing:\n```js\nawait fs.appendFile(\'log.txt\', \'new entry\\n\', \'utf8\');\n```'
|
| 529 |
-
},
|
| 530 |
-
{
|
| 531 |
-
q: 'How do I list files in a directory?',
|
| 532 |
-
a: 'Use `fs.readdir()`:\n\n```js\nconst files = await fs.readdir(\'./src\', { withFileTypes: true });\n\nfor (const file of files) {\n if (file.isDirectory()) {\n console.log(`📁 ${file.name}/`);\n } else {\n console.log(`📄 ${file.name}`);\n }\n}\n```'
|
| 533 |
-
},
|
| 534 |
-
{
|
| 535 |
-
q: 'What is the difference between let, const, and var?',
|
| 536 |
-
a: 'Key differences:\n\n**const** - Block-scoped, cannot be reassigned (but object properties can change)\n**let** - Block-scoped, can be reassigned\n**var** - Function-scoped (or global), hoisted (avoid in modern JS)\n\n```js\nconst PI = 3.14; // Cannot reassign\nlet count = 0; // Can reassign\ncount = count + 1; // ✓\nvar oldWay = \'hoisted\'; // Avoid - function scoped\n```'
|
| 537 |
-
},
|
| 538 |
-
{
|
| 539 |
-
q: 'How do I use async/await properly?',
|
| 540 |
-
a: 'Async/await is syntactic sugar over promises:\n\n```js\n// Basic usage\nasync function fetchData() {\n try {\n const response = await fetch(url);\n const data = await response.json();\n return data;\n } catch (error) {\n console.error(\'Fetch failed:\', error);\n throw error;\n }\n}\n\n// Parallel execution\nconst [users, posts] = await Promise.all([\n fetch(\'/api/users\').then(r => r.json()),\n fetch(\'/api/posts\').then(r => r.json())\n]);\n```'
|
| 541 |
-
}
|
| 542 |
-
];
|
| 543 |
-
|
| 544 |
-
for (const pair of basicQAPairs) {
|
| 545 |
-
examples.push({
|
| 546 |
-
messages: [
|
| 547 |
-
{ role: 'system', content: systemPrompts[0] },
|
| 548 |
-
{ role: 'user', content: pair.q },
|
| 549 |
-
{ role: 'assistant', content: pair.a }
|
| 550 |
-
]
|
| 551 |
-
});
|
| 552 |
-
}
|
| 553 |
-
|
| 554 |
-
// =========================================================================
|
| 555 |
-
// EXAMPLES: Tool use patterns (per tool)
|
| 556 |
-
// =========================================================================
|
| 557 |
-
|
| 558 |
-
const toolUsePatterns = [
|
| 559 |
-
{
|
| 560 |
-
tool: 'FileReadTool',
|
| 561 |
-
scenarios: [
|
| 562 |
-
{
|
| 563 |
-
user: 'Show me the contents of package.json',
|
| 564 |
-
assistant: 'I\'ll read the package.json file for you.',
|
| 565 |
-
toolName: 'FileReadTool',
|
| 566 |
-
toolInput: { file_path: 'package.json' },
|
| 567 |
-
resultContent: '{\n "name": "my-project",\n "version": "1.0.0",\n "scripts": {\n "dev": "next dev",\n "build": "next build"\n }\n}',
|
| 568 |
-
summary: 'Here\'s what\'s in your package.json:\n\n- **Project name**: my-project\n- **Version**: 1.0.0\n- **Scripts**: dev (next dev), build (next build)'
|
| 569 |
-
},
|
| 570 |
-
{
|
| 571 |
-
user: 'Read the config.ts file from the src directory',
|
| 572 |
-
assistant: 'Let me read that config file.',
|
| 573 |
-
toolName: 'FileReadTool',
|
| 574 |
-
toolInput: { file_path: 'src/config.ts' },
|
| 575 |
-
resultContent: 'export const config = {\n apiUrl: \'https://api.example.com\',\n timeout: 5000,\n retries: 3\n};',
|
| 576 |
-
summary: 'The config file exports a configuration object with:\n\n- **API URL**: https://api.example.com\n- **Timeout**: 5000ms\n- **Retries**: 3'
|
| 577 |
-
},
|
| 578 |
-
{
|
| 579 |
-
user: 'What\'s in the README.md?',
|
| 580 |
-
assistant: 'Reading the README...',
|
| 581 |
-
toolName: 'FileReadTool',
|
| 582 |
-
toolInput: { file_path: 'README.md' },
|
| 583 |
-
resultContent: '# My Project\n\nA brief description of the project.\n\n## Installation\n\nnpm install\n\n## Usage\n\nnpm start',
|
| 584 |
-
summary: 'The README contains:\n\n- Project title and description\n- Installation instructions\n- Usage guide'
|
| 585 |
-
}
|
| 586 |
-
]
|
| 587 |
-
},
|
| 588 |
-
{
|
| 589 |
-
tool: 'GlobTool',
|
| 590 |
-
scenarios: [
|
| 591 |
-
{
|
| 592 |
-
user: 'Find all TypeScript files in src',
|
| 593 |
-
assistant: 'Searching for TypeScript files...',
|
| 594 |
-
toolName: 'GlobTool',
|
| 595 |
-
toolInput: { pattern: 'src/**/*.ts' },
|
| 596 |
-
resultContent: 'src/index.ts\nsrc/utils.ts\nsrc/components/Button.tsx\nsrc/services/api.ts',
|
| 597 |
-
summary: 'Found 4 TypeScript files in src/'
|
| 598 |
-
},
|
| 599 |
-
{
|
| 600 |
-
user: 'List all test files',
|
| 601 |
-
assistant: 'Looking for test files...',
|
| 602 |
-
toolName: 'GlobTool',
|
| 603 |
-
toolInput: { pattern: '**/*.test.{ts,tsx}' },
|
| 604 |
-
resultContent: '__tests__/sample.test.ts\nsrc/components/__tests__/Button.test.tsx\nsrc/utils/__tests__/helpers.test.ts',
|
| 605 |
-
summary: 'Found 3 test files'
|
| 606 |
-
},
|
| 607 |
-
{
|
| 608 |
-
user: 'Find all JSON config files',
|
| 609 |
-
assistant: 'Searching for JSON config files...',
|
| 610 |
-
toolName: 'GlobTool',
|
| 611 |
-
toolInput: { pattern: '**/*.json' },
|
| 612 |
-
resultContent: 'package.json\ntsconfig.json\njest.config.json\nsrc/config.json',
|
| 613 |
-
summary: 'Found 4 JSON files'
|
| 614 |
-
}
|
| 615 |
-
]
|
| 616 |
-
},
|
| 617 |
-
{
|
| 618 |
-
tool: 'GrepTool',
|
| 619 |
-
scenarios: [
|
| 620 |
-
{
|
| 621 |
-
user: 'Find all uses of "TODO" in the code',
|
| 622 |
-
assistant: 'Searching for TODO comments...',
|
| 623 |
-
toolName: 'GrepTool',
|
| 624 |
-
toolInput: { pattern: 'TODO', path: '.' },
|
| 625 |
-
resultContent: 'src/utils.ts: // TODO: implement error handling\nsrc/components/Form.tsx: // TODO: add validation\nsrc/api/client.ts: // TODO: add retry logic',
|
| 626 |
-
summary: 'Found 3 TODO comments in:\n- src/utils.ts\n- src/components/Form.tsx\n- src/api/client.ts'
|
| 627 |
-
},
|
| 628 |
-
{
|
| 629 |
-
user: 'Search for "async function" in the src',
|
| 630 |
-
assistant: 'Finding async functions...',
|
| 631 |
-
toolName: 'GrepTool',
|
| 632 |
-
toolInput: { pattern: 'async function', path: 'src' },
|
| 633 |
-
resultContent: 'src/api/client.ts: async function fetchData() {\nsrc/utils/parser.ts: async function parseJSON() {',
|
| 634 |
-
summary: 'Found 2 async functions'
|
| 635 |
-
}
|
| 636 |
-
]
|
| 637 |
-
},
|
| 638 |
-
{
|
| 639 |
-
tool: 'BashTool',
|
| 640 |
-
scenarios: [
|
| 641 |
-
{
|
| 642 |
-
user: 'Run npm install',
|
| 643 |
-
assistant: 'Installing dependencies...',
|
| 644 |
-
toolName: 'BashTool',
|
| 645 |
-
toolInput: { command: 'npm install' },
|
| 646 |
-
resultContent: 'added 245 packages in 12s\n\n12 packages are looking for funding\n run `npm fund` for details',
|
| 647 |
-
summary: 'Successfully installed 245 packages'
|
| 648 |
-
},
|
| 649 |
-
{
|
| 650 |
-
user: 'Check git status',
|
| 651 |
-
assistant: 'Checking git status...',
|
| 652 |
-
toolName: 'BashTool',
|
| 653 |
-
toolInput: { command: 'git status' },
|
| 654 |
-
resultContent: 'On branch main\nChanges not staged for commit:\n modified: src/index.ts\n modified: package.json',
|
| 655 |
-
summary: 'On branch main with 2 modified files'
|
| 656 |
-
},
|
| 657 |
-
{
|
| 658 |
-
user: 'Run the tests',
|
| 659 |
-
assistant: 'Running tests...',
|
| 660 |
-
toolName: 'BashTool',
|
| 661 |
-
toolInput: { command: 'npm test' },
|
| 662 |
-
resultContent: 'PASS __tests__/sample.test.ts\nPASS src/components/__tests__/Button.test.tsx\n\nTest Suites: 2 passed, 2 total\nTests: 5 passed, 5 total',
|
| 663 |
-
summary: 'All tests passed! 2 test suites, 5 tests total'
|
| 664 |
-
}
|
| 665 |
-
]
|
| 666 |
-
},
|
| 667 |
-
{
|
| 668 |
-
tool: 'FileWriteTool',
|
| 669 |
-
scenarios: [
|
| 670 |
-
{
|
| 671 |
-
user: 'Create a new config file',
|
| 672 |
-
assistant: 'Creating the config file...',
|
| 673 |
-
toolName: 'FileWriteTool',
|
| 674 |
-
toolInput: { file_path: 'config/local.json', content: '{\n "name": "local-config",\n "debug": true\n}' },
|
| 675 |
-
resultContent: '',
|
| 676 |
-
summary: 'Created config/local.json'
|
| 677 |
-
},
|
| 678 |
-
{
|
| 679 |
-
user: 'Write a simple test file',
|
| 680 |
-
assistant: 'Writing the test file...',
|
| 681 |
-
toolName: 'FileWriteTool',
|
| 682 |
-
toolInput: { file_path: 'test/example.test.ts', content: 'describe(\'Example\', () => {\n it(\'should pass\', () => {\n expect(1 + 1).toBe(2);\n });\n});' },
|
| 683 |
-
resultContent: '',
|
| 684 |
-
summary: 'Created test/example.test.ts'
|
| 685 |
-
}
|
| 686 |
-
]
|
| 687 |
-
},
|
| 688 |
-
{
|
| 689 |
-
tool: 'FileEditTool',
|
| 690 |
-
scenarios: [
|
| 691 |
-
{
|
| 692 |
-
user: 'Add a new export to the index file',
|
| 693 |
-
assistant: 'Editing the index file...',
|
| 694 |
-
toolName: 'FileEditTool',
|
| 695 |
-
toolInput: {
|
| 696 |
-
file_path: 'src/index.ts',
|
| 697 |
-
old_string: 'export { Config } from \'./config\';',
|
| 698 |
-
new_string: 'export { Config } from \'./config\';\nexport { Logger } from \'./logger\';'
|
| 699 |
-
},
|
| 700 |
-
resultContent: '',
|
| 701 |
-
summary: 'Added Logger export to src/index.ts'
|
| 702 |
-
}
|
| 703 |
-
]
|
| 704 |
-
},
|
| 705 |
-
{
|
| 706 |
-
tool: 'WebSearchTool',
|
| 707 |
-
scenarios: [
|
| 708 |
-
{
|
| 709 |
-
user: 'Search for the latest React documentation',
|
| 710 |
-
assistant: 'Searching the web...',
|
| 711 |
-
toolName: 'WebSearchTool',
|
| 712 |
-
toolInput: { query: 'React 18 documentation' },
|
| 713 |
-
resultContent: 'Found results for React documentation...',
|
| 714 |
-
summary: 'Found relevant documentation resources'
|
| 715 |
-
}
|
| 716 |
-
]
|
| 717 |
-
},
|
| 718 |
-
{
|
| 719 |
-
tool: 'WebFetchTool',
|
| 720 |
-
scenarios: [
|
| 721 |
-
{
|
| 722 |
-
user: 'Fetch the content of a GitHub README',
|
| 723 |
-
assistant: 'Fetching the README...',
|
| 724 |
-
toolName: 'WebFetchTool',
|
| 725 |
-
toolInput: { url: 'https://github.com/facebook/react' },
|
| 726 |
-
resultContent: 'README content...',
|
| 727 |
-
summary: 'Successfully fetched the README'
|
| 728 |
-
}
|
| 729 |
-
]
|
| 730 |
-
}
|
| 731 |
-
];
|
| 732 |
-
|
| 733 |
-
// Generate tool use examples
|
| 734 |
-
for (const toolPattern of toolUsePatterns) {
|
| 735 |
-
for (const scenario of toolPattern.scenarios) {
|
| 736 |
-
examples.push({
|
| 737 |
-
messages: [
|
| 738 |
-
{ role: 'system', content: systemPrompts[0] },
|
| 739 |
-
{ role: 'user', content: scenario.user },
|
| 740 |
-
{ role: 'assistant', content: scenario.assistant, tool_use: { name: scenario.toolName, input: scenario.toolInput } },
|
| 741 |
-
{ role: 'user', content: '', tool_result: { tool_use_id: 'tool_1', content: scenario.resultContent } },
|
| 742 |
-
{ role: 'assistant', content: scenario.summary }
|
| 743 |
-
]
|
| 744 |
-
});
|
| 745 |
-
}
|
| 746 |
-
}
|
| 747 |
-
|
| 748 |
-
// =========================================================================
|
| 749 |
-
// EXAMPLES: Error handling patterns
|
| 750 |
-
// =========================================================================
|
| 751 |
-
|
| 752 |
-
const errorHandlingExamples = [
|
| 753 |
-
{
|
| 754 |
-
user: 'Why did my file write fail?',
|
| 755 |
-
context: 'Error: ENOENT: no such file or directory, open \'missing/dir/file.txt\'',
|
| 756 |
-
response: 'The error indicates the directory doesn\'t exist. You need to create the parent directory first:\n\n```js\nimport fs from \'fs\'.promises;\nimport path from \'path\';\n\nasync function ensureDir(dirPath) {\n await fs.mkdir(path.dirname(dirPath), { recursive: true });\n}\n\n// Then write\nawait ensureDir(\'missing/dir/file.txt\');\nawait fs.writeFile(\'missing/dir/file.txt\', \'content\');\n```\n\nOr use `fs.mkdirSync(path.dirname(filePath), { recursive: true })` synchronously.'
|
| 757 |
-
},
|
| 758 |
-
{
|
| 759 |
-
user: 'My async function is throwing an error',
|
| 760 |
-
context: 'Error: Cannot read property \'map\' of undefined',
|
| 761 |
-
response: 'This usually means the data you\'re trying to map over isn\'t defined yet. Here\'s how to handle it:\n\n```js\nasync function getItems() {\n try {\n const data = await fetchData();\n // Check before using\n return data?.items?.map(item => item.id) || [];\n } catch (error) {\n console.error(\'Failed to fetch:\', error);\n return []; // Return fallback\n }\n}\n```\n\nUse optional chaining (`?.`) to safely access nested properties.'
|
| 762 |
-
}
|
| 763 |
-
];
|
| 764 |
-
|
| 765 |
-
for (const error of errorHandlingExamples) {
|
| 766 |
-
examples.push({
|
| 767 |
-
messages: [
|
| 768 |
-
{ role: 'system', content: systemPrompts[0] },
|
| 769 |
-
{ role: 'user', content: error.user + '\n\nError context: ' + error.context },
|
| 770 |
-
{ role: 'assistant', content: error.response }
|
| 771 |
-
]
|
| 772 |
-
});
|
| 773 |
-
}
|
| 774 |
-
|
| 775 |
-
// =========================================================================
|
| 776 |
-
// EXAMPLES: Multi-step workflows
|
| 777 |
-
// =========================================================================
|
| 778 |
-
|
| 779 |
-
const multiStepWorkflows = [
|
| 780 |
-
{
|
| 781 |
-
name: 'Create and test a new component',
|
| 782 |
-
steps: [
|
| 783 |
-
{ role: 'user', content: 'Create a new React component called UserCard' },
|
| 784 |
-
{ role: 'assistant', content: 'I\'ll create the UserCard component for you.', tool_use: { name: 'FileWriteTool', input: { file_path: 'src/components/UserCard.tsx', content: 'import React from \'react\';\n\ninterface UserCardProps {\n name: string;\n email: string;\n}\n\nexport const UserCard: React.FC<UserCardProps> = ({ name, email }) => {\n return (\n <div className="user-card">\n <h3>{name}</h3>\n <p>{email}</p>\n </div>\n );\n};' } } },
|
| 785 |
-
{ role: 'user', content: '', tool_result: { tool_use_id: 'tool_1', content: 'File created successfully' } },
|
| 786 |
-
{ role: 'assistant', content: 'Created the component. Now let me write a test for it.', tool_use: { name: 'FileWriteTool', input: { file_path: 'src/components/__tests__/UserCard.test.tsx', content: 'import React from \'react\';\nimport { render, screen } from \'@testing-library/react\';\nimport { UserCard } from \'../UserCard\';\n\ndescribe(\'UserCard\', () => {\n it(\'renders user name and email\', () => {\n render(<UserCard name="John" email="john@example.com" />);\n expect(screen.getByText(\'John\')).toBeInTheDocument();\n expect(screen.getByText(\'john@example.com\')).toBeInTheDocument();\n });\n});' } } },
|
| 787 |
-
{ role: 'user', content: '', tool_result: { tool_use_id: 'tool_2', content: 'Test file created' } },
|
| 788 |
-
{ role: 'assistant', content: 'Now let me run the tests to verify it works.', tool_use: { name: 'BashTool', input: { command: 'npm test -- --testPathPattern=UserCard' } } },
|
| 789 |
-
{ role: 'user', content: '', tool_result: { tool_use_id: 'tool_3', content: 'PASS src/components/__tests__/UserCard.test.tsx\n\n UserCard\n ✓ renders user name and email' } },
|
| 790 |
-
{ role: 'assistant', content: 'All tests pass! The UserCard component is ready to use.\n\n```tsx\nimport { UserCard } from \'./components/UserCard\';\n\n<UserCard name="John" email="john@example.com" />\n```' }
|
| 791 |
-
]
|
| 792 |
-
},
|
| 793 |
-
{
|
| 794 |
-
name: 'Debug and fix a failing test',
|
| 795 |
-
steps: [
|
| 796 |
-
{ role: 'user', content: 'Run the tests - I think something is broken' },
|
| 797 |
-
{ role: 'assistant', content: 'Running tests...', tool_use: { name: 'BashTool', input: { command: 'npm test 2>&1' } } },
|
| 798 |
-
{ role: 'user', content: '', tool_result: { tool_use_id: 'tool_1', content: 'FAIL src/utils/__tests__/format.test.ts\n\n Format Date\n ✕ should format date as YYYY-MM-DD (5ms)\n\n expect(received).toBe(expected)\n\n Expected: "2024-01-15"\n Received: "15-01-2024"' } },
|
| 799 |
-
{ role: 'assistant', content: 'I see the test is failing. Let me look at the formatDate function to understand the issue.', tool_use: { name: 'GrepTool', input: { pattern: 'formatDate', path: 'src/utils' } } },
|
| 800 |
-
{ role: 'user', content: '', tool_result: { tool_use_id: 'tool_2', content: 'src/utils/format.ts:export function formatDate(date: Date): string {\n const d = new Date(date);\n return `${d.getDate()}-${d.getMonth() + 1}-${d.getFullYear()}`;\n}' } },
|
| 801 |
-
{ role: 'assistant', content: 'Found it! The function returns DD-MM-YYYY but the test expects YYYY-MM-DD. Let me fix it.', tool_use: { name: 'FileEditTool', input: { file_path: 'src/utils/format.ts', old_string: 'return `${d.getDate()}-${d.getMonth() + 1}-${d.getFullYear()}`;', new_string: 'return `${d.getFullYear()}-${String(d.getMonth() + 1).padStart(2, \'0\')}-${String(d.getDate()).padStart(2, \'0\')}`;' } } },
|
| 802 |
-
{ role: 'user', content: '', tool_result: { tool_use_id: 'tool_3', content: 'File updated' } },
|
| 803 |
-
{ role: 'assistant', content: 'Fixed the format. Let me run the tests again to confirm.', tool_use: { name: 'BashTool', input: { command: 'npm test -- --testPathPattern=format.test' } } },
|
| 804 |
-
{ role: 'user', content: '', tool_result: { tool_use_id: 'tool_4', content: 'PASS src/utils/__tests__/format.test.ts\n\n Format Date\n ✓ should format date as YYYY-MM-DD' } },
|
| 805 |
-
{ role: 'assistant', content: 'All tests pass now! The date is correctly formatted as YYYY-MM-DD.' }
|
| 806 |
-
]
|
| 807 |
-
}
|
| 808 |
-
];
|
| 809 |
-
|
| 810 |
-
for (const workflow of multiStepWorkflows) {
|
| 811 |
-
examples.push({
|
| 812 |
-
messages: [
|
| 813 |
-
{ role: 'system', content: systemPrompts[2] },
|
| 814 |
-
...workflow.steps
|
| 815 |
-
]
|
| 816 |
-
});
|
| 817 |
-
}
|
| 818 |
-
|
| 819 |
-
// =========================================================================
|
| 820 |
-
// EXAMPLES: Edge cases
|
| 821 |
-
// =========================================================================
|
| 822 |
-
|
| 823 |
-
const edgeCases = [
|
| 824 |
-
{
|
| 825 |
-
user: 'Handle an empty array in my reduce function',
|
| 826 |
-
context: 'My reduce throws when the array is empty',
|
| 827 |
-
response: 'Here are ways to handle empty arrays with reduce:\n\n```js\n// 1. Provide initial value\nconst sum = [].reduce((acc, val) => acc + val, 0); // Returns 0\n\n// 2. Check first\nif (arr.length === 0) return defaultValue;\n\n// 3. Use reduce with optional initial value\nfunction safeReduce(arr, reducer, initial) {\n if (arr.length === 0 && initial === undefined) {\n throw new Error(\'Cannot reduce empty array without initial value\');\n }\n return arr.reduce(reducer, initial);\n}\n```'
|
| 828 |
-
},
|
| 829 |
-
{
|
| 830 |
-
user: 'Deal with optional function parameters',
|
| 831 |
-
context: 'How do I make some parameters optional?',
|
| 832 |
-
response: 'You can use optional parameters in several ways:\n\n```ts\n// TypeScript optional parameter\nfunction greet(name: string, greeting?: string) {\n return `${greeting || \'Hello\'}, ${name}!`;\n}\n\n// Default parameter\nfunction createUser(name: string, role = \'user\') {\n return { name, role };\n}\n\n// Nullish coalescing for defaults\nfunction process(value: string | null) {\n const input = value ?? \'default\';\n}\n```'
|
| 833 |
-
},
|
| 834 |
-
{
|
| 835 |
-
user: 'Handle API rate limiting',
|
| 836 |
-
context: 'Getting 429 errors from the API',
|
| 837 |
-
response: 'Here\'s a pattern for handling rate limits:\n\n```js\nasync function fetchWithRetry(url, retries = 3) {\n for (let i = 0; i < retries; i++) {\n const response = await fetch(url);\n \n if (response.status === 429) {\n const retryAfter = response.headers.get(\'Retry-After\') || 60;\n console.log(`Rate limited. Waiting ${retryAfter}s...`);\n await new Promise(r => setTimeout(r, retryAfter * 1000));\n continue;\n }\n \n return response.json();\n }\n throw new Error(\'Max retries exceeded\');\n}\n```\n\nUse exponential backoff for more aggressive retrying.'
|
| 838 |
-
}
|
| 839 |
-
];
|
| 840 |
-
|
| 841 |
-
for (const edge of edgeCases) {
|
| 842 |
-
examples.push({
|
| 843 |
-
messages: [
|
| 844 |
-
{ role: 'system', content: systemPrompts[1] },
|
| 845 |
-
{ role: 'user', content: edge.user + '\n\nContext: ' + edge.context },
|
| 846 |
-
{ role: 'assistant', content: edge.response }
|
| 847 |
-
]
|
| 848 |
-
});
|
| 849 |
-
}
|
| 850 |
-
|
| 851 |
-
// =========================================================================
|
| 852 |
-
// GENERATE 50+ EXAMPLES PER TOOL (tool-specific variations)
|
| 853 |
-
// =========================================================================
|
| 854 |
-
|
| 855 |
-
const toolNames = tools.map(t => t.tool);
|
| 856 |
-
const variationsPerTool = {
|
| 857 |
-
FileReadTool: [
|
| 858 |
-
'Read the first 100 lines of a large log file',
|
| 859 |
-
'Show me the contents of .env.example',
|
| 860 |
-
'What\'s in the tsconfig.json?',
|
| 861 |
-
'Read the package-lock.json to check versions',
|
| 862 |
-
'Show me the gitignore file'
|
| 863 |
-
],
|
| 864 |
-
FileWriteTool: [
|
| 865 |
-
'Create a .gitignore file with common ignores',
|
| 866 |
-
'Write a new entry to the changelog',
|
| 867 |
-
'Create a simple JSON config file',
|
| 868 |
-
'Write the test results to output.txt'
|
| 869 |
-
],
|
| 870 |
-
GlobTool: [
|
| 871 |
-
'Find all .test.ts files',
|
| 872 |
-
'List all files in src/ directory',
|
| 873 |
-
'Find all files with "helper" in the name',
|
| 874 |
-
'Search for *.config.js files',
|
| 875 |
-
'Find all files in any __tests__ directory'
|
| 876 |
-
],
|
| 877 |
-
GrepTool: [
|
| 878 |
-
'Find all console.log statements',
|
| 879 |
-
'Search for "export default"',
|
| 880 |
-
'Find all imports from "react"',
|
| 881 |
-
'Search for password or secret patterns',
|
| 882 |
-
'Find all unused imports'
|
| 883 |
-
],
|
| 884 |
-
BashTool: [
|
| 885 |
-
'Initialize a new git repository',
|
| 886 |
-
'Show the last 10 commits',
|
| 887 |
-
'List all npm scripts available',
|
| 888 |
-
'Check the current directory',
|
| 889 |
-
'Show the difference between branches'
|
| 890 |
-
]
|
| 891 |
-
};
|
| 892 |
-
|
| 893 |
-
// Generate 50+ examples by varying prompts for each tool
|
| 894 |
-
let exampleCount = examples.length;
|
| 895 |
-
|
| 896 |
-
for (const tool of tools) {
|
| 897 |
-
const variations = variationsPerTool[tool.tool] || [];
|
| 898 |
-
|
| 899 |
-
for (let i = 0; i < 5; i++) {
|
| 900 |
-
const variation = variations[i % variations.length];
|
| 901 |
-
const idx = i % variations.length;
|
| 902 |
-
|
| 903 |
-
examples.push({
|
| 904 |
-
messages: [
|
| 905 |
-
{ role: 'system', content: systemPrompts[i % systemPrompts.length] },
|
| 906 |
-
{ role: 'user', content: `${variation || 'process'} (variant ${i + 1})` },
|
| 907 |
-
{ role: 'assistant', content: `I'll help you with that using ${tool.tool}. This is a variant example showing different ways to phrase the same intent.`, tool_use: { name: tool.tool, input: generateMockInput(tool.tool, i) } },
|
| 908 |
-
{ role: 'user', content: '', tool_result: { tool_use_id: `tool_${i}`, content: getMockResult(tool.tool, i) } },
|
| 909 |
-
{ role: 'assistant', content: `Done! Here's the result for variant ${i + 1} of ${(variation || 'task').toLowerCase()}.` }
|
| 910 |
-
]
|
| 911 |
-
});
|
| 912 |
-
}
|
| 913 |
-
}
|
| 914 |
-
|
| 915 |
-
// Write examples to JSONL
|
| 916 |
-
const outputPath = path.join(SYNTHETIC_DIR, 'examples.jsonl');
|
| 917 |
-
const stream = fs.createWriteStream(outputPath);
|
| 918 |
-
for (const ex of examples) {
|
| 919 |
-
stream.write(JSON.stringify(ex) + '\n');
|
| 920 |
-
}
|
| 921 |
-
stream.end();
|
| 922 |
-
|
| 923 |
-
console.log(`✅ Generated ${examples.length} synthetic examples`);
|
| 924 |
-
return examples;
|
| 925 |
-
}
|
| 926 |
-
|
| 927 |
-
function generateMockInput(toolName, variant) {
|
| 928 |
-
const inputs = {
|
| 929 |
-
FileReadTool: [{ file_path: `example-${variant}.txt` }, { file_path: 'src/index.ts' }, { file_path: 'config.json' }],
|
| 930 |
-
GlobTool: [{ pattern: `**/*.${variant === 0 ? 'ts' : 'js'}` }, { pattern: 'src/**/*.tsx' }],
|
| 931 |
-
GrepTool: [{ pattern: 'TODO', path: 'src' }],
|
| 932 |
-
BashTool: [{ command: 'ls -la' }, { command: 'git status' }],
|
| 933 |
-
FileWriteTool: [{ file_path: 'output.txt', content: 'test' }]
|
| 934 |
-
};
|
| 935 |
-
return inputs[toolName]?.[variant % (inputs[toolName]?.length || 1)] || { query: `variant-${variant}` };
|
| 936 |
-
}
|
| 937 |
-
|
| 938 |
-
function getMockResult(toolName, variant) {
|
| 939 |
-
const results = {
|
| 940 |
-
FileReadTool: 'File contents here...',
|
| 941 |
-
GlobTool: `file1.${variant === 0 ? 'ts' : 'js'}\nfile2.${variant === 0 ? 'ts' : 'js'}`,
|
| 942 |
-
GrepTool: 'Found 3 matches',
|
| 943 |
-
BashTool: 'Command output here',
|
| 944 |
-
FileWriteTool: ''
|
| 945 |
-
};
|
| 946 |
-
return results[toolName] || 'Done';
|
| 947 |
-
}
|
| 948 |
-
|
| 949 |
-
// ============================================================================
|
| 950 |
-
// 6. CREATE TRAINING MANIFEST
|
| 951 |
-
// ============================================================================
|
| 952 |
-
|
| 953 |
-
/**
 * Assemble the dataset manifest (dataset metadata, counts, model/training
 * configuration, file locations) and write it to <OUTPUT_DIR>/manifest.json.
 *
 * @param {Array} tools - Extracted tool schemas; only .length is read here.
 * @param {object} stats - Counts gathered by the pipeline:
 *   syntheticExamples, codeCommentPairs, testExamples, conversations.
 * @returns {object} The manifest object that was written.
 */
function createManifest(tools, stats) {
  const manifest = {
    dataset: {
      name: 'Stack 2.9 Training Data',
      version: '0.2.0',
      description: 'Training data for Stack 2.9, an open-source coding assistant based on Qwen2.5-Coder',
      source: 'OpenClaw architecture + synthetic examples + code analysis',
      license: 'Apache 2.0'
    },
    stats: {
      toolSchemas: tools.length,
      syntheticExamples: stats.syntheticExamples,
      codeCommentPairs: stats.codeCommentPairs,
      testExamples: stats.testExamples,
      conversations: stats.conversations,
      // NOTE(review): totalExamples mirrors syntheticExamples only — the
      // other counts are not summed in. Confirm whether that is intended.
      totalExamples: stats.syntheticExamples
    },
    model_config: {
      base_model: 'Qwen2.5-Coder-32B',
      fine_tuning_method: 'LoRA',
      lora_rank: 64,
      lora_alpha: 128,
      target_modules: [
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj'
      ],
      quantization: 'AWQ 4-bit (inference)',
      max_seq_length: 131072,
      template: 'chatml'
    },
    tokenizer: {
      family: 'Qwen2',
      pad_token: '<|endoftext|>',
      bos_token: '<|endoftext|>',
      eos_token: '<|endoftext|>'
    },
    // Paths below interpolate module-level directory constants defined
    // earlier in this file (SYNTHETIC_DIR, TOOLS_SCHEMA_DIR, ...).
    training_data: {
      synthetic_examples: `${SYNTHETIC_DIR}/examples.jsonl`,
      tools_catalog: `${TOOLS_SCHEMA_DIR}/catalog.json`,
      code_pairs: `${CODE_PAIRS_DIR}/pairs.json`,
      test_examples: `${CODE_PAIRS_DIR}/test-examples.json`,
      conversations: `${CONVERSATIONS_DIR}/parsed.json`,
      estimated_tokens: '~50M tokens total',
      recommended_dataset_size: '100K - 1M examples'
    },
    deployment: {
      inference_engine: 'vLLM',
      api_compatibility: 'OpenAI-compatible (chat/completions)',
      expected_throughput: '~50 tokens/s on A100 80GB',
      platforms: ['Hugging Face', 'OpenRouter', 'self-hosted']
    }
  };

  fs.writeFileSync(
    path.join(OUTPUT_DIR, 'manifest.json'),
    JSON.stringify(manifest, null, 2)
  );

  console.log('✅ Created training manifest');
  return manifest;
}
|
| 1014 |
-
|
| 1015 |
-
// ============================================================================
|
| 1016 |
-
// 7. CREATE TRAINING CONFIG
|
| 1017 |
-
// ============================================================================
|
| 1018 |
-
|
| 1019 |
-
/**
 * Write a ready-to-edit LoRA fine-tuning configuration template to
 * <OUTPUT_DIR>/training-config.json and return it.
 *
 * The values here are a starting point for fine-tuning
 * Qwen2.5-Coder-32B; hub_model_id is a placeholder the user must edit.
 *
 * @returns {object} The config object that was written.
 */
function createTrainingConfig() {
  const config = {
    model_name: 'Qwen/Qwen2.5-Coder-32B',
    dataset_path: './training-data/synthetic/examples.jsonl',
    max_seq_length: 131072,
    load_in_4bit: true,
    bf16: true,
    // Effective batch size = batch_size * gradient_accumulation_steps = 16.
    batch_size: 1,
    gradient_accumulation_steps: 16,
    learning_rate: 1e-4,
    num_train_epochs: 3,
    warmup_steps: 100,
    save_steps: 1000,
    eval_steps: 500,
    logging_steps: 10,
    output_dir: './stack-2.9-lora',
    push_to_hub: false,
    hub_model_id: 'your-username/stack-2.9',
    lora_config: {
      r: 64,
      lora_alpha: 128,
      target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
      lora_dropout: 0.05,
      bias: 'none'
    }
  };

  fs.writeFileSync(
    path.join(OUTPUT_DIR, 'training-config.json'),
    JSON.stringify(config, null, 2)
  );

  console.log('✅ Created training config template');
  return config;
}
|
| 1054 |
-
|
| 1055 |
-
// ============================================================================
|
| 1056 |
-
// MAIN
|
| 1057 |
-
// ============================================================================
|
| 1058 |
-
|
| 1059 |
-
// ---------------------------------------------------------------------------
// Script entry point: run the full extraction pipeline, then report results.
// SRC_DIR / OUTPUT_DIR and the extract* / parse* functions are defined
// earlier in this file.
// ---------------------------------------------------------------------------
console.log('🔧 Stack 2.9 - Enhanced Training Data Extractor\n');
console.log(`📂 Source: ${SRC_DIR}`);
console.log(`📁 Output: ${OUTPUT_DIR}\n`);

// Run extraction pipeline — order matters: generateSyntheticExamples
// consumes the tool schemas, and createManifest consumes all the counts.
const tools = extractToolSchemas();
const codePairs = extractCodeCommentPairs();
const testExamples = extractTestExamples();
const conversations = parseConversations();
const syntheticExamples = generateSyntheticExamples(tools);
createManifest(tools, {
  syntheticExamples: syntheticExamples.length,
  codeCommentPairs: codePairs.length,
  testExamples: testExamples.length,
  conversations: conversations.length
});
createTrainingConfig();

// Human-readable summary of what was produced and where.
console.log('\n✨ Extraction complete!');
console.log('\n📋 Summary:');
console.log(` - Tool schemas: ${tools.length} tools`);
console.log(` - Synthetic examples: ${syntheticExamples.length}`);
console.log(` - Code-comment pairs: ${codePairs.length}`);
console.log(` - Test examples: ${testExamples.length}`);
console.log(` - Conversations: ${conversations.length}`);
console.log('\n📁 Output files:');
console.log(` - ${TOOLS_SCHEMA_DIR}/catalog.json`);
console.log(` - ${SYNTHETIC_DIR}/examples.jsonl`);
console.log(` - ${CODE_PAIRS_DIR}/pairs.json`);
console.log(` - ${CODE_PAIRS_DIR}/test-examples.json`);
console.log(` - ${CONVERSATIONS_DIR}/parsed.json`);
console.log(` - ${OUTPUT_DIR}/manifest.json`);
console.log(` - ${OUTPUT_DIR}/training-config.json`);
console.log('\n🚀 Next steps:');
console.log(' 1. Review extracted code-comment pairs for quality');
console.log(' 2. Add real conversation logs from ~/.claude/sessions');
console.log(' 3. Scale: aim for 50+ examples per tool');
console.log(' 4. Convert to Parquet for faster loading');
console.log(' 5. Launch LoRA fine-tuning on Qwen2.5-Coder-32B');
console.log(' 6. Deploy with vLLM and submit to OpenRouter');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/{fuse_lora_adapters.py → training/fuse_lora_adapters.py}
RENAMED
|
File without changes
|
scripts/{merge_lora_adapters.py → training/merge_lora_adapters.py}
RENAMED
|
File without changes
|
scripts/update_context_window.py
DELETED
|
@@ -1,190 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Update all configuration files to use 128K context window.
|
| 4 |
-
Updates: manifest, training config, prepare_dataset, vLLM server, deploy scripts, docs.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import json
|
| 8 |
-
import re
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
import argparse
|
| 11 |
-
|
| 12 |
-
def update_json_file(path: Path, updates: dict) -> bool:
    """Apply key -> value updates to a JSON file.

    Keys whose value already matches are left alone; other keys are set,
    *including keys not yet present in the file*.  The file is rewritten
    only when something actually changed.

    Args:
        path: JSON file to update.
        updates: Mapping of top-level keys to their desired values.

    Returns:
        True if the file was modified, False otherwise (including when
        the file does not exist).
    """
    if not path.exists():
        print(f" ⚠️ Not found: {path}")
        return False

    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    changed = False
    for key, value in updates.items():
        # Fix: the original `key in data and data[key] != value` only ever
        # overwrote existing keys, so a missing key (e.g. "context_length")
        # was silently never written. `data.get(key) != value` also adds it.
        if data.get(key) != value:
            data[key] = value
            changed = True

    if changed:
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        print(f" ✅ Updated {path.name}")
    else:
        print(f" ℹ️ {path.name} already up-to-date")
    return changed
|
| 34 |
-
|
| 35 |
-
def update_python_file(path: Path, old_pattern: str, new_value: str):
    """Replace a literal constant occurrence in a Python source file.

    Args:
        path: Source file to edit.
        old_pattern: Exact substring to look for.
        new_value: Replacement text for every occurrence.

    Returns:
        True when the pattern was found and the file rewritten; False
        when the file is missing or the pattern does not occur.
    """
    if not path.exists():
        print(f" ⚠️ Not found: {path}")
        return False

    source = path.read_text()
    if old_pattern not in source:
        print(f" ℹ️ {path.name} - pattern not found, may be already updated")
        return False

    path.write_text(source.replace(old_pattern, new_value))
    print(f" ✅ Updated {path.name}")
    return True
|
| 50 |
-
|
| 51 |
-
def update_shell_script(path: Path, old_var: str, new_value: str) -> bool:
    """Rewrite every ``old_var=...`` assignment in a shell script.

    Args:
        path: Shell script to edit.
        old_var: Variable name to look for (matched literally).
        new_value: New right-hand side for the assignment.

    Returns:
        True when the variable occurred and the file was rewritten;
        False when the file is missing or the variable is absent.
    """
    if not path.exists():
        print(f" ⚠️ Not found: {path}")
        return False

    content = path.read_text()
    if old_var not in content:
        print(f" ℹ️ {path.name} - variable not found")
        return False

    # Fix: escape old_var so regex metacharacters in a variable name are
    # matched literally, and use a callable replacement so backslashes or
    # "\1" sequences in new_value are not interpreted as group references.
    new_content = re.sub(
        rf'{re.escape(old_var)}=.+',
        lambda _m: f'{old_var}={new_value}',
        content
    )
    path.write_text(new_content)
    print(f" ✅ Updated {path.name}")
    return True
|
| 70 |
-
|
| 71 |
-
def update_markdown_file(path: Path, old_text: str, new_text: str):
    """Swap one literal snippet for another in a markdown document.

    Args:
        path: Markdown file to edit.
        old_text: Exact text to search for.
        new_text: Replacement for every occurrence.

    Returns:
        True when the snippet was found and the file rewritten; False
        when the file is missing or the snippet does not occur.
    """
    if not path.exists():
        print(f" ⚠️ Not found: {path}")
        return False

    doc = path.read_text()
    if old_text not in doc:
        print(f" ℹ️ {path.name} - pattern not found")
        return False

    path.write_text(doc.replace(old_text, new_text))
    print(f" ✅ Updated {path.name}")
    return True
|
| 86 |
-
|
| 87 |
-
def main():
    """Rewrite every context-window setting in the workspace to 128K.

    Walks a fixed set of known files under ``--workspace`` (manifest,
    training config, prepare/serve/deploy scripts, README) and updates
    each via the helpers above, then writes a CONTEXT_CONFIG.md note.
    Missing files are skipped with a warning rather than failing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--workspace", type=str, default=".")
    args = parser.parse_args()

    root = Path(args.workspace)

    print("🚀 Updating context window to 128K (131072 tokens)")

    # 1. Training manifest
    manifest_path = root / "training-data/manifest.json"
    update_json_file(manifest_path, {
        "max_seq_length": 131072,
        "context_length": 131072
    })

    # 2. Training config
    training_config_path = root / "training-data/training-config.json"
    update_json_file(training_config_path, {
        "max_seq_length": 131072
    })

    # 3. Python scripts
    prepare_script = root / "stack-2.9-training/prepare_dataset.py"
    if prepare_script.exists():
        content = prepare_script.read_text()
        if "max_length=32768" in content:
            new_content = content.replace("max_length=32768", "max_length=131072")
            prepare_script.write_text(new_content)
            print(f" ✅ Updated prepare_dataset.py (max_length)")
        else:
            print(f" ℹ️ prepare_dataset.py - already 128K or pattern not found")

    # 4. vLLM server
    vllm_script = root / "stack-2.9-deploy/vllm_server.py"
    if vllm_script.exists():
        content = vllm_script.read_text()
        if "max_model_len" in content:
            # Update max_model_len parameter (CLI-flag style occurrence).
            new_content = re.sub(
                r'--max-model-len\s+\d+',
                '--max-model-len 131072',
                content
            )
            vllm_script.write_text(new_content)
            print(f" ✅ Updated vllm_server.py (--max-model-len)")
        else:
            print(f" ℹ️ vllm_server.py - max_model_len not found directly, check manually")

    # 5. Local deploy script
    deploy_script = root / "stack-2.9-deploy/local_deploy.sh"
    if deploy_script.exists():
        content = deploy_script.read_text()
        # Update any context-related env var (both spelling variants).
        new_content = content.replace("MAX_MODEL_LEN=32768", "MAX_MODEL_LEN=131072") \
                             .replace("max_model_len=32768", "max_model_len=131072")
        if new_content != content:
            deploy_script.write_text(new_content)
            print(f" ✅ Updated local_deploy.sh")
        else:
            print(f" ℹ️ local_deploy.sh - no changes needed")

    # 6. README.md performance table
    readme_path = root / "README.md"
    if readme_path.exists():
        content = readme_path.read_text()
        # Update context length from 32K to 128K
        new_content = content.replace("32,768 tokens", "131,072 tokens (128K)") \
                             .replace("32K tokens", "128K tokens")
        if new_content != content:
            readme_path.write_text(new_content)
            print(f" ✅ Updated README.md (context length)")
        else:
            print(f" ℹ️ README.md - context length already correct")

    # 7. Create configuration note
    config_note = """# Context Window Configuration

Stack 2.9 uses full 128K context window (131072 tokens) to provide complete repository awareness.

## Settings
- max_model_len: 131072
- max_seq_length: 131072
- block_size: 16 or 32 (adjust for memory/performance tradeoff)

## Memory Requirements
| Context | A100 80GB (4-bit) | H100 80GB (4-bit) |
|---------|-------------------|-------------------|
| 32K | ~20GB | ~18GB |
| 64K | ~35GB | ~32GB |
| 128K | ~60GB | ~55GB |

Throughput decreases slightly at longer contexts (~30% slower at 128K vs 32K) but provides full repository context.

"""
    note_path = root / "stack-2.9-docs/CONTEXT_CONFIG.md"
    # Fix: write_text raises FileNotFoundError when stack-2.9-docs/ does
    # not exist yet (e.g. a fresh workspace) — create it first.
    note_path.parent.mkdir(parents=True, exist_ok=True)
    note_path.write_text(config_note)
    print(f" ✅ Created CONTEXT_CONFIG.md")

    print("\n✅ Context window update complete!")
    print(" All configs now set to 128K (131072 tokens)")

if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|