walidsobhie-code committed on
Commit
c7f1596
·
1 Parent(s): 5ddf5f9

chore: Rename MCP server to Stack2.9

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. audit_async.py → audits/audit_async.py +0 -0
  2. audit_tools.py → audits/audit_tools.py +0 -0
  3. audit_tools_async.py → audits/audit_tools_async.py +0 -0
  4. cli/run_mcp_server.py +3 -2
  5. load_local.py → loaders/load_local.py +0 -0
  6. load_model_fix.py → loaders/load_model_fix.py +0 -0
  7. load_pure.py → loaders/load_pure.py +0 -0
  8. load_pytorch.py → loaders/load_pytorch.py +0 -0
  9. load_silent.py → loaders/load_silent.py +0 -0
  10. run_auto.py → runners/run_auto.py +0 -0
  11. run_cache.py → runners/run_cache.py +0 -0
  12. run_final.py → runners/run_final.py +0 -0
  13. run_full.py → runners/run_full.py +0 -0
  14. run_local.py → runners/run_local.py +0 -0
  15. run_quiet.py → runners/run_quiet.py +0 -0
  16. run_qwen.py → runners/run_qwen.py +0 -0
  17. run_simple.py → runners/run_simple.py +0 -0
  18. run_v2.py → runners/run_v2.py +0 -0
  19. scripts/augment_data.py +0 -124
  20. scripts/augment_training_data.py +0 -324
  21. scripts/combine_all.py +0 -135
  22. scripts/combine_datasets.py +0 -144
  23. scripts/convert_gguf.py +0 -141
  24. scripts/convert_to_gguf.py +0 -210
  25. scripts/create_mini_dataset.py +0 -180
  26. scripts/download_benchmark_datasets.py +0 -127
  27. scripts/download_public_datasets.py +0 -170
  28. scripts/{compare_models.py → eval/compare_models.py} +0 -0
  29. scripts/{humaneval_eval.py → eval/humaneval_eval.py} +0 -0
  30. scripts/{mbpp_eval.py → eval/mbpp_eval.py} +0 -0
  31. scripts/{model_info.py → eval/model_info.py} +0 -0
  32. scripts/{tool_use_evaluator.py → eval/tool_use_evaluator.py} +0 -0
  33. scripts/extract_code_pairs.py +0 -215
  34. scripts/extract_patterns_from_git.py +0 -309
  35. scripts/extract_rtmp_tools.py +0 -174
  36. scripts/extract_rtmp_tools.ts +0 -115
  37. scripts/extract_rtmp_tools_advanced.py +0 -199
  38. scripts/generate_code_completion_data.py +0 -262
  39. scripts/generate_from_rtmp.ts +0 -114
  40. scripts/generate_random_synthetic.py +0 -141
  41. scripts/generate_synthetic.py +0 -256
  42. scripts/generate_synthetic_v2.py +0 -316
  43. scripts/generate_tool_data.py +0 -615
  44. scripts/generate_tool_use_tests.py +0 -163
  45. scripts/mine_sessions.py +0 -233
  46. scripts/quality_validate.py +0 -158
  47. scripts/training-data-extractor.js +0 -1098
  48. scripts/{fuse_lora_adapters.py → training/fuse_lora_adapters.py} +0 -0
  49. scripts/{merge_lora_adapters.py → training/merge_lora_adapters.py} +0 -0
  50. scripts/update_context_window.py +0 -190
audit_async.py → audits/audit_async.py RENAMED
File without changes
audit_tools.py → audits/audit_tools.py RENAMED
File without changes
audit_tools_async.py → audits/audit_tools_async.py RENAMED
File without changes
cli/run_mcp_server.py CHANGED
@@ -4,8 +4,9 @@
4
  import sys
5
  import os
6
 
7
- # Ensure src/ is on the path
8
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
9
 
10
  from src.mcp_server import main
11
 
 
4
  import sys
5
  import os
6
 
7
+ # Ensure project root is on the path
8
+ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
9
+ sys.path.insert(0, project_root)
10
 
11
  from src.mcp_server import main
12
 
load_local.py → loaders/load_local.py RENAMED
File without changes
load_model_fix.py → loaders/load_model_fix.py RENAMED
File without changes
load_pure.py → loaders/load_pure.py RENAMED
File without changes
load_pytorch.py → loaders/load_pytorch.py RENAMED
File without changes
load_silent.py → loaders/load_silent.py RENAMED
File without changes
run_auto.py → runners/run_auto.py RENAMED
File without changes
run_cache.py → runners/run_cache.py RENAMED
File without changes
run_final.py → runners/run_final.py RENAMED
File without changes
run_full.py → runners/run_full.py RENAMED
File without changes
run_local.py → runners/run_local.py RENAMED
File without changes
run_quiet.py → runners/run_quiet.py RENAMED
File without changes
run_qwen.py → runners/run_qwen.py RENAMED
File without changes
run_simple.py → runners/run_simple.py RENAMED
File without changes
run_v2.py → runners/run_v2.py RENAMED
File without changes
scripts/augment_data.py DELETED
@@ -1,124 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Data augmentation for training examples.
4
- Increases dataset size by paraphrasing and variations.
5
- """
6
-
7
- import json
8
- import random
9
- from pathlib import Path
10
- from typing import List, Dict, Any
11
- import argparse
12
-
13
- # Paraphrase templates (rule-based, no LLM)
14
- PARAPHRASES = {
15
- "Read the file": ["Show me the contents of", "Open", "Display", "Fetch", "Get"],
16
- "Create a new file": ["Write a file", "Generate", "Make a new file", "Add file"],
17
- "Run": ["Execute", "Start", "Launch", "Invoke"],
18
- "Search for": ["Find", "Look for", "Locate", "Grep for"],
19
- "List all": ["Show all", "Display every", "Get list of"],
20
- "Can you": ["Please", "Would you", "Kindly"],
21
- "I need": ["I want", "I require", "Please provide"],
22
- }
23
-
24
- def paraphrase_text(text: str) -> str:
25
- """Apply simple paraphrasing to user prompt."""
26
- result = text
27
- for original, alternatives in PARAPHRASES.items():
28
- if original in result:
29
- replacement = random.choice(alternatives)
30
- result = result.replace(original, replacement, 1)
31
- return result
32
-
33
- def augment_example(example: Dict[str, Any], variation_factor: float = 0.3) -> List[Dict[str, Any]]:
34
- """Generate variations of a single example."""
35
- variations = [example] # Keep original
36
-
37
- # Paraphrase user message
38
- if random.random() < variation_factor:
39
- new_ex = json.loads(json.dumps(example)) # Deep copy
40
- original_user = new_ex["messages"][0]["content"]
41
- new_ex["messages"][0]["content"] = paraphrase_text(original_user)
42
- new_ex["source"] = "augmented_paraphrase"
43
- variations.append(new_ex)
44
-
45
- # Vary tool parameters (if any)
46
- if "tool_use" in example["messages"][1]:
47
- tool_input = example["messages"][1]["tool_use"]["input"]
48
- if isinstance(tool_input, dict) and tool_input:
49
- new_ex = json.loads(json.dumps(example))
50
- # Randomly change file paths, commands, etc.
51
- for key, val in new_ex["messages"][1]["tool_use"]["input"].items():
52
- if key == "file_path" and isinstance(val, str):
53
- # Change to a different plausible file
54
- new_ex["messages"][1]["tool_use"]["input"][key] = random.choice([
55
- "src/main.py", "README.md", "package.json", "config.yaml"
56
- ])
57
- # Also update result if it contains the old file path
58
- result_content = new_ex["messages"][2]["tool_result"]["content"]
59
- new_ex["messages"][2]["tool_result"]["content"] = result_content.replace(val, new_ex["messages"][1]["tool_use"]["input"][key])
60
- new_ex["source"] = "augmented_params"
61
- variations.append(new_ex)
62
-
63
- # Add filler words to user message
64
- if random.random() < variation_factor * 0.5:
65
- new_ex = json.loads(json.dumps(example))
66
- fillers = [" please", " if you can", " when you have time", " thanks"]
67
- user_msg = new_ex["messages"][0]["content"]
68
- filler = random.choice(fillers)
69
- new_ex["messages"][0]["content"] = user_msg + filler
70
- new_ex["source"] = "augmented_filler"
71
- variations.append(new_ex)
72
-
73
- return variations
74
-
75
- def main():
76
- parser = argparse.ArgumentParser()
77
- parser.add_argument("--input", type=str, default="training-data/scaled/template_synthetic.jsonl")
78
- parser.add_argument("--output", type=str, default="training-data/scaled/augmented.jsonl")
79
- parser.add_argument("--multiplier", type=int, default=3, help="How many times to multiply dataset")
80
- args = parser.parse_args()
81
-
82
- input_path = Path(args.input)
83
- output_path = Path(args.output)
84
-
85
- if not input_path.exists():
86
- print(f"❌ Input file not found: {input_path}")
87
- return
88
-
89
- print(f"📈 Augmenting dataset: {input_path}")
90
- examples = []
91
- with open(input_path, 'r') as f:
92
- for line in f:
93
- examples.append(json.loads(line))
94
-
95
- original_count = len(examples)
96
- target_count = original_count * args.multiplier
97
- print(f" Original: {original_count} examples")
98
- print(f" Target: ~{target_count} examples (x{args.multiplier})")
99
-
100
- output_path.parent.mkdir(parents=True, exist_ok=True)
101
-
102
- generated = 0
103
- with open(output_path, 'w') as f:
104
- for ex in examples:
105
- # Write original and variations
106
- f.write(json.dumps(ex) + "\n")
107
- generated += 1
108
-
109
- # Generate variations until we reach multiplier
110
- variations = augment_example(ex)
111
- for var in variations[1:]: # Skip original (already written)
112
- if generated < target_count:
113
- f.write(json.dumps(var) + "\n")
114
- generated += 1
115
-
116
- if generated % 1000 == 0:
117
- print(f" Generated {generated}/{target_count}...", end='\r')
118
-
119
- print(f"\n✨ Augmented to {generated} examples")
120
- print(f" Saved to: {output_path}")
121
- print(f" Total dataset now: {original_count} → {generated} (x{generated/original_count:.1f})")
122
-
123
- if __name__ == "__main__":
124
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/augment_training_data.py DELETED
@@ -1,324 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Data augmentation script for tool_examples.jsonl.
4
- Generates 2x-5x more training examples from existing data through:
5
- - Paraphrasing user prompts
6
- - Difficulty scaling (simpler/complex variations)
7
- - Edge case generation
8
- """
9
-
10
- import json
11
- import random
12
- import argparse
13
- from pathlib import Path
14
- from typing import List, Dict, Any, Optional
15
- from itertools import product
16
- import copy
17
-
18
- # Random seed for reproducibility
19
- random.seed(42)
20
-
21
- # Paraphrase templates
22
- PARAPHRASES = {
23
- "Can you": ["Please", "Would you kindly", "Could you", "Kindly"],
24
- "I need": ["I'd like", "I require", "I want", "I must have"],
25
- "show me": ["display", "show", "reveal", "let me see"],
26
- "the file": ["this file", "that file", "a file"],
27
- "run": ["execute", "launch", "start", "run"],
28
- "create": ["make", "generate", "add", "write"],
29
- "delete": ["remove", "erase", "drop", "destroy"],
30
- "list": ["show", "display", "enumerate", "get"],
31
- "search": ["find", "look for", "grep", "locate"],
32
- "help me": ["assist me", "I need help", "please assist", "support"],
33
- }
34
-
35
- # Difficulty modifiers
36
- EASY_MODIFIERS = [
37
- "quickly",
38
- "simply",
39
- "just",
40
- "easily",
41
- ]
42
-
43
- COMPLEX_MODIFIERS = [
44
- "carefully",
45
- "thoroughly",
46
- "in detail",
47
- "completely",
48
- "with all options",
49
- ]
50
-
51
- # Edge case patterns
52
- EDGE_CASE_PATTERNS = [
53
- ("empty_input", lambda ex: _create_empty_variant(ex)),
54
- ("multi_step", lambda ex: _create_multistep_variant(ex)),
55
- ("error_handling", lambda ex: _create_error_variant(ex)),
56
- ]
57
-
58
-
59
- def _deep_copy(obj: Any) -> Any:
60
- """Create a deep copy of a JSON-serializable object."""
61
- return json.loads(json.dumps(obj))
62
-
63
-
64
- def _create_empty_variant(example: Dict[str, Any]) -> Optional[Dict[str, Any]]:
65
- """Create variant with empty/blank user input."""
66
- new_ex = _deep_copy(example)
67
- # Keep system message, empty user message
68
- for msg in new_ex["messages"]:
69
- if msg["role"] == "user":
70
- msg["content"] = " "
71
- break
72
- new_ex["source"] = "augmented_edge_empty"
73
- return new_ex
74
-
75
-
76
- def _create_multistep_variant(example: Dict[str, Any]) -> Optional[Dict[str, Any]]:
77
- """Create variant simulating multi-step reasoning."""
78
- new_ex = _deep_copy(example)
79
- # Add reasoning step before tool call
80
- for i, msg in enumerate(new_ex["messages"]):
81
- if msg.get("tool_calls"):
82
- reasoning = {
83
- "role": "assistant",
84
- "content": "Let me think about this step by step. First, I need to understand what the user is asking for."
85
- }
86
- new_ex["messages"].insert(i, reasoning)
87
- break
88
- new_ex["source"] = "augmented_edge_multistep"
89
- return new_ex
90
-
91
-
92
- def _create_error_variant(example: Dict[str, Any]) -> Optional[Dict[str, Any]]:
93
- """Create variant simulating error handling."""
94
- new_ex = _deep_copy(example)
95
- for msg in new_ex["messages"]:
96
- if msg.get("role") == "tool":
97
- # Simulate an error in tool result
98
- if "Successfully" in msg.get("content", ""):
99
- msg["content"] = msg["content"].replace("Successfully", "Error occurred:")
100
- elif "error" not in msg.get("content", "").lower():
101
- msg["content"] = "Operation failed: Permission denied"
102
- break
103
- new_ex["source"] = "augmented_edge_error"
104
- return new_ex
105
-
106
-
107
- def paraphrase_text(text: str) -> str:
108
- """Apply simple paraphrasing to text."""
109
- if not text:
110
- return text
111
- result = text
112
- for original, alternatives in PARAPHRASES.items():
113
- if original.lower() in result.lower():
114
- # Case-insensitive replace, preserve original case pattern
115
- idx = result.lower().find(original.lower())
116
- prefix = result[:idx]
117
- suffix = result[idx + len(original):]
118
- replacement = random.choice(alternatives)
119
- # Preserve case
120
- if result[idx].isupper():
121
- replacement = replacement.capitalize()
122
- result = prefix + replacement + suffix
123
- break
124
- return result
125
-
126
-
127
- def apply_difficulty(example: Dict[str, Any], level: str) -> Dict[str, Any]:
128
- """Apply difficulty scaling to an example."""
129
- new_ex = _deep_copy(example)
130
- modifiers = EASY_MODIFIERS if level == "easy" else COMPLEX_MODIFIERS
131
-
132
- for msg in new_ex["messages"]:
133
- if msg["role"] == "user" and msg.get("content"):
134
- content = msg["content"]
135
- if level == "easy":
136
- # Simplify the request
137
- content = content.replace("please", "").replace("kindly", "")
138
- content = content.strip()
139
- elif level == "complex":
140
- # Add complexity
141
- modifier = random.choice(modifiers)
142
- content = f"{content} {modifier}"
143
- msg["content"] = content
144
- break
145
-
146
- new_ex["source"] = f"augmented_difficulty_{level}"
147
- return new_ex
148
-
149
-
150
- def vary_tool_parameters(example: Dict[str, Any]) -> List[Dict[str, Any]]:
151
- """Generate variations with different tool parameters."""
152
- variations = []
153
-
154
- for msg in example.get("messages", []):
155
- if msg.get("tool_calls"):
156
- for tc in msg["tool_calls"]:
157
- func = tc.get("function", {})
158
- args_str = func.get("arguments", "{}")
159
- try:
160
- args = json.loads(args_str) if isinstance(args_str, str) else args_str
161
- except (json.JSONDecodeError, TypeError):
162
- continue
163
-
164
- if not isinstance(args, dict):
165
- continue
166
-
167
- # Common parameter variations
168
- param_variations = [
169
- ("file_path", ["src/main.py", "README.md", "config.yaml", "package.json", "tests/test.py"]),
170
- ("command", ["ls -la", "echo hello", "pwd", "whoami"]),
171
- ("pattern", ["*.py", "*.js", "*.md", "*.json"]),
172
- ("path", ["src", "lib", "docs", "."]),
173
- ]
174
-
175
- for param_name, alternatives in param_variations:
176
- if param_name in args:
177
- original_val = args[param_name]
178
- for alt_val in alternatives:
179
- if alt_val != original_val:
180
- new_ex = _deep_copy(example)
181
- for new_msg in new_ex["messages"]:
182
- if new_msg.get("tool_calls"):
183
- for new_tc in new_msg["tool_calls"]:
184
- new_func = new_tc.get("function", {})
185
- new_args = json.loads(new_func.get("arguments", "{}"))
186
- if param_name in new_args:
187
- new_args[param_name] = alt_val
188
- new_func["arguments"] = json.dumps(new_args)
189
- new_ex["source"] = "augmented_params"
190
- variations.append(new_ex)
191
- break
192
-
193
- return variations
194
-
195
-
196
- def add_filler_variant(example: Dict[str, Any]) -> Optional[Dict[str, Any]]:
197
- """Add polite filler words to user message."""
198
- fillers = [" please", " if you could", " when you get a chance", " thanks"]
199
-
200
- new_ex = _deep_copy(example)
201
- for msg in new_ex["messages"]:
202
- if msg["role"] == "user" and msg.get("content"):
203
- filler = random.choice(fillers)
204
- msg["content"] = msg["content"].rstrip() + filler
205
- break
206
-
207
- new_ex["source"] = "augmented_filler"
208
- return new_ex
209
-
210
-
211
- def generate_edge_cases(example: Dict[str, Any], num_cases: int = 2) -> List[Dict[str, Any]]:
212
- """Generate edge case variations."""
213
- cases = []
214
- selected_patterns = random.sample(EDGE_CASE_PATTERNS, min(num_cases, len(EDGE_CASE_PATTERNS)))
215
-
216
- for name, generator in selected_patterns:
217
- try:
218
- variant = generator(example)
219
- if variant:
220
- cases.append(variant)
221
- except Exception:
222
- continue
223
-
224
- return cases
225
-
226
-
227
- def augment_example(example: Dict[str, Any], target_multiplier: int = 3) -> List[Dict[str, Any]]:
228
- """Generate multiple augmented variations of a single example."""
229
- variations = [example] # Always keep original
230
-
231
- # 1. Paraphrase variant
232
- if random.random() < 0.7:
233
- new_ex = _deep_copy(example)
234
- for msg in new_ex["messages"]:
235
- if msg["role"] == "user" and msg.get("content"):
236
- msg["content"] = paraphrase_text(msg["content"])
237
- break
238
- new_ex["source"] = "augmented_paraphrase"
239
- variations.append(new_ex)
240
-
241
- # 2. Difficulty variants (easy and complex)
242
- if random.random() < 0.5:
243
- variations.append(apply_difficulty(example, "easy"))
244
- if random.random() < 0.5:
245
- variations.append(apply_difficulty(example, "complex"))
246
-
247
- # 3. Filler variant
248
- if random.random() < 0.3:
249
- filler_ex = add_filler_variant(example)
250
- if filler_ex:
251
- variations.append(filler_ex)
252
-
253
- # 4. Tool parameter variations
254
- param_variations = vary_tool_parameters(example)
255
- variations.extend(param_variations[:2]) # Limit to 2
256
-
257
- # 5. Edge cases
258
- if random.random() < 0.3:
259
- edge_cases = generate_edge_cases(example)
260
- variations.extend(edge_cases[:1])
261
-
262
- return variations[:target_multiplier] # Limit total variations
263
-
264
-
265
- def main():
266
- parser = argparse.ArgumentParser(description="Augment training data for Stack 2.9")
267
- parser.add_argument("--input", type=str,
268
- default="training-data/tool_examples.jsonl",
269
- help="Input JSONL file")
270
- parser.add_argument("--output", type=str,
271
- default="training-data/augmented_tool_examples.jsonl",
272
- help="Output JSONL file")
273
- parser.add_argument("--multiplier", type=int, default=3,
274
- help="Target multiplication factor (2-5)")
275
- parser.add_argument("--seed", type=int, default=42,
276
- help="Random seed for reproducibility")
277
-
278
- args = parser.parse_args()
279
- random.seed(args.seed)
280
-
281
- input_path = Path(args.input)
282
- output_path = Path(args.output)
283
-
284
- if not input_path.exists():
285
- print(f"Error: Input file not found: {input_path}")
286
- return
287
-
288
- print(f"Loading data from: {input_path}")
289
- examples = []
290
- with open(input_path, 'r', encoding='utf-8') as f:
291
- for line in f:
292
- line = line.strip()
293
- if line:
294
- try:
295
- examples.append(json.loads(line))
296
- except json.JSONDecodeError:
297
- continue
298
-
299
- original_count = len(examples)
300
- print(f"Loaded {original_count} examples")
301
-
302
- # Generate augmented examples
303
- all_variations = []
304
- for ex in examples:
305
- variations = augment_example(ex, target_multiplier=args.multiplier)
306
- all_variations.extend(variations)
307
-
308
- total_count = len(all_variations)
309
-
310
- # Write output
311
- output_path.parent.mkdir(parents=True, exist_ok=True)
312
- with open(output_path, 'w', encoding='utf-8') as f:
313
- for var in all_variations:
314
- f.write(json.dumps(var, ensure_ascii=False) + "\n")
315
-
316
- print(f"\nAugmentation complete!")
317
- print(f" Original: {original_count} examples")
318
- print(f" Augmented: {total_count} examples")
319
- print(f" Multiplier: {total_count/original_count:.1f}x")
320
- print(f" Output: {output_path}")
321
-
322
-
323
- if __name__ == "__main__":
324
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/combine_all.py DELETED
@@ -1,135 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Final dataset combiner - loads all sources, deduplicates, splits.
4
- """
5
-
6
- import json
7
- import hashlib
8
- import random
9
- from pathlib import Path
10
- from datetime import datetime
11
- import glob
12
-
13
- def hash_messages(messages: list) -> str:
14
- """Create hash for deduplication."""
15
- return hashlib.md5(json.dumps(messages, sort_keys=True).encode()).hexdigest()
16
-
17
- def main():
18
- output_dir = Path("training-data/final")
19
- output_dir.mkdir(parents=True, exist_ok=True)
20
-
21
- # Glob all potential source files
22
- source_files = []
23
-
24
- # Synthetic sources
25
- source_files.extend(glob.glob("training-data/synthetic/*.jsonl"))
26
- source_files.extend(glob.glob("training-data/advanced-patterns/*.jsonl"))
27
- source_files.extend(glob.glob("training-data/scaled/*.jsonl"))
28
-
29
- # Code pairs (JSON format)
30
- source_files.extend(glob.glob("training-data/code-pairs/*.json"))
31
-
32
- print(f"🔍 Found {len(source_files)} source files")
33
-
34
- all_examples = []
35
- seen_hashes = set()
36
- source_counts = {}
37
-
38
- for file_path in source_files:
39
- path = Path(file_path)
40
- source_name = path.stem
41
- count = 0
42
-
43
- try:
44
- with open(path, 'r') as f:
45
- for line in f:
46
- line = line.strip()
47
- if not line:
48
- continue
49
-
50
- try:
51
- ex = json.loads(line)
52
-
53
- # Convert code-pair format to message format
54
- if "code" in ex and "comment" in ex and "messages" not in ex:
55
- ex = {
56
- "messages": [
57
- {"role": "user", "content": f"Show me code for: {ex['comment'][:100]}"},
58
- {"role": "assistant", "content": f"Here's a {ex.get('type', 'function')}:\n{ex['code']}"}
59
- ],
60
- "source_original": source_name,
61
- "type": "code_pair"
62
- }
63
-
64
- # Deduplication
65
- if "messages" in ex:
66
- msg_hash = hash_messages(ex["messages"])
67
- if msg_hash in seen_hashes:
68
- continue
69
- seen_hashes.add(msg_hash)
70
-
71
- # Track source
72
- ex["source_original"] = ex.get("source_original", source_name)
73
- all_examples.append(ex)
74
- count += 1
75
- except json.JSONDecodeError:
76
- continue
77
-
78
- except Exception as e:
79
- print(f" ⚠️ Error reading {path}: {e}")
80
- continue
81
-
82
- source_counts[source_name] = count
83
- if count > 0:
84
- print(f" ✅ {source_name}: {count} examples")
85
-
86
- print(f"\n✨ Total unique examples: {len(all_examples)}")
87
- print(f" Deduplication removed: {sum(source_counts.values()) - len(all_examples)}")
88
-
89
- # Shuffle
90
- random.seed(42)
91
- random.shuffle(all_examples)
92
-
93
- # Splits (80/10/10)
94
- n = len(all_examples)
95
- n_train = int(n * 0.8)
96
- n_val = int(n * 0.1)
97
-
98
- splits = {
99
- "train": all_examples[:n_train],
100
- "val": all_examples[n_train:n_train+n_val],
101
- "test": all_examples[n_train+n_val:]
102
- }
103
-
104
- for split_name, data in splits.items():
105
- out_path = output_dir / f"{split_name}.jsonl"
106
- with open(out_path, 'w') as f:
107
- for ex in data:
108
- f.write(json.dumps(ex) + "\n")
109
- print(f" 📁 {split_name}: {len(data)} -> {out_path}")
110
-
111
- # Manifest
112
- manifest = {
113
- "dataset": "Stack 2.9 Training Data",
114
- "version": "1.0",
115
- "created": datetime.now().isoformat(),
116
- "total_examples": len(all_examples),
117
- "splits": {name: len(data) for name, data in splits.items()},
118
- "source_breakdown": source_counts,
119
- "note": "Combined from multiple synthetic and code-pair sources"
120
- }
121
-
122
- with open(output_dir / "manifest.json", 'w') as f:
123
- json.dump(manifest, f, indent=2)
124
-
125
- print(f"\n✅ Final dataset ready!")
126
- print(f" Total: {len(all_examples)} examples")
127
- print(f" Manifest: {output_dir / 'manifest.json'}")
128
-
129
- if len(all_examples) >= 50000:
130
- print("\n🎉 TARGET ACHIEVED: 50,000+ examples!")
131
- else:
132
- print(f"\n⚠️ Still need {50000 - len(all_examples)} more to reach 50K target")
133
-
134
- if __name__ == "__main__":
135
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/combine_datasets.py DELETED
@@ -1,144 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Combine all training data sources into final dataset.
4
- Applies deduplication and quality filtering.
5
- """
6
-
7
- import json
8
- import hashlib
9
- from pathlib import Path
10
- import argparse
11
- from datetime import datetime
12
-
13
- def hash_messages(messages: list) -> str:
14
- """Create a hash of messages to detect duplicates."""
15
- m = hashlib.md5()
16
- m.update(json.dumps(messages, sort_keys=True).encode())
17
- return m.hexdigest()
18
-
19
- def main():
20
- parser = argparse.ArgumentParser()
21
- parser.add_argument("--output", type=str, default="training-data/final/dataset.jsonl")
22
- parser.add_argument("--train-size", type=float, default=0.8)
23
- parser.add_argument("--val-size", type=float, default=0.1)
24
- parser.add_argument("--max-dataset", type=int, default=50000, help="Max examples to include")
25
- args = parser.parse_args()
26
-
27
- output_path = Path(args.output)
28
- output_path.parent.mkdir(parents=True, exist_ok=True)
29
-
30
- # List all source files
31
- sources = [
32
- ("training-data/synthetic/examples.jsonl", "original_synthetic"),
33
- ("training-data/advanced-patterns/examples.jsonl", "advanced_patterns"),
34
- ("training-data/code-pairs/pairs.json", "code_pairs"),
35
- ("training-data/code-pairs/extended_pairs.json", "code_pairs_extended"),
36
- ("training-data/scaled/synthetic_final.jsonl", "synthetic_augmented"),
37
- ("training-data/scaled/random_10k.jsonl", "random_10k"),
38
- ("training-data/scaled/random_5_5k.jsonl", "random_5k"),
39
- ]
40
-
41
- all_examples = []
42
- seen_hashes = set()
43
- duplicates_removed = 0
44
-
45
- print("📦 Combining datasets...")
46
- for file_path, source in sources:
47
- path = Path(file_path)
48
- if not path.exists():
49
- print(f" ⚠️ Not found: {path}")
50
- continue
51
-
52
- print(f" Loading {source}...")
53
- count = 0
54
- with open(path, 'r') as f:
55
- for line in f:
56
- try:
57
- ex = json.loads(line)
58
-
59
- # Convert code-pair format if needed
60
- if "code" in ex and "comment" in ex:
61
- # Convert code-pair to message format
62
- ex = {
63
- "messages": [
64
- {"role": "user", "content": ex["comment"]},
65
- {"role": "assistant", "content": f"Here's the code:\n{ex['code']}"}
66
- ],
67
- "source": source,
68
- "type": "code_pair"
69
- }
70
-
71
- # Deduplication
72
- msg_hash = hash_messages(ex["messages"])
73
- if msg_hash in seen_hashes:
74
- duplicates_removed += 1
75
- continue
76
- seen_hashes.add(msg_hash)
77
-
78
- # Add metadata
79
- ex["source_original"] = source
80
- all_examples.append(ex)
81
- count += 1
82
-
83
- if len(all_examples) >= args.max_dataset:
84
- break
85
-
86
- except json.JSONDecodeError:
87
- continue
88
-
89
- print(f" ✅ Added {count} examples")
90
-
91
- print(f"\n✨ Total collected: {len(all_examples)} examples")
92
- print(f" Duplicates removed: {duplicates_removed}")
93
-
94
- # Shuffle
95
- random.seed(42)
96
- random.shuffle(all_examples)
97
-
98
- # Split
99
- n_total = len(all_examples)
100
- n_train = int(n_total * args.train_size)
101
- n_val = int(n_total * args.val_size)
102
- n_test = n_total - n_train - n_val
103
-
104
- train_set = all_examples[:n_train]
105
- val_set = all_examples[n_train:n_train+n_val]
106
- test_set = all_examples[n_train+n_val:]
107
-
108
- print(f"\n📊 Split:")
109
- print(f" Train: {len(train_set)}")
110
- print(f" Val: {len(val_set)}")
111
- print(f" Test: {len(test_set)}")
112
-
113
- # Save splits
114
- for split_name, split_data in [("train", train_set), ("val", val_set), ("test", test_set)]:
115
- split_path = output_path.parent / f"{split_name}.jsonl"
116
- with open(split_path, 'w') as f:
117
- for ex in split_data:
118
- f.write(json.dumps(ex) + "\n")
119
- print(f" Saved {split_name} to {split_path}")
120
-
121
- # Create manifest
122
- manifest = {
123
- "dataset": "Stack 2.9 Training Data",
124
- "version": "1.0",
125
- "created": datetime.now().isoformat(),
126
- "total_examples": n_total,
127
- "splits": {
128
- "train": len(train_set),
129
- "val": len(val_set),
130
- "test": len(test_set)
131
- },
132
- "sources": {src: sum(1 for ex in all_examples if ex.get("source_original") == src) for src in set(ex.get("source_original") for ex in all_examples)}
133
- }
134
-
135
- manifest_path = output_path.parent / "manifest.json"
136
- with open(manifest_path, 'w') as f:
137
- json.dump(manifest, f, indent=2)
138
- print(f"\n📄 Manifest: {manifest_path}")
139
-
140
- print("\n✅ Dataset complete!")
141
-
142
- if __name__ == "__main__":
143
- import random
144
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/convert_gguf.py DELETED
@@ -1,141 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Convert GGUF file to HuggingFace format
4
- """
5
- import os
6
- import sys
7
-
8
- # Check for required packages
9
- try:
10
- import gguf
11
- except ImportError:
12
- print("Installing gguf...")
13
- os.system("pip install gguf -q")
14
- import gguf
15
-
16
- try:
17
- from transformers import AutoModel, AutoTokenizer
18
- except ImportError:
19
- print("Installing transformers...")
20
- os.system("pip install transformers -q")
21
- from transformers import AutoModel, AutoTokenizer
22
-
23
- import torch
24
-
25
- GGUF_PATH = "/Users/walidsobhi/.ollama/models/blobs/sha256-60e05f2100071479f596b964f89f510f057ce397ea22f2833a0cfe029bfc2463"
26
- OUTPUT_DIR = "/Users/walidsobhi/.openclaw/workspace/stack-2.9/base_model_qwen7b"
27
-
28
- print(f"Reading GGUF from: {GGUF_PATH}")
29
-
30
- # Read the GGUF file
31
- reader = gguf.GGUFReader(GGUF_PATH)
32
-
33
- # Get tensor info
34
- print("\n GGUF Tensors:")
35
- for i, tensor in enumerate(reader.tensors):
36
- print(f" {i}: {tensor.name} - shape {tensor.shape}, dtype {tensor.tensor_type}")
37
-
38
- # Extract to HF format
39
- print("\n Converting to HuggingFace format...")
40
-
41
- # Create output directory
42
- os.makedirs(OUTPUT_DIR, exist_ok=True)
43
-
44
- # Save model files
45
- model_path = os.path.join(OUTPUT_DIR, "model.safetensors")
46
-
47
- # Map GGUF types to PyTorch types
48
- def gguf_to_torch_type(gguf_type):
49
- type_map = {
50
- "F32": torch.float32,
51
- "F16": torch.float16,
52
- "BF16": torch.bfloat16,
53
- "I8": torch.int8,
54
- "I16": torch.int16,
55
- "I32": torch.int32,
56
- "I64": torch.int64,
57
- "U8": torch.uint8,
58
- }
59
- return type_map.get(gguf_type, torch.float32)
60
-
61
- # Export tensors
62
- state_dict = {}
63
- for tensor in reader.tensors:
64
- print(f" Converting {tensor.name}...")
65
- # Read tensor data
66
- data = reader.get_tensor(tensor.name)
67
- state_dict[tensor.name] = data
68
-
69
- # Save as safetensors
70
- try:
71
- from safetensors.torch import save_file
72
- save_file(state_dict, model_path)
73
- print(f"Model saved to: {model_path}")
74
- except ImportError:
75
- # Fallback to torch
76
- torch.save(state_dict, model_path.replace(".safetensors", ".pt"))
77
- print(f"Model saved to: {model_path.replace('.safetensors', '.pt')}")
78
-
79
- # Save config.json
80
- config = {
81
- "model_type": "qwen2",
82
- "architectures": ["Qwen2ForCausalLM"],
83
- "vocab_size": 151936,
84
- "hidden_size": 3584,
85
- "intermediate_size": 18944,
86
- "num_hidden_layers": 28,
87
- "num_attention_heads": 28,
88
- "num_key_value_heads": 4,
89
- "max_position_embeddings": 32768,
90
- "sliding_window": 32768,
91
- "torch_dtype": "bfloat16",
92
- "transformers_version": "4.37.0",
93
- }
94
-
95
- import json
96
- config_path = os.path.join(OUTPUT_DIR, "config.json")
97
- with open(config_path, "w") as f:
98
- json.dump(config, f, indent=2)
99
- print(f"Config saved to: {config_path}")
100
-
101
- # Create tokenizer files
102
- print("\n Creating tokenizer...")
103
-
104
- # Use Qwen2 tokenizer config
105
- tokenizer_config = {
106
- "add_bos_token": False,
107
- "add_eos_token": False,
108
- "add_prefix_space": False,
109
- "added_tokens_decoder": {},
110
- "bos_token": "<|im_end|>",
111
- "clean_up_tokenization_spaces": False,
112
- "eos_token": "<|im_end|>",
113
- "errors": "replace",
114
- "model_max_length": 32768,
115
- "pad_token": "<|im_end|>",
116
- "tokenizer_class": "Qwen2Tokenizer",
117
- "unk_token": "<|endoftext|>",
118
- }
119
-
120
- tokenizer_config_path = os.path.join(OUTPUT_DIR, "tokenizer_config.json")
121
- with open(tokenizer_config_path, "w") as f:
122
- json.dump(tokenizer_config, f, indent=2)
123
-
124
- # Create a simple vocab file (this is a placeholder - real vocab is in the GGUF)
125
- # The GGUF reader should have tokenizer data
126
- vocab = {}
127
- for i in range(151936):
128
- vocab[f"<|token_{i}|>"] = i
129
-
130
- vocab_path = os.path.join(OUTPUT_DIR, "vocab.json")
131
- with open(vocab_path, "w") as f:
132
- json.dump(vocab, f)
133
- print(f"Vocab saved to: {vocab_path}")
134
-
135
- print("\n✓ Conversion complete!")
136
- print(f"Output directory: {OUTPUT_DIR}")
137
- print("\nFiles created:")
138
- for f in os.listdir(OUTPUT_DIR):
139
- fpath = os.path.join(OUTPUT_DIR, f)
140
- size = os.path.getsize(fpath) / (1024*1024)
141
- print(f" {f}: {size:.1f} MB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/convert_to_gguf.py DELETED
@@ -1,210 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- GGUF Conversion Script for Stack 2.9 Model
4
-
5
- Converts the fine-tuned Stack 2.9 model to GGUF format for Ollama.
6
- """
7
-
8
- import os
9
- import sys
10
- import subprocess
11
- import argparse
12
- from pathlib import Path
13
-
14
-
15
- def find_llama_cpp():
16
- """Find llama.cpp directory in common locations."""
17
- # Check common locations relative to this script
18
- script_dir = Path(__file__).parent
19
- workspace_root = script_dir.parent
20
-
21
- possible_paths = [
22
- workspace_root / "llama.cpp",
23
- workspace_root / "extensions" / "llama.cpp",
24
- Path.home() / "llama.cpp",
25
- Path("/usr/local/llama.cpp"),
26
- ]
27
-
28
- for path in possible_paths:
29
- if path.exists() and (path / "convert.py").exists():
30
- return path
31
-
32
- return None
33
-
34
-
35
- def run_command(cmd, check=True):
36
- """Run a shell command and stream output."""
37
- print(f"Running: {' '.join(cmd)}")
38
- result = subprocess.run(cmd, capture_output=False, text=True)
39
-
40
- if check and result.returncode != 0:
41
- print(f"Error: Command failed with exit code {result.returncode}")
42
- sys.exit(1)
43
-
44
- return result
45
-
46
-
47
- def convert_model(model_path, output_path, quantize_type="q4_0", llama_cpp_path=None):
48
- """
49
- Convert a HuggingFace model to GGUF format using llama.cpp's convert.py.
50
-
51
- Args:
52
- model_path: Path to the input model (HuggingFace format)
53
- output_path: Path where the GGUF file should be saved
54
- quantize_type: Quantization type (e.g., q4_0, q5_0, q8_0)
55
- llama_cpp_path: Path to llama.cpp directory (auto-detected if None)
56
- """
57
- model_path = Path(model_path).resolve()
58
- output_path = Path(output_path).resolve()
59
-
60
- # Validate input model exists
61
- if not model_path.exists():
62
- print(f"Error: Model directory not found: {model_path}")
63
- sys.exit(1)
64
-
65
- # Find llama.cpp if not provided
66
- if llama_cpp_path is None:
67
- llama_cpp_path = find_llama_cpp()
68
- if llama_cpp_path is None:
69
- print("Error: llama.cpp not found!")
70
- print("\nPlease install llama.cpp and ensure convert.py is available.")
71
- print("You can clone it with:")
72
- print(" git clone https://github.com/ggerganov/llama.cpp.git")
73
- print("\nOr specify the path manually:")
74
- print(" python convert_to_gguf.py --llama-cpp /path/to/llama.cpp")
75
- sys.exit(1)
76
-
77
- llama_cpp_path = Path(llama_cpp_path).resolve()
78
- convert_script = llama_cpp_path / "convert.py"
79
-
80
- if not convert_script.exists():
81
- print(f"Error: convert.py not found at {convert_script}")
82
- sys.exit(1)
83
-
84
- # Create output directory
85
- output_path.parent.mkdir(parents=True, exist_ok=True)
86
-
87
- # Step 1: Convert to intermediate GGUF (unquantized)
88
- print(f"\n=== Step 1: Converting to GGUF ===")
89
- temp_gguf = output_path.parent / f"{output_path.stem}_temp.gguf"
90
-
91
- convert_cmd = [
92
- sys.executable,
93
- str(convert_script),
94
- str(model_path),
95
- "--outfile", str(temp_gguf),
96
- "--outtype", "f16", # Intermediate full precision
97
- ]
98
-
99
- run_command(convert_cmd)
100
-
101
- if not temp_gguf.exists():
102
- print(f"Error: Conversion failed, {temp_gguf} not created")
103
- sys.exit(1)
104
-
105
- # Step 2: Quantize the GGUF file (if not full precision)
106
- if quantize_type != "f16":
107
- print(f"\n=== Step 2: Applying quantization ({quantize_type}) ===")
108
-
109
- # llama.cpp quantize tool
110
- quantize_tool = llama_cpp_path / "quantize"
111
-
112
- # Try different possible names for quantize
113
- if not quantize_tool.exists():
114
- quantize_tool = llama_cpp_path / "build" / "bin" / "quantize"
115
- if not quantize_tool.exists():
116
- quantize_tool = llama_cpp_path / "build" / "quantize"
117
-
118
- if not quantize_tool.exists():
119
- print("Warning: quantize tool not found. Skipping quantization step.")
120
- print("You may need to build llama.cpp first:")
121
- print(f" cd {llama_cpp_path} && make quantize")
122
- print("Using unquantized model as fallback.")
123
- final_gguf = temp_gguf
124
- else:
125
- run_command([
126
- str(quantize_tool),
127
- str(temp_gguf),
128
- str(output_path),
129
- quantize_type
130
- ])
131
- temp_gguf.unlink() # Remove temp file
132
- final_gguf = output_path
133
- else:
134
- final_gguf = temp_gguf
135
- if final_gguf != output_path:
136
- temp_gguf.rename(output_path)
137
-
138
- # Step 3: Validate the GGUF file
139
- print(f"\n=== Step 3: Validating GGUF file ===")
140
-
141
- if not final_gguf.exists():
142
- print(f"Error: Final GGUF file not found: {final_gguf}")
143
- sys.exit(1)
144
-
145
- file_size = final_gguf.stat().st_size / (1024**3)
146
- print(f"✓ GGUF file created: {final_gguf}")
147
- print(f" Size: {file_size:.2f} GB")
148
- print(f" Quantization: {quantize_type}")
149
-
150
- # Step 4: Print Ollama import command
151
- print(f"\n=== Ollama Import Command ===")
152
- print(f"ollama import {output_path} --alias stack-2.9:7b")
153
- print("\nAfter importing, you can run the model with:")
154
- print(f" ollama run stack-2.9:7b")
155
-
156
- print("\n✅ Conversion complete!")
157
-
158
-
159
- def main():
160
- parser = argparse.ArgumentParser(
161
- description="Convert Stack 2.9 model to GGUF format for Ollama"
162
- )
163
- parser.add_argument(
164
- "--model-dir",
165
- type=str,
166
- default="./output/stack-2.9-7b-merged",
167
- help="Path to the merged model directory (default: ./output/stack-2.9-7b-merged)"
168
- )
169
- parser.add_argument(
170
- "--output",
171
- type=str,
172
- default="./ollama_model/stack-2.9-7b.gguf",
173
- help="Output GGUF file path (default: ./ollama_model/stack-2.9-7b.gguf)"
174
- )
175
- parser.add_argument(
176
- "--qtype",
177
- type=str,
178
- default="q4_0",
179
- choices=["f16", "q4_0", "q5_0", "q8_0", "q2_K", "q3_K_S", "q3_K_M", "q3_K_L", "q4_K_S", "q4_K_M", "q5_K_S", "q5_K_M", "q6_K"],
180
- help="Quantization type (default: q4_0)"
181
- )
182
- parser.add_argument(
183
- "--llama-cpp",
184
- type=str,
185
- default=None,
186
- help="Path to llama.cpp directory (auto-detected if not provided)"
187
- )
188
-
189
- args = parser.parse_args()
190
-
191
- # Resolve paths relative to workspace root
192
- workspace_root = Path(__file__).parent.parent
193
- model_path = (workspace_root / args.model_dir).resolve()
194
- output_path = (workspace_root / args.output).resolve()
195
-
196
- print("=== GGUF Conversion for Stack 2.9 ===\n")
197
- print(f"Input model: {model_path}")
198
- print(f"Output: {output_path}")
199
- print(f"Quantization: {args.qtype}\n")
200
-
201
- convert_model(
202
- model_path=model_path,
203
- output_path=output_path,
204
- quantize_type=args.qtype,
205
- llama_cpp_path=args.llama_cpp
206
- )
207
-
208
-
209
- if __name__ == "__main__":
210
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/create_mini_dataset.py DELETED
@@ -1,180 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Create a minimal training dataset for rapid prototyping.
4
- Samples N examples from the full data/final/train.jsonl ensuring tool diversity.
5
- """
6
-
7
- import argparse
8
- import json
9
- import random
10
- from pathlib import Path
11
- from typing import List, Dict
12
- from collections import defaultdict, Counter
13
-
14
- def load_full_dataset(train_path: str = "data/final/train.jsonl") -> List[Dict]:
15
- """Load the full dataset."""
16
- path = Path(train_path)
17
- if not path.exists():
18
- raise FileNotFoundError(f"Training data not found at {path}. Please ensure data/final/train.jsonl exists.")
19
-
20
- data = []
21
- with open(path, 'r') as f:
22
- for line in f:
23
- data.append(json.loads(line))
24
- return data
25
-
26
- def extract_tool_calls(example: Dict) -> List[str]:
27
- """Extract tool names used in an example."""
28
- tools = []
29
- messages = example.get("messages", [])
30
- for msg in messages:
31
- if msg.get("role") == "assistant" and msg.get("tool_calls"):
32
- for tc in msg["tool_calls"]:
33
- func = tc.get("function", {})
34
- name = func.get("name", "")
35
- if name:
36
- tools.append(name)
37
- return tools
38
-
39
- def create_mini_dataset(
40
- output_path: str,
41
- n_samples: int = 5000,
42
- train_source: str = "data/final/train.jsonl",
43
- seed: int = 42
44
- ):
45
- """Create a stratified mini dataset."""
46
- random.seed(seed)
47
-
48
- print(f"Loading full dataset from {train_source}...")
49
- full_data = load_full_dataset(train_source)
50
- print(f"Loaded {len(full_data)} total examples")
51
-
52
- # Group by tool usage
53
- tool_groups = defaultdict(list)
54
- unknown_tools = []
55
-
56
- for ex in full_data:
57
- tools = extract_tool_calls(ex)
58
- if tools:
59
- # Use first tool as primary category
60
- primary_tool = tools[0]
61
- tool_groups[primary_tool].append(ex)
62
- else:
63
- unknown_tools.append(ex)
64
-
65
- print(f"\nTool distribution in full dataset:")
66
- total_tool_examples = sum(len(v) for v in tool_groups.values())
67
- for tool, examples in sorted(tool_groups.items(), key=lambda x: len(x[1]), reverse=True)[:15]:
68
- pct = len(examples) / len(full_data) * 100
69
- print(f" {tool}: {len(examples)} examples ({pct:.1f}%)")
70
-
71
- print(f" No-tool examples: {len(unknown_tools)} ({len(unknown_tools)/len(full_data)*100:.1f}%)")
72
-
73
- # Determine sampling strategy
74
- # Allocate samples proportionally, but ensure minimum 3 examples per tool
75
- samples_per_tool = {}
76
- min_per_tool = 3
77
- remaining = n_samples
78
-
79
- # First pass: assign minimum to all tools that have enough
80
- for tool, examples in tool_groups.items():
81
- if len(examples) >= min_per_tool:
82
- samples_per_tool[tool] = min_per_tool
83
- remaining -= min_per_tool
84
-
85
- # Second pass: distribute remaining proportionally
86
- if remaining > 0:
87
- total_weight = sum(len(v) for v in tool_groups.values() if len(v) >= min_per_tool)
88
- for tool, examples in tool_groups.items():
89
- if len(examples) >= min_per_tool:
90
- weight = len(examples) / total_weight
91
- extra = int(remaining * weight)
92
- samples_per_tool[tool] += extra
93
- remaining -= extra
94
-
95
- # Fill any leftover with no-tool examples
96
- if remaining > 0 and unknown_tools:
97
- samples_per_tool["__notool__"] = min(remaining, len(unknown_tools))
98
- remaining -= min(remaining, len(unknown_tools))
99
-
100
- # If we still have remaining, just take from the largest tool groups
101
- if remaining > 0:
102
- sorted_tools = sorted(tool_groups.items(), key=lambda x: len(x[1]), reverse=True)
103
- for tool, examples in sorted_tools:
104
- if remaining <= 0:
105
- break
106
- can_take = min(remaining, len(examples) - samples_per_tool.get(tool, 0))
107
- if can_take > 0:
108
- samples_per_tool[tool] = samples_per_tool.get(tool, 0) + can_take
109
- remaining -= can_take
110
-
111
- print(f"\nSampling plan (target {n_samples}):")
112
- total_sampled = 0
113
- for tool, n in sorted(samples_per_tool.items(), key=lambda x: x[1], reverse=True):
114
- if n > 0:
115
- available = len(tool_groups.get(tool, [])) if tool != "__notool__" else len(unknown_tools)
116
- pct = n / n_samples * 100
117
- print(f" {tool}: {n} examples ({pct:.1f}%) from {available} available")
118
- total_sampled += n
119
-
120
- # Perform sampling
121
- mini_dataset = []
122
- for tool, n_to_sample in samples_per_tool.items():
123
- if n_to_sample <= 0:
124
- continue
125
-
126
- source_pool = tool_groups[tool] if tool != "__notool__" else unknown_tools
127
- if len(source_pool) < n_to_sample:
128
- n_to_sample = len(source_pool)
129
-
130
- sampled = random.sample(source_pool, n_to_sample)
131
- mini_dataset.extend(sampled)
132
-
133
- # Shuffle the final dataset
134
- random.shuffle(mini_dataset)
135
-
136
- # Write output
137
- output_path = Path(output_path)
138
- output_path.parent.mkdir(parents=True, exist_ok=True)
139
-
140
- with open(output_path, 'w') as f:
141
- for ex in mini_dataset:
142
- f.write(json.dumps(ex) + '\n')
143
-
144
- print(f"\n✅ Mini dataset created: {len(mini_dataset)} examples")
145
- print(f" Saved to: {output_path}")
146
-
147
- # Stats
148
- tool_counts = Counter()
149
- for ex in mini_dataset:
150
- tools = extract_tool_calls(ex)
151
- if tools:
152
- tool_counts[tools[0]] += 1
153
- else:
154
- tool_counts["__notool__"] += 1
155
-
156
- print(f"\nFinal tool distribution:")
157
- for tool, count in tool_counts.most_common(15):
158
- pct = count / len(mini_dataset) * 100
159
- print(f" {tool}: {count} ({pct:.1f}%)")
160
-
161
- return mini_dataset
162
-
163
- def main():
164
- parser = argparse.ArgumentParser(description="Create mini dataset for fast prototyping")
165
- parser.add_argument("--size", type=int, default=5000, help="Number of examples in mini dataset")
166
- parser.add_argument("--output", type=str, default="./data_mini/train_mini.jsonl", help="Output file path")
167
- parser.add_argument("--source", type=str, default="data/final/train.jsonl", help="Source full dataset")
168
- parser.add_argument("--seed", type=int, default=42, help="Random seed for sampling")
169
-
170
- args = parser.parse_args()
171
-
172
- create_mini_dataset(
173
- output_path=args.output,
174
- n_samples=args.size,
175
- train_source=args.source,
176
- seed=args.seed
177
- )
178
-
179
- if __name__ == "__main__":
180
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/download_benchmark_datasets.py DELETED
@@ -1,127 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Download benchmark datasets (HumanEval and MBPP) into ./data/ directory.
4
- Uses huggingface datasets library for reliable downloads.
5
- """
6
-
7
- import os
8
- import json
9
- from pathlib import Path
10
- from datasets import load_dataset
11
- import argparse
12
-
13
- def download_humaneval(output_dir: str = "./data"):
14
- """Download HumanEval dataset (164 problems)."""
15
- output_path = Path(output_dir) / "humaneval"
16
- output_path.mkdir(parents=True, exist_ok=True)
17
-
18
- print(f"⬇️ Downloading HumanEval to {output_path}...")
19
-
20
- try:
21
- # Load HumanEval from huggingface
22
- dataset = load_dataset("openai_humaneval", split="test")
23
-
24
- problems = {}
25
- for idx, item in enumerate(dataset):
26
- problem_id = f"HumanEval/{idx}"
27
- problems[problem_id] = {
28
- "task_id": problem_id,
29
- "prompt": item["prompt"],
30
- "canonical_solution": item["canonical_solution"],
31
- "test": item["test"],
32
- "entry_point": item["entry_point"]
33
- }
34
-
35
- # Save as JSONL (one problem per line)
36
- output_file = output_path / "humaneval.jsonl"
37
- with open(output_file, 'w') as f:
38
- for problem in problems.values():
39
- f.write(json.dumps(problem) + '\n')
40
-
41
- # Also save a meta file
42
- meta_file = output_path / "meta.json"
43
- with open(meta_file, 'w') as f:
44
- json.dump({
45
- "name": "HumanEval",
46
- "num_problems": len(problems),
47
- "source": "openai_humaneval",
48
- "description": "164 hand-written programming problems"
49
- }, f, indent=2)
50
-
51
- print(f"✅ HumanEval: {len(problems)} problems saved to {output_file}")
52
- return len(problems)
53
-
54
- except Exception as e:
55
- print(f"❌ Failed to download HumanEval: {e}")
56
- return 0
57
-
58
- def download_mbpp(output_dir: str = "./data"):
59
- """Download MBPP dataset (500 problems)."""
60
- output_path = Path(output_dir) / "mbpp"
61
- output_path.mkdir(parents=True, exist_ok=True)
62
-
63
- print(f"⬇️ Downloading MBPP to {output_path}...")
64
-
65
- try:
66
- # Load MBPP from huggingface
67
- dataset = load_dataset("mbpp", split="test")
68
-
69
- problems = {}
70
- for idx, item in enumerate(dataset):
71
- problem_id = f"MBPP/{idx}"
72
- problems[problem_id] = {
73
- "task_id": problem_id,
74
- "text": item["text"],
75
- "code": item["code"],
76
- "test_list": item["test_list"],
77
- "test_func": item["test_func"],
78
- "challenge_test_list": item.get("challenge_test_list", [])
79
- }
80
-
81
- # Save as JSONL
82
- output_file = output_path / "mbpp.jsonl"
83
- with open(output_file, 'w') as f:
84
- for problem in problems.values():
85
- f.write(json.dumps(problem) + '\n')
86
-
87
- # Meta file
88
- meta_file = output_path / "meta.json"
89
- with open(meta_file, 'w') as f:
90
- json.dump({
91
- "name": "MBPP",
92
- "num_problems": len(problems),
93
- "source": "mbpp",
94
- "description": "500 beginner-friendly Python programming problems"
95
- }, f, indent=2)
96
-
97
- print(f"✅ MBPP: {len(problems)} problems saved to {output_file}")
98
- return len(problems)
99
-
100
- except Exception as e:
101
- print(f"❌ Failed to download MBPP: {e}")
102
- return 0
103
-
104
- def main():
105
- parser = argparse.ArgumentParser(description="Download benchmark datasets")
106
- parser.add_argument("--output-dir", type=str, default="./data",
107
- help="Output directory (default: ./data)")
108
- parser.add_argument("--benchmark", type=str, choices=["humaneval", "mbpp", "both"],
109
- default="both", help="Which benchmark to download")
110
- args = parser.parse_args()
111
-
112
- print("📥 Benchmark Dataset Downloader")
113
- print(f"📁 Target directory: {args.output_dir}")
114
-
115
- total_downloaded = 0
116
-
117
- if args.benchmark in ["humaneval", "both"]:
118
- total_downloaded += download_humaneval(args.output_dir)
119
-
120
- if args.benchmark in ["mbpp", "both"]:
121
- total_downloaded += download_mbpp(args.output_dir)
122
-
123
- print(f"\n🎉 Total problems downloaded: {total_downloaded}")
124
- print(f"📂 Data saved in: {Path(args.output_dir).resolve()}")
125
-
126
- if __name__ == "__main__":
127
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/download_public_datasets.py DELETED
@@ -1,170 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Download and integrate public coding datasets.
4
- Datasets: OpenAssistant, CodeAct, CodeContests
5
- Converts to Stack 2.9 format.
6
- """
7
-
8
- import json
9
- import os
10
- from datasets import load_dataset
11
- from pathlib import Path
12
- import argparse
13
-
14
- def download_openassistant(output_path: Path, limit: int = 10000):
15
- """Download OpenAssistant and filter for coding conversations."""
16
- print("📥 Downloading OpenAssistant dataset...")
17
- try:
18
- dataset = load_dataset("OpenAssistant/oasst1", split="train")
19
- except Exception as e:
20
- print(f"❌ Failed to load OpenAssistant: {e}")
21
- return []
22
-
23
- coding_examples = []
24
- count = 0
25
-
26
- for item in dataset:
27
- # Filter for coding-related conversations
28
- text = item.get("text", "").lower()
29
- if any(keyword in text for keyword in ["code", "programming", "python", "javascript", "function", "api", "development"]):
30
- # Convert to our format
31
- messages = [
32
- {"role": "user", "content": item.get("text", "")[:1000]}, # truncated
33
- {"role": "assistant", "content": "Here's a coding assistant response..."}
34
- ]
35
- coding_examples.append({
36
- "messages": messages,
37
- "source": "openassistant",
38
- "dataset": "oasst1"
39
- })
40
- count += 1
41
- if count >= limit:
42
- break
43
-
44
- print(f" Extracted {len(coding_examples)} coding-related examples from OpenAssistant")
45
- return coding_examples
46
-
47
- def download_codeact(output_path: Path, limit: int = 10000):
48
- """Download CodeAct dataset."""
49
- print("📥 Downloading CodeAct dataset...")
50
- try:
51
- dataset = load_dataset("nuprl/CodeAct", split="train")
52
- except Exception as e:
53
- print(f"❌ Failed to load CodeAct: {e}")
54
- return []
55
-
56
- examples = []
57
- count = 0
58
-
59
- for item in dataset:
60
- # CodeAct has actions - convert to tool calls
61
- action = item.get("action", {})
62
- if action:
63
- messages = [
64
- {"role": "user", "content": item.get("instruction", "")},
65
- {
66
- "role": "assistant",
67
- "content": "Executing action...",
68
- "tool_use": {
69
- "name": "CodeActTool",
70
- "input": action
71
- }
72
- },
73
- {
74
- "role": "user",
75
- "content": "",
76
- "tool_result": {
77
- "tool_use_id": "tool_1",
78
- "content": json.dumps(item.get("observation", {}))
79
- }
80
- },
81
- {"role": "assistant", "content": item.get("final_answer", "Done.")}
82
- ]
83
- examples.append({
84
- "messages": messages,
85
- "source": "codeact",
86
- "dataset": "CodeAct"
87
- })
88
- count += 1
89
- if count >= limit:
90
- break
91
-
92
- print(f" Extracted {len(examples)} examples from CodeAct")
93
- return examples
94
-
95
- def download_codecontests(output_path: Path, limit: int = 5000):
96
- """Download CodeContests (competition problems)."""
97
- print("📥 Downloading CodeContests dataset...")
98
- try:
99
- dataset = load_dataset("m-a-p/CodeContests", split="train")
100
- except Exception as e:
101
- print(f"❌ Failed to load CodeContests: {e}")
102
- return []
103
-
104
- examples = []
105
- count = 0
106
-
107
- for item in dataset:
108
- if item.get("problem") and item.get("solution"):
109
- messages = [
110
- {"role": "user", "content": f"Solve this problem:\n{item['problem']}"},
111
- {"role": "assistant", "content": f"Here's a solution:\n```python\n{item['solution']}\n```"}
112
- ]
113
- examples.append({
114
- "messages": messages,
115
- "source": "codecontests",
116
- "dataset": "CodeContests"
117
- })
118
- count += 1
119
- if count >= limit:
120
- break
121
-
122
- print(f" Extracted {len(examples)} examples from CodeContests")
123
- return examples
124
-
125
- def main():
126
- parser = argparse.ArgumentParser()
127
- parser.add_argument("--output", type=str, default="training-data/scaled/public_datasets.jsonl")
128
- parser.add_argument("--limit-per-dataset", type=int, default=10000)
129
- parser.add_argument("--skip-download", action="store_true", help="Use only existing datasets")
130
- args = parser.parse_args()
131
-
132
- output_path = Path(args.output)
133
- output_path.parent.mkdir(parents=True, exist_ok=True)
134
-
135
- all_examples = []
136
-
137
- if not args.skip_download:
138
- # OpenAssistant
139
- all_examples.extend(download_openassistant(output_path, args.limit_per_dataset))
140
-
141
- # CodeAct
142
- all_examples.extend(download_codeact(output_path, args.limit_per_dataset))
143
-
144
- # CodeContests
145
- all_examples.extend(download_codecontests(output_path, min(5000, args.limit_per_dataset)))
146
- else:
147
- print("⚠️ Skipping downloads (--skip-download flag)")
148
-
149
- # Write all examples
150
- with open(output_path, 'w') as f:
151
- for ex in all_examples:
152
- f.write(json.dumps(ex) + "\n")
153
-
154
- print(f"\n✨ Saved {len(all_examples)} examples from public datasets")
155
- print(f" to: {output_path}")
156
-
157
- # Show breakdown
158
- sources = {}
159
- for ex in all_examples:
160
- src = ex.get("source", "unknown")
161
- sources[src] = sources.get(src, 0) + 1
162
-
163
- print("\n📊 Breakdown:")
164
- for src, count in sources.items():
165
- print(f" {src}: {count}")
166
-
167
- print("\n⚠️ Note: These are raw integrations. May need format conversion to match tool-use patterns.")
168
-
169
- if __name__ == "__main__":
170
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/{compare_models.py → eval/compare_models.py} RENAMED
File without changes
scripts/{humaneval_eval.py → eval/humaneval_eval.py} RENAMED
File without changes
scripts/{mbpp_eval.py → eval/mbpp_eval.py} RENAMED
File without changes
scripts/{model_info.py → eval/model_info.py} RENAMED
File without changes
scripts/{tool_use_evaluator.py → eval/tool_use_evaluator.py} RENAMED
File without changes
scripts/extract_code_pairs.py DELETED
@@ -1,215 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Extract code-comment pairs from the src/ directory.
4
- Pairs: function/class code + its documentation comment.
5
- """
6
-
7
- import os
8
- import re
9
- import json
10
- from pathlib import Path
11
- from typing import List, Dict, Any
12
- import argparse
13
-
14
- def extract_jsdoc_comments(content: str) -> List[Dict[str, Any]]:
15
- """Extract JSDoc comments and associated code from JS/TS files."""
16
- pairs = []
17
-
18
- # Pattern to match JSDoc comment block followed by code
19
- # Matches: /** ... */ followed by function/class/interface
20
- pattern = re.compile(
21
- r'/\*\*\s*(.*?)\s*\*/\s*' # JSDoc comment
22
- r'(export\s+)?(async\s+)?(function|const|let|var|class|interface|type)\s+(\w+)',
23
- re.DOTALL
24
- )
25
-
26
- for match in pattern.finditer(content):
27
- comment_lines = match.group(1).strip().split('\n')
28
- # Clean up comment markers
29
- comment = []
30
- for line in comment_lines:
31
- line = line.strip()
32
- if line.startswith('* '):
33
- line = line[2:]
34
- elif line.startswith('*'):
35
- line = line[1:]
36
- comment.append(line.strip())
37
- comment_text = ' '.join(comment).strip()
38
-
39
- code_start = match.end()
40
- # Extract the function signature or class definition (up to opening brace or newline)
41
- code_lines = []
42
- lines = content[code_start:].split('\n')
43
- for line in lines[:5]: # Take first few lines
44
- code_lines.append(line)
45
- if line.strip().endswith('{') or line.strip().endswith('>'):
46
- break
47
- code = '\n'.join(code_lines).strip()
48
-
49
- if comment_text and code and len(code.split('\n')) >= 2:
50
- pairs.append({
51
- "code": code,
52
- "comment": comment_text,
53
- "type": match.group(3), # function/class/interface
54
- "name": match.group(4)
55
- })
56
-
57
- return pairs
58
-
59
- def extract_python_docstrings(content: str) -> List[Dict[str, Any]]:
60
- """Extract Python docstrings and associated code."""
61
- pairs = []
62
-
63
- # Pattern for triple-quoted docstring before function/class
64
- pattern = re.compile(
65
- r'''(?P<quote>''' + r'"""' + r'''|\'\'\')\s*(?P<doc>.*?)(?P=quote)\s*'''
66
- r'(?:@\w+\s+)*def\s+(\w+)|class\s+(\w+)',
67
- re.DOTALL
68
- )
69
-
70
- for match in pattern.finditer(content):
71
- doc = match.group('doc').strip()
72
- func_name = match.group(3) or match.group(4)
73
- if func_name:
74
- # Get the signature line
75
- signature = content[match.end():].split('\n')[0].strip()
76
- code = f"def {func_name}{signature}" if 'def' in signature else f"class {func_name}{signature}"
77
-
78
- pairs.append({
79
- "code": code,
80
- "comment": doc,
81
- "type": "function" if 'def' in signature else "class",
82
- "name": func_name
83
- })
84
-
85
- return pairs
86
-
87
- def extract_inline_comments(content: str, file_ext: str) -> List[Dict[str, Any]]:
88
- """Extract code block with preceding inline comment."""
89
- pairs = []
90
-
91
- lines = content.split('\n')
92
- i = 0
93
- while i < len(lines):
94
- line = lines[i].rstrip()
95
- # Check for // comment or # comment
96
- if line.strip().startswith('//') or line.strip().startswith('#'):
97
- comment = line.strip()[2:].strip()
98
- # Look at next few lines for code
99
- code_lines = []
100
- j = i + 1
101
- while j < len(lines) and len(code_lines) < 5:
102
- next_line = lines[j].rstrip()
103
- if next_line.strip() and not next_line.strip().startswith('//') and not next_line.strip().startswith('#'):
104
- code_lines.append(next_line)
105
- elif next_line.strip().startswith(('//', '#')):
106
- break # Another comment block
107
- j += 1
108
-
109
- if comment and code_lines:
110
- code = '\n'.join(code_lines)
111
- # Only keep if comment is meaningful (>5 words or contains specific keywords)
112
- if len(comment.split()) > 3 or any(kw in comment.lower() for kw in ['function', 'return', 'parameter', 'args', 'handle', 'process']):
113
- pairs.append({
114
- "code": code,
115
- "comment": comment,
116
- "type": "inline",
117
- "name": None
118
- })
119
- i = j # Skip processed lines
120
- else:
121
- i += 1
122
- else:
123
- i += 1
124
-
125
- return pairs
126
-
127
- def process_file(file_path: Path) -> List[Dict[str, Any]]:
128
- """Process a single file and extract code-comment pairs."""
129
- try:
130
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
131
- content = f.read()
132
- except Exception as e:
133
- print(f"❌ Error reading {file_path}: {e}")
134
- return []
135
-
136
- pairs = []
137
-
138
- # Extract by file type
139
- if file_path.suffix in ['.js', '.ts', '.jsx', '.tsx']:
140
- pairs.extend(extract_jsdoc_comments(content))
141
- elif file_path.suffix == '.py':
142
- pairs.extend(extract_python_docstrings(content))
143
-
144
- # Inline comments for all types
145
- pairs.extend(extract_inline_comments(content, file_path.suffix))
146
-
147
- return pairs
148
-
149
- def walk_source_files(src_dir: Path) -> List[Path]:
150
- """Walk src/ directory and return all relevant source files."""
151
- extensions = ['.ts', '.tsx', '.js', '.jsx', '.py']
152
- files = []
153
- for ext in extensions:
154
- files.extend(src_dir.rglob(f'*{ext}'))
155
- return files
156
-
157
- def main():
158
- parser = argparse.ArgumentParser()
159
- parser.add_argument("--src-dir", type=str, default="src")
160
- parser.add_argument("--output", type=str, default="training-data/code-pairs/extended_pairs.json")
161
- parser.add_argument("--limit", type=int, default=10000, help="Maximum pairs to extract")
162
- args = parser.parse_args()
163
-
164
- src_dir = Path(args.src_dir)
165
- output_path = Path(args.output)
166
-
167
- if not src_dir.exists():
168
- print(f"❌ Source directory not found: {src_dir}")
169
- return
170
-
171
- print(f"🔍 Scanning {src_dir} for source files...")
172
- files = walk_source_files(src_dir)
173
- print(f" Found {len(files)} source files")
174
-
175
- all_pairs = []
176
- for file_path in files:
177
- pairs = process_file(file_path)
178
- if pairs:
179
- all_pairs.extend(pairs)
180
- print(f" {file_path.name}: {len(pairs)} pairs", end='\r')
181
-
182
- if len(all_pairs) >= args.limit:
183
- break
184
-
185
- print(f"\n✨ Extracted {len(all_pairs)} code-comment pairs")
186
-
187
- # Deduplicate (by comment+code hash)
188
- seen = set()
189
- unique_pairs = []
190
- for pair in all_pairs:
191
- key = (pair['comment'][:100], pair['code'][:100])
192
- if key not in seen:
193
- seen.add(key)
194
- unique_pairs.append(pair)
195
-
196
- print(f" After deduplication: {len(unique_pairs)} unique pairs")
197
-
198
- # Save
199
- output_path.parent.mkdir(parents=True, exist_ok=True)
200
- with open(output_path, 'w') as f:
201
- json.dump(unique_pairs, f, indent=2)
202
-
203
- print(f"✅ Saved to: {output_path}")
204
-
205
- # Stats
206
- types = {}
207
- for pair in unique_pairs:
208
- t = pair.get('type', 'unknown')
209
- types[t] = types.get(t, 0) + 1
210
- print("\n📊 By type:")
211
- for t, cnt in types.items():
212
- print(f" {t}: {cnt}")
213
-
214
- if __name__ == "__main__":
215
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/extract_patterns_from_git.py DELETED
@@ -1,309 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Extract Code Patterns from Git History
4
-
5
- Scans Git commit history to identify bug fixes and feature additions,
6
- extracting "before → after" patterns for training data generation.
7
-
8
- Usage:
9
- python extract_patterns_from_git.py --repo-path . --output patterns.jsonl
10
- python extract_patterns_from_git.py --repo-path . --output patterns.jsonl --since-date "2024-01-01"
11
- """
12
-
13
- import argparse
14
- import hashlib
15
- import json
16
- import os
17
- import subprocess
18
- import sys
19
- from datetime import datetime
20
- from pathlib import Path
21
- from typing import Optional
22
-
23
- try:
24
- from tqdm import tqdm
25
- except ImportError:
26
- tqdm = None
27
-
28
-
29
- # Keywords that indicate bug fixes or improvements
30
- BUG_FIX_KEYWORDS = [
31
- "fix", "bug", "hotfix", "patch", "resolve", "correct", "repair",
32
- "error", "crash", "fail", "issue", "problem", "broken"
33
- ]
34
-
35
- FEATURE_KEYWORDS = [
36
- "feat", "feature", "add", "new", "implement", "enhance", "improve",
37
- "optimize", "refactor", "support", "introduce"
38
- ]
39
-
40
-
41
- def is_text_file(filepath: str) -> bool:
42
- """Check if a file is likely a text file (not binary)."""
43
- binary_extensions = {
44
- '.pyc', '.so', '.dll', '.exe', '.bin', '.dat', '.pickle',
45
- '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.ico', '.svg',
46
- '.mp3', '.mp4', '.wav', '.avi', '.mov', '.pdf', '.zip',
47
- '.tar', '.gz', '.rar', '.7z', '.whl', '.egg',
48
- '.class', '.jar', '.war', '.ear',
49
- '.db', '.sqlite', '.sqlite3',
50
- '.ttf', '.otf', '.woff', '.woff2',
51
- '.pem', '.key', '.crt', '.cer',
52
- '.DS_Store', '.gitignore'
53
- }
54
-
55
- ext = Path(filepath).suffix.lower()
56
- if ext in binary_extensions:
57
- return False
58
-
59
- # Try to read as text
60
- try:
61
- with open(filepath, 'rb') as f:
62
- chunk = f.read(1024)
63
- # Check for null bytes (common in binary files)
64
- if b'\x00' in chunk:
65
- return False
66
- return True
67
- except (OSError, IOError):
68
- return False
69
-
70
-
71
- def get_commit_messages(repo_path: str, since_date: Optional[str] = None) -> list[dict]:
72
- """Get commit information from git log."""
73
- cmd = ["git", "-C", repo_path, "log", "--pretty=format:%H|%s|%an|%ad|%ae", "--date=iso"]
74
-
75
- if since_date:
76
- cmd.extend([f"--since={since_date}"])
77
-
78
- try:
79
- result = subprocess.run(cmd, capture_output=True, text=True, check=True)
80
- commits = []
81
-
82
- for line in result.stdout.strip().split('\n'):
83
- if not line:
84
- continue
85
- parts = line.split('|')
86
- if len(parts) >= 5:
87
- commits.append({
88
- 'hash': parts[0],
89
- 'message': parts[1],
90
- 'author': parts[2],
91
- 'date': parts[3],
92
- 'email': parts[4] if len(parts) > 4 else ''
93
- })
94
-
95
- return commits
96
- except subprocess.CalledProcessError as e:
97
- print(f"Error reading git log: {e}", file=sys.stderr)
98
- return []
99
-
100
-
101
- def get_changed_files(repo_path: str, commit_hash: str) -> list[str]:
102
- """Get list of files changed in a commit."""
103
- cmd = ["git", "-C", repo_path, "diff-tree", "--no-commit-id", "--name-only", "-r", commit_hash]
104
-
105
- try:
106
- result = subprocess.run(cmd, capture_output=True, text=True, check=True)
107
- files = []
108
- for line in result.stdout.strip().split('\n'):
109
- if line.strip():
110
- files.append(line.strip())
111
- return files
112
- except subprocess.CalledProcessError:
113
- return []
114
-
115
-
116
- def get_file_diff(repo_path: str, commit_hash: str, filepath: str) -> tuple[Optional[str], Optional[str]]:
117
- """Get before and after content of a file in a commit."""
118
- # Get the file content AFTER the commit
119
- cmd_after = ["git", "-C", repo_path, "show", f"{commit_hash}:{filepath}"]
120
- # Get the file content BEFORE the commit (parent)
121
- cmd_before = ["git", "-C", repo_path, "show", f"{commit_hash}^:{filepath}"]
122
-
123
- after_content = None
124
- before_content = None
125
-
126
- try:
127
- result_after = subprocess.run(cmd_after, capture_output=True, text=True, check=True)
128
- after_content = result_after.stdout
129
- except subprocess.CalledProcessError:
130
- # File might be new (no parent)
131
- after_content = None
132
-
133
- try:
134
- result_before = subprocess.run(cmd_before, capture_output=True, text=True, check=True)
135
- before_content = result_before.stdout
136
- except subprocess.CalledProcessError:
137
- # File was added in this commit
138
- before_content = None
139
-
140
- return before_content, after_content
141
-
142
-
143
- def infer_problem_type(message: str) -> str:
144
- """Infer the problem type from commit message."""
145
- msg_lower = message.lower()
146
-
147
- # Check for bug fix indicators
148
- for keyword in BUG_FIX_KEYWORDS:
149
- if keyword in msg_lower:
150
- return "bug_fix"
151
-
152
- # Check for feature indicators
153
- for keyword in FEATURE_KEYWORDS:
154
- if keyword in msg_lower:
155
- return "feature_addition"
156
-
157
- return "unknown"
158
-
159
-
160
- def compute_confidence(message: str, before: Optional[str], after: Optional[str]) -> float:
161
- """Compute confidence score for the extracted pattern."""
162
- confidence = 0.5 # Base confidence
163
-
164
- # Higher confidence if message contains clear keywords
165
- msg_lower = message.lower()
166
- if any(k in msg_lower for k in ["fix", "bug", "hotfix", "patch"]):
167
- confidence += 0.2
168
- if any(k in msg_lower for k in ["feat", "feature", "add", "implement"]):
169
- confidence += 0.15
170
-
171
- # Higher confidence if we have both before and after
172
- if before and after:
173
- confidence += 0.15
174
- elif before or after:
175
- confidence += 0.05
176
-
177
- # Higher confidence for substantial changes
178
- if before and after:
179
- content_len = max(len(before), len(after))
180
- if content_len > 100:
181
- confidence += 0.1
182
- if content_len > 500:
183
- confidence += 0.1
184
-
185
- return min(confidence, 1.0)
186
-
187
-
188
- def generate_pattern_id(commit_hash: str, filepath: str) -> str:
189
- """Generate a unique pattern ID."""
190
- content = f"{commit_hash}:{filepath}"
191
- return hashlib.sha256(content.encode()).hexdigest()[:16]
192
-
193
-
194
- def extract_patterns(
195
- repo_path: str,
196
- output_path: str,
197
- since_date: Optional[str] = None
198
- ) -> int:
199
- """Extract patterns from git history and write to JSONL file."""
200
-
201
- print(f"Scanning repository: {repo_path}")
202
-
203
- # Get all commits
204
- commits = get_commit_messages(repo_path, since_date)
205
- print(f"Found {len(commits)} commits")
206
-
207
- if not commits:
208
- print("No commits found.", file=sys.stderr)
209
- return 0
210
-
211
- patterns_extracted = 0
212
-
213
- # Process each commit with progress bar
214
- iterator = tqdm(commits, desc="Extracting patterns") if tqdm else commits
215
-
216
- with open(output_path, 'w', encoding='utf-8') as outf:
217
- for commit in iterator:
218
- commit_hash = commit['hash']
219
- message = commit['message']
220
- author = commit['author']
221
- date = commit['date']
222
-
223
- # Infer problem type
224
- problem_type = infer_problem_type(message)
225
-
226
- # Skip if not a bug fix or feature
227
- if problem_type == "unknown":
228
- continue
229
-
230
- # Get changed files
231
- changed_files = get_changed_files(repo_path, commit_hash)
232
-
233
- for filepath in changed_files:
234
- # Skip binary files
235
- full_path = os.path.join(repo_path, filepath)
236
- if not os.path.exists(full_path):
237
- continue
238
-
239
- if not is_text_file(filepath):
240
- continue
241
-
242
- # Get diff
243
- before_content, after_content = get_file_diff(repo_path, commit_hash, filepath)
244
-
245
- # Skip if no meaningful change
246
- if before_content == after_content:
247
- continue
248
- if not before_content and not after_content:
249
- continue
250
-
251
- # Compute confidence
252
- confidence = compute_confidence(message, before_content, after_content)
253
-
254
- # Create pattern record
255
- pattern = {
256
- "pattern_id": generate_pattern_id(commit_hash, filepath),
257
- "problem_type": problem_type,
258
- "before_code": before_content or "",
259
- "after_code": after_content or "",
260
- "commit_msg": message,
261
- "author": author,
262
- "date": date,
263
- "confidence": round(confidence, 2)
264
- }
265
-
266
- # Write as JSONL
267
- outf.write(json.dumps(pattern, ensure_ascii=False) + '\n')
268
- patterns_extracted += 1
269
-
270
- print(f"\nExtracted {patterns_extracted} patterns to {output_path}")
271
- return patterns_extracted
272
-
273
-
274
- def main():
275
- parser = argparse.ArgumentParser(
276
- description="Extract code patterns from Git history for training data"
277
- )
278
- parser.add_argument(
279
- "--repo-path",
280
- type=str,
281
- required=True,
282
- help="Path to the Git repository"
283
- )
284
- parser.add_argument(
285
- "--output",
286
- type=str,
287
- required=True,
288
- help="Output JSONL file path"
289
- )
290
- parser.add_argument(
291
- "--since-date",
292
- type=str,
293
- default=None,
294
- help="Only extract commits since this date (YYYY-MM-DD)"
295
- )
296
-
297
- args = parser.parse_args()
298
-
299
- # Validate repo path
300
- if not os.path.isdir(os.path.join(args.repo_path, '.git')):
301
- print(f"Error: {args.repo_path} is not a Git repository", file=sys.stderr)
302
- sys.exit(1)
303
-
304
- # Run extraction
305
- extract_patterns(args.repo_path, args.output, args.since_date)
306
-
307
-
308
- if __name__ == "__main__":
309
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/extract_rtmp_tools.py DELETED
@@ -1,174 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Extract training data from RTMP tools for Stack 2.9
4
- Creates synthetic tool-use examples from the RTMP codebase
5
- """
6
-
7
- import os
8
- import json
9
- from pathlib import Path
10
-
11
- RTMP_DIR = "/Users/walidsobhi/.openclaw/workspace/RTMP"
12
- OUTPUT_DIR = "/Users/walidsobhi/.openclaw/workspace/stack-2.9/data/rtmp-tools"
13
-
14
- def get_tool_description(tool_name: str) -> str:
15
- """Get tool descriptions from tool names"""
16
- descriptions = {
17
- "BashTool": "Execute shell commands in a sandboxed environment",
18
- "FileReadTool": "Read file contents from the filesystem",
19
- "FileWriteTool": "Write content to files",
20
- "FileEditTool": "Edit files using sed-style replacements",
21
- "GlobTool": "Find files matching glob patterns",
22
- "GrepTool": "Search for patterns in files",
23
- "TaskCreateTool": "Create tasks in the task list",
24
- "TaskListTool": "List all tasks in the task list",
25
- "TaskUpdateTool": "Update task status and details",
26
- "TaskGetTool": "Get details of a specific task",
27
- "WebSearchTool": "Search the web for information",
28
- "WebFetchTool": "Fetch and analyze web pages",
29
- "SkillTool": "Execute user-invocable skills",
30
- "McpTool": "Call MCP (Model Context Protocol) tools",
31
- "AgentTool": "Delegate tasks to sub-agents",
32
- }
33
- return descriptions.get(tool_name.replace("Tool", ""), f"Tool: {tool_name}")
34
-
35
- def extract_tool_examples():
36
- """Extract tool patterns and create training examples"""
37
- examples = []
38
-
39
- tools_dir = Path(RTMP_DIR) / "tools"
40
- if not tools_dir.exists():
41
- print(f"❌ Tools directory not found: {tools_dir}")
42
- return examples
43
-
44
- # Process each tool
45
- tool_dirs = [d for d in tools_dir.iterdir() if d.is_dir() and d.name.endswith("Tool")]
46
-
47
- for tool_dir in tool_dirs:
48
- tool_name = tool_dir.name
49
-
50
- # Get main tool file
51
- main_files = list(tool_dir.glob("*.tsx")) + list(tool_dir.glob("*.ts"))
52
-
53
- # Create example for this tool
54
- desc = get_tool_description(tool_name)
55
-
56
- # Generate prompt-response pairs for tool usage
57
- example = {
58
- "messages": [
59
- {
60
- "role": "user",
61
- "content": f"How do I use the {tool_name}? What does it do?"
62
- },
63
- {
64
- "role": "assistant",
65
- "content": f"The {tool_name} allows you to {desc.lower()}.\n\n" +
66
- f"This is one of the core tools in RTMP (Real-Time AI Assistant).\n\n" +
67
- f"Tools like {tool_name} enable the assistant to interact with the filesystem, " +
68
- f"execute commands, manage tasks, and more."
69
- }
70
- ]
71
- }
72
- examples.append(example)
73
-
74
- # Add more specific examples for common tools
75
- if "Bash" in tool_name:
76
- example = {
77
- "messages": [
78
- {"role": "user", "content": "List all files in the current directory"},
79
- {"role": "assistant", "tool_calls": [
80
- {"name": "Bash", "arguments": {"command": "ls -la"}}
81
- ]}
82
- ]
83
- }
84
- examples.append(example)
85
-
86
- elif "FileRead" in tool_name:
87
- example = {
88
- "messages": [
89
- {"role": "user", "content": "Read the contents of config.json"},
90
- {"role": "assistant", "tool_calls": [
91
- {"name": "FileRead", "arguments": {"file_path": "config.json"}}
92
- ]}
93
- ]
94
- }
95
- examples.append(example)
96
-
97
- elif "Glob" in tool_name:
98
- example = {
99
- "messages": [
100
- {"role": "user", "content": "Find all TypeScript files in the project"},
101
- {"role": "assistant", "tool_calls": [
102
- {"name": "Glob", "arguments": {"pattern": "**/*.ts"}}
103
- ]}
104
- ]
105
- }
106
- examples.append(example)
107
-
108
- elif "Grep" in tool_name:
109
- example = {
110
- "messages": [
111
- {"role": "user", "content": "Find all occurrences of 'TODO' in the code"},
112
- {"role": "assistant", "tool_calls": [
113
- {"name": "Grep", "arguments": {"pattern": "TODO", "path": "."}}
114
- ]}
115
- ]
116
- }
117
- examples.append(example)
118
-
119
- elif "TaskCreate" in tool_name:
120
- example = {
121
- "messages": [
122
- {"role": "user", "content": "Create a task to fix the login bug"},
123
- {"role": "assistant", "tool_calls": [
124
- {"name": "TaskCreate", "arguments": {
125
- "subject": "Fix login bug",
126
- "description": "Investigate and fix the login issue"
127
- }}
128
- ]}
129
- ]
130
- }
131
- examples.append(example)
132
-
133
- elif "WebSearch" in tool_name:
134
- example = {
135
- "messages": [
136
- {"role": "user", "content": "Search for latest Python 3.14 features"},
137
- {"role": "assistant", "tool_calls": [
138
- {"name": "WebSearch", "arguments": {"query": "Python 3.14 new features"}}
139
- ]}
140
- ]
141
- }
142
- examples.append(example)
143
-
144
- return examples
145
-
146
- def main():
147
- print("=" * 60)
148
- print("Extracting RTMP Tool Patterns for Training")
149
- print("=" * 60)
150
-
151
- # Create output directory
152
- os.makedirs(OUTPUT_DIR, exist_ok=True)
153
-
154
- # Extract examples
155
- examples = extract_tool_examples()
156
-
157
- print(f"\n✅ Extracted {len(examples)} tool usage examples")
158
-
159
- # Save to JSONL
160
- output_file = os.path.join(OUTPUT_DIR, "tool_patterns.jsonl")
161
- with open(output_file, 'w') as f:
162
- for ex in examples:
163
- f.write(json.dumps(ex) + '\n')
164
-
165
- print(f"✅ Saved to: {output_file}")
166
-
167
- # Also show some examples
168
- print("\n📋 Sample examples:")
169
- for i, ex in enumerate(examples[:3]):
170
- user_msg = ex["messages"][0]["content"]
171
- print(f" {i+1}. User: {user_msg[:60]}...")
172
-
173
- if __name__ == "__main__":
174
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/extract_rtmp_tools.ts DELETED
@@ -1,115 +0,0 @@
1
- // Extract tool schemas from RTMP for training data
2
- //
3
- // This script extracts tool definitions from the RTMP codebase
4
- // and adds them to stack-2.9's training data catalog.
5
-
6
- import { readdir, readFile, writeFile } from 'fs/promises'
7
- import { join, basename } from 'path'
8
-
9
- const RTMP_TOOLS_DIR = '/Users/walidsobhi/.openclaw/workspace/RTMP/tools'
10
- const STACK_CATALOG = '/Users/walidsobhi/.openclaw/workspace/stack-2.9/training-data/tools/catalog.json'
11
-
12
- interface ToolSchema {
13
- tool: string
14
- description: string
15
- hasPrompt: boolean
16
- hasImplementation: boolean
17
- inputSchema: Record<string, unknown>
18
- }
19
-
20
- async function extractToolSchemas(): Promise<ToolSchema[]> {
21
- const tools: ToolSchema[] = []
22
- const toolDirs = await readdir(RTMP_TOOLS_DIR)
23
-
24
- for (const toolDir of toolDirs) {
25
- const toolPath = join(RTMP_TOOLS_DIR, toolDir)
26
- const stat = await readdir(toolPath).then(() => true).catch(() => false)
27
-
28
- if (!stat) continue
29
-
30
- // Try to extract tool name and description from tool files
31
- let description = ''
32
- let hasPrompt = false
33
- let hasImplementation = false
34
-
35
- try {
36
- // Check for prompt.ts
37
- const promptPath = join(toolPath, 'prompt.ts')
38
- const promptContent = await readFile(promptPath, 'utf-8')
39
- hasPrompt = true
40
-
41
- // Extract first meaningful comment as description
42
- const comments = promptContent.match(/\/\*\*[\s\S]*?\*\//g)
43
- if (comments && comments.length > 0) {
44
- const comment = comments[0]
45
- description = comment
46
- .replace(/\/\*\*|\*\//g, '')
47
- .replace(/^\s*\*\s?/gm, '')
48
- .trim()
49
- .slice(0, 200)
50
- }
51
- } catch {
52
- // No prompt.ts
53
- }
54
-
55
- try {
56
- // Check for implementation files
57
- const toolFiles = await readdir(toolPath)
58
- hasImplementation = toolFiles.some(f =>
59
- f.endsWith('.ts') || f.endsWith('.tsx')
60
- )
61
- } catch {
62
- // Ignore
63
- }
64
-
65
- // Format tool name (remove Tool suffix for cleaner names)
66
- const toolName = toolDir.replace(/Tool$/, '')
67
-
68
- tools.push({
69
- tool: toolDir,
70
- description: description || `${toolName} tool`,
71
- hasPrompt,
72
- hasImplementation,
73
- inputSchema: {}
74
- })
75
- }
76
-
77
- return tools
78
- }
79
-
80
- async function main() {
81
- console.log('Extracting tool schemas from RTMP...')
82
-
83
- const tools = await extractToolSchemas()
84
- console.log(`Found ${tools.length} tools`)
85
-
86
- // Read existing catalog
87
- let existingTools: ToolSchema[] = []
88
- try {
89
- const existingContent = await readFile(STACK_CATALOG, 'utf-8')
90
- existingTools = JSON.parse(existingContent)
91
- } catch {
92
- console.log('No existing catalog found')
93
- }
94
-
95
- // Merge with existing (avoid duplicates)
96
- const existingNames = new Set(existingTools.map(t => t.tool))
97
- const newTools = tools.filter(t => !existingNames.has(t.tool))
98
-
99
- console.log(`Adding ${newTools.length} new tools`)
100
-
101
- // Combine
102
- const allTools = [...existingTools, ...newTools]
103
-
104
- // Write updated catalog
105
- await writeFile(STACK_CATALOG, JSON.stringify(allTools, null, 2))
106
- console.log(`Updated catalog with ${allTools.length} tools`)
107
-
108
- // Also print summary
109
- console.log('\nNew tools added:')
110
- for (const tool of newTools) {
111
- console.log(` - ${tool.tool}`)
112
- }
113
- }
114
-
115
- main().catch(console.error)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/extract_rtmp_tools_advanced.py DELETED
@@ -1,199 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Extract advanced training data from RTMP tools
4
- Includes actual patterns, best practices, and tool definitions
5
- """
6
-
7
- import os
8
- import json
9
- from pathlib import Path
10
-
11
- RTMP_DIR = "/Users/walidsobhi/.openclaw/workspace/RTMP"
12
- OUTPUT_DIR = "/Users/walidsobhi/.openclaw/workspace/stack-2.9/data/rtmp-tools"
13
-
14
def get_tool_definitions():
    """Return the hand-curated tool-call training conversations.

    Each element is a chat transcript in fine-tuning format: a
    ``messages`` list of system/user/assistant/tool turns, where
    assistant turns carry ``tool_calls`` demonstrating a correct tool
    invocation for the user's request.
    """
    return [
        # Bash: shell command execution, including the tool response turn.
        {
            "messages": [
                {"role": "system", "content": "You are a helpful AI coding assistant with access to various tools."},
                {"role": "user", "content": "List all files in the current directory including hidden files"},
                {"role": "assistant", "tool_calls": [
                    {"name": "Bash", "arguments": {"command": "ls -la"}}
                ]},
                {"role": "tool", "content": "total 128\ndrwxr-xr-x 1 user staff 4096 Apr 4 10:00 .\ndrwxr-xr-x 1 user staff 4096 Apr 4 10:00 ..\n-rw-r--r-- 1 user staff 1024 Apr 4 09:00 file.ts"}
            ]
        },
        # Glob: file discovery by pattern.
        {
            "messages": [
                {"role": "user", "content": "Find all TypeScript files in the src directory"},
                {"role": "assistant", "tool_calls": [
                    {"name": "Glob", "arguments": {"pattern": "src/**/*.ts"}}
                ]}
            ]
        },
        # Grep: content search.
        {
            "messages": [
                {"role": "user", "content": "Search for all TODO comments in the codebase"},
                {"role": "assistant", "tool_calls": [
                    {"name": "Grep", "arguments": {"pattern": "TODO", "path": ".", "output_mode": "content"}}
                ]}
            ]
        },
        # FileRead: read a single file.
        {
            "messages": [
                {"role": "user", "content": "Read the package.json file"},
                {"role": "assistant", "tool_calls": [
                    {"name": "FileRead", "arguments": {"file_path": "package.json"}}
                ]}
            ]
        },
        # TaskCreate: task management.
        {
            "messages": [
                {"role": "user", "content": "Create a new task to implement the login feature"},
                {"role": "assistant", "tool_calls": [
                    {"name": "TaskCreate", "arguments": {
                        "subject": "Implement login feature",
                        "description": "Add user authentication with OAuth",
                        "activeForm": "Implementing login feature"
                    }}
                ]}
            ]
        },
        # WebSearch: open-web queries.
        {
            "messages": [
                {"role": "user", "content": "Search the web for how to use PyTorch MPS on Apple Silicon"},
                {"role": "assistant", "tool_calls": [
                    {"name": "WebSearch", "arguments": {"query": "PyTorch MPS Apple Silicon tutorial"}}
                ]}
            ]
        },
        # WebFetch: fetch and summarize a URL.
        {
            "messages": [
                {"role": "user", "content": "Fetch the documentation from https://pytorch.org/docs"},
                {"role": "assistant", "tool_calls": [
                    {"name": "WebFetch", "arguments": {"url": "https://pytorch.org/docs", "prompt": "Extract the main features of PyTorch"}}
                ]}
            ]
        },
        # Task management examples.
        {
            "messages": [
                {"role": "user", "content": "Show me all open tasks"},
                {"role": "assistant", "tool_calls": [
                    {"name": "TaskList", "arguments": {}}
                ]}
            ]
        },
        {
            "messages": [
                {"role": "user", "content": "What's the status of the login feature task?"},
                {"role": "assistant", "tool_calls": [
                    {"name": "TaskGet", "arguments": {"taskId": "1"}}
                ]}
            ]
        },
        # File operations.
        {
            "messages": [
                {"role": "user", "content": "Write a simple Python script to a new file"},
                {"role": "assistant", "tool_calls": [
                    {"name": "FileWrite", "arguments": {
                        "file_path": "hello.py",
                        "content": "print('Hello, World!')\n"
                    }}
                ]}
            ]
        },
        {
            "messages": [
                {"role": "user", "content": "Edit the config file to enable debug mode"},
                {"role": "assistant", "tool_calls": [
                    {"name": "FileEdit", "arguments": {
                        "file_path": "config.json",
                        "old_string": "\"debug\": false",
                        "new_string": "\"debug\": true"
                    }}
                ]}
            ]
        },
        # Complex multi-step example: two tool calls in one assistant turn.
        {
            "messages": [
                {"role": "user", "content": "First find all React component files, then read one of them"},
                {"role": "assistant", "tool_calls": [
                    {"name": "Glob", "arguments": {"pattern": "**/*.{tsx,jsx}"}},
                    {"name": "TaskCreate", "arguments": {
                        "subject": "Read React components",
                        "description": "Read the found component files",
                        "activeForm": "Reading React components"
                    }}
                ]}
            ]
        },
        # Skill invocation.
        {
            "messages": [
                {"role": "user", "content": "Commit the changes with a message"},
                {"role": "assistant", "tool_calls": [
                    {"name": "Skill", "arguments": {"skill": "git-commit", "args": "-m 'Fix bug'"}}
                ]}
            ]
        },
    ]
159
-
160
def main():
    """Build the advanced tool-pattern dataset and merge it with the
    previously generated simple patterns.

    Writes two files under ``OUTPUT_DIR``:
      - ``advanced_tool_patterns.jsonl``: the curated examples only
      - ``combined_tools.jsonl``: simple (if present) + advanced examples
    """
    print("=" * 60)
    print("Extracting Advanced RTMP Tool Patterns")
    print("=" * 60)

    # Get tool examples
    tools = get_tool_definitions()

    print(f"\n✅ Created {len(tools)} advanced tool examples")

    # Fix: create the output directory before writing into it. Previously
    # this crashed with FileNotFoundError on a fresh checkout where
    # OUTPUT_DIR did not yet exist.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Save to JSONL (one JSON object per line)
    output_file = os.path.join(OUTPUT_DIR, "advanced_tool_patterns.jsonl")
    with open(output_file, 'w') as f:
        for ex in tools:
            f.write(json.dumps(ex) + '\n')

    print(f"✅ Saved to: {output_file}")

    # Combine with the previously generated simple patterns, if any.
    prev_file = os.path.join(OUTPUT_DIR, "tool_patterns.jsonl")
    combined_file = os.path.join(OUTPUT_DIR, "combined_tools.jsonl")

    with open(combined_file, 'w') as out:
        # Previous simple patterns (best-effort: skipped when absent)
        if os.path.exists(prev_file):
            with open(prev_file) as f:
                for line in f:
                    out.write(line)
        # Advanced patterns just written above
        with open(output_file) as f:
            for line in f:
                out.write(line)

    print(f"\n📦 Total combined examples:")
    with open(combined_file) as f:
        count = sum(1 for _ in f)
    print(f" {count} tool usage examples")


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/generate_code_completion_data.py DELETED
@@ -1,262 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Synthetic Code Completion Training Data Generator for Stack 2.9
4
- Generates training examples for pure code completion without tools.
5
- """
6
-
7
- import json
8
- import random
9
- import argparse
10
- from pathlib import Path
11
- from typing import Dict, List
12
-
13
# Languages requested by default when generating a dataset.
# NOTE(review): "typescript" appears here but CODE_TEMPLATES below has no
# "typescript" key, so balanced generation over all of LANGUAGES hits a
# KeyError — confirm whether TS templates were meant to be added.
LANGUAGES = ["python", "javascript", "go", "rust", "typescript"]
# Difficulty tier labels; used as the second-level keys of CODE_TEMPLATES.
DIFFICULTY_EASY = "easy"
DIFFICULTY_MEDIUM = "medium"
DIFFICULTY_HARD = "hard"

# Code templates organized by language -> difficulty -> templates
# Each template provides:
#   context:     the code prefix presented to the model
#   completion:  the expected continuation of that prefix
#   description: a short human-readable label for the pattern
CODE_TEMPLATES = {
    "python": {
        DIFFICULTY_EASY: [
            {"context": "def greet(name):", "completion": ' return f"Hello, {name}!"', "description": "Simple greeting function"},
            {"context": "numbers = [1, 2, 3, 4, 5]\n\n", "completion": "for num in numbers:\n print(num)", "description": "Loop through list"},
            {"context": "class Person:\n def __init__(self, name):", "completion": " self.name = name", "description": "Class init"},
            {"context": "def add(a, b):\n ", "completion": " return a + b", "description": "Add function"},
            {"context": "if x > 0:\n print('positive')\nelif x < 0:\n ", "completion": " print('negative')", "description": "Conditional"},
        ],
        DIFFICULTY_MEDIUM: [
            {"context": "def fibonacci(n):\n if n <= 1:\n return n\n ", "completion": " return fibonacci(n-1) + fibonacci(n-2)", "description": "Fibonacci"},
            {"context": "class Calculator:\n def __init__(self):\n self.result = 0\n \n def add(self, x):\n ", "completion": " self.result += x\n return self.result", "description": "Calculator"},
            {"context": "async def fetch_data(url):\n async with aiohttp.ClientSession() as session:\n async with session.get(url) as response:\n ", "completion": " return await response.json()", "description": "Async HTTP"},
            {"context": "def validate_email(email):\n pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'\n ", "completion": " return re.match(pattern, email) is not None", "description": "Email validation"},
            {"context": "@app.route('/users/<int:user_id>')\ndef get_user(user_id):\n user = User.query.get_or_404(user_id)\n ", "completion": " return jsonify(user.to_dict())", "description": "Flask route"},
        ],
        DIFFICULTY_HARD: [
            {"context": "class LRUCache:\n def __init__(self, capacity):\n self.capacity = capacity\n self.cache = OrderedDict()\n \n def get(self, key):\n if key not in self.cache:\n return -1\n ", "completion": " self.cache.move_to_end(key)\n return self.cache[key]", "description": "LRU Cache"},
            {"context": "def merge_sort(arr):\n if len(arr) <= 1:\n return arr\n \n mid = len(arr) // 2\n left = merge_sort(arr[:mid])\n right = merge_sort(arr[mid:])\n ", "completion": " return merge(left, right)", "description": "Merge sort"},
            {"context": "class BinaryTree:\n def __init__(self, value):\n self.value = value\n self.left = None\n self.right = None\n \n def inorder(self, node, result=None):\n if result is None:\n result = []\n if node:\n ", "completion": " self.inorder(node.left, result)\n result.append(node.value)\n self.inorder(node.right, result)\n return result", "description": "Binary tree inorder"},
            {"context": "def bellman_ford(graph, source):\n dist = {v: float('inf') for v in graph}\n dist[source] = 0\n \n for _ in range(len(graph) - 1):\n for u, v, w in graph.edges:\n if dist[u] != float('inf') and dist[u] + w < dist[v]:\n ", "completion": " dist[v] = dist[u] + w\n return dist", "description": "Bellman-Ford"},
        ],
    },
    "javascript": {
        DIFFICULTY_EASY: [
            {"context": "const greet = (name) => {", "completion": ' return `Hello, ${name}!`;', "description": "Arrow greeting"},
            {"context": "const numbers = [1, 2, 3, 4, 5];\n\n", "completion": "numbers.forEach(num => console.log(num));", "description": "forEach loop"},
            {"context": "class Person {\n constructor(name) {", "completion": " this.name = name;", "description": "JS class constructor"},
            {"context": "const add = (a, b) => {", "completion": " return a + b;", "description": "Add function"},
            {"context": "if (x > 0) {\n console.log('positive');\n} else if (x < 0) {\n ", "completion": " console.log('negative');", "description": "Conditional"},
        ],
        DIFFICULTY_MEDIUM: [
            {"context": "const fetchData = async (url) => {\n try {\n const response = await fetch(url);\n ", "completion": " return await response.json();\n } catch (error) {\n console.error('Error:', error);\n }", "description": "Async fetch"},
            {"context": "class EventEmitter {\n constructor() {\n this.events = {};\n }\n \n on(event, callback) {\n ", "completion": " if (!this.events[event]) this.events[event] = [];\n this.events[event].push(callback);", "description": "Event emitter"},
            {"context": "const debounce = (func, delay) => {\n let timeoutId;\n return (...args) => {\n clearTimeout(timeoutId);\n ", "completion": " timeoutId = setTimeout(() => func.apply(this, args), delay);", "description": "Debounce"},
            {"context": "const memoize = (fn) => {\n const cache = new Map();\n return (n) => {\n if (cache.has(n)) {\n return cache.get(n);\n }\n ", "completion": " const result = fn(n);\n cache.set(n, result);\n return result;", "description": "Memoize"},
        ],
        DIFFICULTY_HARD: [
            {"context": "class PromisePool {\n constructor(maxConcurrent) {\n this.maxConcurrent = maxConcurrent;\n this.running = 0;\n this.queue = [];\n }\n \n add(promiseFn) {\n return new Promise((resolve, reject) => {\n ", "completion": " this.queue.push({ promiseFn, resolve, reject });\n this.process();\n });", "description": "Promise pool"},
            {"context": "const virtualDOM = {\n createElement(tag, props, ...children) {\n return {\n tag,\n props: props || {},\n children: children.flat(),\n };\n },\n render(vnode, container) {\n ", "completion": " const el = document.createElement(vnode.tag);\n Object.entries(vnode.props || {}).forEach(([key, value]) => el.setAttribute(key, value));\n vnode.children.forEach(child => {\n if (typeof child === 'string') el.appendChild(document.createTextNode(child));\n else this.render(child, el);\n });\n container.appendChild(el);", "description": "Virtual DOM"},
        ],
    },
    "go": {
        DIFFICULTY_EASY: [
            {"context": "func greet(name string) string {", "completion": ' return "Hello, " + name + "!"', "description": "Greet function"},
            {"context": "func add(a, b int) int {", "completion": " return a + b", "description": "Add function"},
            {"context": "type Person struct {\n Name string\n ", "completion": " Age int", "description": "Struct definition"},
            {"context": "for i := 0; i < 10; i++ {\n ", "completion": " fmt.Println(i)", "description": "For loop"},
            {"context": "if x > 0 {\n fmt.Println(\"positive\")\n} else {\n ", "completion": ' fmt.Println("non-positive")', "description": "If-else"},
        ],
        DIFFICULTY_MEDIUM: [
            {"context": "func (p Person) Greet() string {", "completion": ' return fmt.Sprintf("Hello, %s!", p.Name)', "description": "Method"},
            {"context": "func worker(jobs <-chan int, results chan<- int) {\n for j := range jobs {\n ", "completion": " results <- j * 2", "description": "Worker goroutine"},
            {"context": "type Handler interface {\n Handle(ctx context.Context, req Request) Response\n ", "completion": " Cleanup(ctx context.Context)", "description": "Interface"},
            {"context": "func fetchData(url string) ([]byte, error) {\n resp, err := http.Get(url)\n if err != nil {\n return nil, err\n }\n defer resp.Body.Close()\n ", "completion": " return io.ReadAll(resp.Body)", "description": "HTTP GET"},
        ],
        DIFFICULTY_HARD: [
            {"context": "type TreeNode struct {\n Val int\n Left *TreeNode\n Right *TreeNode\n}\n\nfunc (root *TreeNode) InorderTraversal() []int {\n var result []int\n var inorder func(*TreeNode)\n inorder = func(node *TreeNode) {\n if node == nil {\n return\n }\n ", "completion": " inorder(node.Left)\n result = append(result, node.Val)\n inorder(node.Right)", "description": "Tree inorder"},
            {"context": "func (c *Client) StreamProcess(ctx context.Context, req *Request, stream chan<- *Response) error {\n for {\n select {\n case <-ctx.Done():\n return ctx.Err()\n default:\n result, err := c.processOne(req)\n if err != nil {\n return err\n }\n ", "completion": " select {\n case stream <- result:\n case <-ctx.Done():\n return ctx.Err()\n }", "description": "Streaming"},
        ],
    },
    "rust": {
        DIFFICULTY_EASY: [
            {"context": "fn greet(name: &str) -> String {", "completion": ' format!("Hello, {}!", name)', "description": "Greet function"},
            {"context": "fn add(a: i32, b: i32) -> i32 {", "completion": " a + b", "description": "Add function"},
            {"context": "struct Person {\n name: String,\n ", "completion": " age: u32,", "description": "Struct"},
            {"context": "let numbers = vec![1, 2, 3, 4, 5];\nfor num in &numbers {\n ", "completion": " println!(\"{}\", num);", "description": "For loop"},
            {"context": "fn main() {\n let result = match value {\n Some(x) => x,\n ", "completion": " None => 0,", "description": "Match"},
        ],
        DIFFICULTY_MEDIUM: [
            {"context": "impl Person {\n fn new(name: String, age: u32) -> Self {", "completion": " Person { name, age }", "description": "Constructor"},
            {"context": "fn fetch_data(url: &str) -> Result<String, Error> {\n let response = reqwest::blocking::get(url)?;\n ", "completion": " let body = response.text()?;\n Ok(body)", "description": "HTTP request"},
            {"context": "fn process_items<T: Display>(items: Vec<T>) -> String {\n items\n .iter()\n .enumerate()\n .map(|(i, item)| format!(\"{}: {}\", i, item))\n ", "completion": " .collect::<Vec<_>>()\n .join(\", \")", "description": "Iterator chain"},
            {"context": "fn spawn_worker(jobs: Arc<Mutex<Vec<Job>>>) {\n thread::spawn(move || {\n loop {\n let job = {\n let mut jobs = jobs.lock().unwrap();\n jobs.pop()\n };\n match job {\n Some(job) => job.execute(),\n ", "completion": " None => break,\n };\n }\n });", "description": "Worker thread"},
        ],
        DIFFICULTY_HARD: [
            {"context": "pub struct LRUCache<K, V> {\n capacity: usize,\n cache: LinkedHashMap<K, V>,\n}\n\nimpl<K: Eq + Hash + Clone, V: Clone> LRUCache<K, V> {\n pub fn get(&mut self, key: &K) -> Option<&V> {\n if self.cache.contains_key(key) {\n ", "completion": " self.cache.remove(key);\n let value = self.cache[key].clone();\n self.cache.insert(key.clone(), value);\n self.cache.get(key)\n } else {\n None\n }", "description": "LRU Cache"},
            {"context": "pub trait Observer<T> {\n fn update(&self, event: &T);\n}\n\npub struct Subject<T> {\n observers: Vec<Box<dyn Observer<T>>>,\n}\n\nimpl<T> Subject<T> {\n pub fn notify(&self, event: &T) {\n for observer in &self.observers {\n ", "completion": " observer.update(event);", "description": "Observer pattern"},
        ],
    },
}

# Prompt-variant identifiers; each maps to an entry in VARIANT_PROMPTS.
VARIANTS = ["basic", "explain", "debug", "optimize"]

# Per-variant system prompt and user-message prefix used when building
# the chat messages for each example.
VARIANT_PROMPTS = {
    "basic": {"system": "You are a helpful AI assistant that helps with code completion.", "user_prefix": "Complete the following code:\n\n"},
    "explain": {"system": "You are a helpful AI assistant that explains and completes code.", "user_prefix": "Explain what this code does and complete it:\n\n"},
    "debug": {"system": "You are a helpful AI assistant that finds bugs and suggests fixes.", "user_prefix": "There's a bug in this code. Fix and complete it:\n\n"},
    "optimize": {"system": "You are a helpful AI assistant that optimizes code for performance.", "user_prefix": "Optimize this code and complete it:\n\n"},
}
109
-
110
-
111
def create_completion_example(context, completion, language, difficulty, variant, description):
    """Build one chat-format training record from a completion template.

    Pairs a variant-specific prompt (looked up in VARIANT_PROMPTS) with
    an assistant reply containing the completed code, and carries the
    template metadata alongside for later filtering and statistics.
    """
    prompts = VARIANT_PROMPTS[variant]
    user_turn = f"{prompts['user_prefix']}```{language}\n{context}```"
    assistant_turn = f"Here's the completed code:\n\n```{language}\n{context}{completion}\n```"
    return {
        "messages": [
            {"role": "system", "content": prompts["system"]},
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ],
        "language": language,
        "difficulty": difficulty,
        "variant": variant,
        "description": description,
        "context": context,
        "completion": completion,
    }
128
-
129
-
130
def generate_examples_for_language(language, difficulty, num_examples, variants):
    """Produce ``num_examples`` records for one language/difficulty pair.

    Templates are reused round-robin when more examples are requested
    than templates exist; the prompt variant is sampled at random per
    example.
    """
    pool = CODE_TEMPLATES[language][difficulty]
    records = []
    for idx in range(num_examples):
        tmpl = pool[idx % len(pool)]
        records.append(create_completion_example(
            context=tmpl["context"],
            completion=tmpl["completion"],
            language=language,
            difficulty=difficulty,
            variant=random.choice(variants),
            description=tmpl["description"],
        ))
    return records
147
-
148
-
149
def generate_dataset(num_examples=1000, languages=None, difficulties=None, variants=None, balance=True):
    """Generate the complete dataset.

    Args:
        num_examples: total number of examples to produce.
        languages: languages to draw from (default: LANGUAGES, restricted
            to those that actually have templates).
        difficulties: difficulty tiers to include (default: all three).
        variants: prompt variants to sample from (default: VARIANTS).
        balance: when True, spread examples evenly across every
            (language, difficulty) pair; when False, sample pairs
            uniformly at random.

    Returns:
        A shuffled list of example dicts (see create_completion_example).
    """
    if languages is None:
        languages = LANGUAGES
    if difficulties is None:
        difficulties = [DIFFICULTY_EASY, DIFFICULTY_MEDIUM, DIFFICULTY_HARD]
    if variants is None:
        variants = VARIANTS

    # Fix: LANGUAGES lists "typescript" but CODE_TEMPLATES has no entry
    # for it, which used to raise KeyError. Skip languages that have no
    # templates instead of crashing.
    languages = [lang for lang in languages if lang in CODE_TEMPLATES]

    examples = []

    if balance:
        # Fix: the old remainder logic added the leftover examples once
        # per language, so balanced runs produced a total different from
        # num_examples (e.g. 1005 for 1000 over 5 langs x 3 tiers).
        # Distribute base + remainder over the (language, difficulty)
        # grid so the total is exact.
        combos = [(lang, diff) for lang in languages for diff in difficulties]
        base, extra = divmod(num_examples, len(combos))
        for idx, (lang, diff) in enumerate(combos):
            count = base + (1 if idx < extra else 0)
            examples.extend(generate_examples_for_language(lang, diff, count, variants))
    else:
        for _ in range(num_examples):
            lang = random.choice(languages)
            diff = random.choice(difficulties)
            template = random.choice(CODE_TEMPLATES[lang][diff])
            variant = random.choice(variants)
            examples.append(create_completion_example(
                context=template["context"],
                completion=template["completion"],
                language=lang,
                difficulty=diff,
                variant=variant,
                description=template["description"],
            ))

    random.shuffle(examples)
    return examples
188
-
189
-
190
def save_jsonl(examples, output_path):
    """Write *examples* to *output_path* as one JSON object per line,
    creating parent directories as needed."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = [json.dumps(ex, ensure_ascii=False) for ex in examples]
    with open(target, 'w', encoding='utf-8') as fh:
        fh.write(''.join(line + '\n' for line in serialized))
197
-
198
-
199
def save_json(examples, output_path):
    """Write *examples* to *output_path* as a single pretty-printed JSON
    array, creating parent directories as needed."""
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding='utf-8') as fh:
        json.dump(examples, fh, ensure_ascii=False, indent=2)
205
-
206
-
207
def main():
    """CLI entry point: parse arguments, generate the dataset, save it
    in the requested format(s), and print per-language/difficulty
    statistics."""
    parser = argparse.ArgumentParser(description="Generate synthetic code completion training data")
    parser.add_argument("--num-examples", type=int, default=1000, help="Number of examples to generate")
    parser.add_argument("--output-dir", type=str, default="training-data/code-completion", help="Output directory")
    parser.add_argument("--output-format", choices=["jsonl", "json", "both"], default="jsonl", help="Output format")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()

    # Seed the RNG so repeated runs with the same seed are reproducible.
    random.seed(args.seed)

    print(f"Generating {args.num_examples} code completion training examples...")
    print(f" Languages: {LANGUAGES}")
    print(f" Output directory: {args.output_dir}")

    examples = generate_dataset(
        num_examples=args.num_examples,
        languages=LANGUAGES,
        difficulties=[DIFFICULTY_EASY, DIFFICULTY_MEDIUM, DIFFICULTY_HARD],
        variants=VARIANTS
    )

    out_dir = Path(args.output_dir)

    # Emit JSONL and/or JSON depending on --output-format.
    if args.output_format in ["jsonl", "both"]:
        jsonl_target = out_dir / "code_completion.jsonl"
        save_jsonl(examples, str(jsonl_target))
        print(f"Saved JSONL: {jsonl_target}")

    if args.output_format in ["json", "both"]:
        json_target = out_dir / "code_completion.json"
        save_json(examples, str(json_target))
        print(f"Saved JSON: {json_target}")

    # Summary statistics over the generated set.
    print(f"\nStatistics:")
    print(f" Total examples: {len(examples)}")

    lang_counts = {}
    diff_counts = {}
    for record in examples:
        lang_counts[record["language"]] = lang_counts.get(record["language"], 0) + 1
        diff_counts[record["difficulty"]] = diff_counts.get(record["difficulty"], 0) + 1

    print(f" By language:")
    for name, count in sorted(lang_counts.items(), key=lambda item: item[1], reverse=True):
        print(f" - {name}: {count}")

    print(f" By difficulty:")
    for name, count in sorted(diff_counts.items(), key=lambda item: item[1], reverse=True):
        print(f" - {name}: {count}")

    print(f"\nGeneration complete!")


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/generate_from_rtmp.ts DELETED
@@ -1,114 +0,0 @@
1
- // Generate synthetic training data from RTMP codebase
2
- //
3
- // Extracts code examples and patterns from RTMP to create training data
4
- // for stack-2.9.
5
-
6
- import { readdir, readFile, writeFile, mkdir } from 'fs/promises'
7
- import { join, basename } from 'path'
8
-
9
- const RTMP_DIR = '/Users/walidsobhi/.openclaw/workspace/RTMP'
10
- const OUTPUT_DIR = '/Users/walidsobhi/.openclaw/workspace/stack-2.9/training-data/src-derived'
11
-
12
// One chat-format fine-tuning record: an ordered list of role/content turns.
interface TrainingExample {
  messages: Array<{
    role: string // 'system' | 'user' | 'assistant' in the examples built here
    content: string // the turn's text
  }>
}

// System prompt prepended to every generated conversation.
const SYSTEM_PROMPT = `You are Stack, an AI coding assistant based on Claude Code. You help with programming tasks, answer questions, use tools when needed, and provide code examples.`
20
-
21
- async function extractCodeExamples(): Promise<TrainingExample[]> {
22
- const examples: TrainingExample[] = []
23
-
24
- // Extract from RTMP tools prompts - these are good instruction examples
25
- const toolsDir = join(RTMP_DIR, 'tools')
26
- const toolDirs = await readdir(toolsDir).catch(() => [])
27
-
28
- for (const toolDir of toolDirs.slice(0, 10)) { // Limit to 10 tools
29
- const promptPath = join(toolsDir, toolDir, 'prompt.ts')
30
- try {
31
- const content = await readFile(promptPath, 'utf-8')
32
-
33
- // Extract useful code patterns
34
- const toolName = toolDir.replace('Tool', '')
35
-
36
- // Create example from tool usage
37
- examples.push({
38
- messages: [
39
- { role: 'system', content: SYSTEM_PROMPT },
40
- {
41
- role: 'user',
42
- content: `How do I use the ${toolName} tool?`
43
- },
44
- {
45
- role: 'assistant',
46
- content: `The ${toolName} tool allows you to ${getToolDescription(toolName)}. Here's how to use it:\n\n\`\`\`\n// Example usage\n// See the tool source for complete documentation\n\`\`\`\n\nKey features:\n- Feature 1\n- Feature 2`
47
- }
48
- ]
49
- })
50
- } catch {
51
- // Skip if no prompt
52
- }
53
- }
54
-
55
- // Extract from RTMP commands
56
- const commandsDir = join(RTMP_DIR, 'commands')
57
- try {
58
- const commandDirs = await readdir(commandsDir)
59
- for (const cmd of commandDirs.slice(0, 5)) {
60
- examples.push({
61
- messages: [
62
- { role: 'system', content: SYSTEM_PROMPT },
63
- {
64
- role: 'user',
65
- content: `How do I use the /${cmd} command?`
66
- },
67
- {
68
- role: 'assistant',
69
- content: `The /${cmd} command provides ${cmd} functionality. Use it by typing /${cmd} in your prompt.`
70
- }
71
- ]
72
- })
73
- }
74
- } catch {
75
- // Ignore
76
- }
77
-
78
- return examples
79
- }
80
-
81
- function getToolDescription(toolName: string): string {
82
- const descriptions: Record<string, string> = {
83
- 'Bash': 'execute shell commands and get output',
84
- 'FileRead': 'read files from the filesystem',
85
- 'FileWrite': 'write content to files',
86
- 'FileEdit': 'make targeted edits to files',
87
- 'Glob': 'find files matching patterns',
88
- 'Grep': 'search for text in files',
89
- 'LSP': 'get language server features like autocomplete',
90
- 'MCP': 'use Model Context Protocol servers',
91
- 'Task': 'create and manage task lists',
92
- 'Todo': 'track tasks and todo items'
93
- }
94
- return descriptions[toolName] || 'perform its designated function'
95
- }
96
-
97
- async function main() {
98
- console.log('Generating synthetic training data from RTMP...')
99
-
100
- // Ensure output directory exists
101
- await mkdir(OUTPUT_DIR, { recursive: true }).catch(() => {})
102
-
103
- const examples = await extractCodeExamples()
104
- console.log(`Generated ${examples.length} training examples`)
105
-
106
- // Write to JSONL
107
- const outputPath = join(OUTPUT_DIR, 'rtmp_examples.jsonl')
108
- const content = examples.map(e => JSON.stringify(e)).join('\n')
109
- await writeFile(outputPath, content)
110
-
111
- console.log(`Written to ${outputPath}`)
112
- }
113
-
114
- main().catch(console.error)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/generate_random_synthetic.py DELETED
@@ -1,141 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Generate random synthetic tool-use examples.
4
- Uses tool catalog to create syntactically valid random conversations.
5
- """
6
-
7
- import json
8
- import random
9
- from pathlib import Path
10
- import argparse
11
-
12
- def load_tool_catalog(path: str):
13
- with open(path, 'r') as f:
14
- return json.load(f)
15
-
16
- def random_value_for_type(param_name: str) -> Any:
17
- """Generate a plausible random value based on parameter name."""
18
- if 'file' in param_name or 'path' in param_name:
19
- return random.choice(['src/main.py', 'README.md', 'package.json', 'config.yaml', 'tests/test.js'])
20
- elif 'command' in param_name or 'cmd' in param_name:
21
- return random.choice(['npm test', 'pytest', 'git status', 'ls -la', 'make build'])
22
- elif 'pattern' in param_name or 'glob' in param_name:
23
- return random.choice(['**/*.py', '**/*.js', '**/*.md'])
24
- elif 'query' in param_name or 'search' in param_name:
25
- return random.choice(['TODO', 'FIXME', 'function main'])
26
- elif 'url' in param_name or 'uri' in param_name:
27
- return random.choice(['https://api.example.com', 'mcp://server/resource'])
28
- elif 'status' in param_name:
29
- return random.choice(['pending', 'in_progress', 'completed'])
30
- elif 'id' in param_name or 'task_id' in param_name:
31
- return random.randint(100, 999)
32
- elif 'name' in param_name:
33
- return random.choice(['agent1', 'myteam', 'task123'])
34
- elif 'content' in param_name or 'text' in param_name:
35
- return 'Lorem ipsum dolor sit amet...'
36
- elif 'directory' in param_name or 'dir' in param_name:
37
- return random.choice(['.', 'src', 'tests', 'lib'])
38
- elif 'branch' in param_name:
39
- return random.choice(['main', 'develop', 'feature/new'])
40
- else:
41
- return f"value_{random.randint(1,100)}"
42
-
43
- def generate_random_example(tools: List[Dict], tool_count: int = 1) -> Dict[str, Any]:
44
- """Generate a random conversation with one or more tool uses."""
45
- tools_sample = random.sample(tools, min(tool_count, len(tools)))
46
-
47
- # Build messages
48
- messages = []
49
-
50
- # Random user prompt
51
- user_prompt = random.choice([
52
- "Help me with something",
53
- "Do a task",
54
- "I need assistance",
55
- "Can you handle this?",
56
- "Execute this",
57
- "Run this operation"
58
- ])
59
- messages.append({"role": "user", "content": user_prompt})
60
-
61
- # For each tool, add assistant tool-use and tool-result
62
- for i, tool in enumerate(tools_sample):
63
- tool_name = tool.get("tool") or tool.get("name", "UnknownTool")
64
-
65
- # Generate random parameters based on tool's expected inputs
66
- # We don't have strict schema, so make up plausible params
67
- tool_input = {}
68
- for j in range(random.randint(1, 3)):
69
- param_name = random.choice(['file_path', 'command', 'pattern', 'query', 'url', 'id', 'name', 'directory'])
70
- tool_input[param_name] = random_value_for_type(param_name)
71
-
72
- # Assistant uses tool
73
- messages.append({
74
- "role": "assistant",
75
- "content": f"Using {tool_name}...",
76
- "tool_use": {
77
- "name": tool_name,
78
- "input": tool_input
79
- }
80
- })
81
-
82
- # Tool result
83
- result_content = f"Operation completed successfully. Affected items: {random.randint(1,10)}"
84
- messages.append({
85
- "role": "user",
86
- "content": "",
87
- "tool_result": {
88
- "tool_use_id": f"tool_{i+1}",
89
- "content": result_content
90
- }
91
- })
92
-
93
- # Assistant acknowledges
94
- messages.append({
95
- "role": "assistant",
96
- "content": random.choice(["Done.", "Completed.", "All set."])
97
- })
98
-
99
- return {
100
- "messages": messages,
101
- "source": "random_synthetic",
102
- "tools_used": [t.get("tool") for t in tools_sample]
103
- }
104
-
105
- def main():
106
- parser = argparse.ArgumentParser()
107
- parser.add_argument("--catalog", type=str, default="training-data/tools/catalog.json")
108
- parser.add_argument("--output", type=str, default="training-data/scaled/random_synthetic.jsonl")
109
- parser.add_argument("--count", type=int, default=10000)
110
- parser.add_argument("--tools-per-example", type=int, default=1)
111
- args = parser.parse_args()
112
-
113
- catalog_path = Path(args.catalog)
114
- output_path = Path(args.output)
115
-
116
- if not catalog_path.exists():
117
- print(f"❌ Catalog not found: {catalog_path}")
118
- return
119
-
120
- tools = load_tool_catalog(catalog_path)
121
- print(f"🔧 Loaded {len(tools)} tools from catalog")
122
-
123
- output_path.parent.mkdir(parents=True, exist_ok=True)
124
-
125
- with open(output_path, 'w') as f:
126
- for i in range(args.count):
127
- example = generate_random_example(tools, args.tools_per_example)
128
- f.write(json.dumps(example) + "\n")
129
- if (i+1) % 1000 == 0:
130
- print(f" Generated {i+1}/{args.count}...", end='\r')
131
-
132
- print(f"\n✨ Generated {args.count} random synthetic examples")
133
- print(f" Saved to: {output_path}")
134
-
135
- # Show sample
136
- with open(output_path, 'r') as f:
137
- sample = json.loads(f.readline())
138
- print(f"\n📝 Sample: {len(sample['messages'])} messages, tools: {sample.get('tools_used')}")
139
-
140
- if __name__ == "__main__":
141
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/generate_synthetic.py DELETED
@@ -1,256 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Generate synthetic training examples using templates.
4
- No external APIs - pure template expansion and variation.
5
- """
6
-
7
- import json
8
- import random
9
- import string
10
- from pathlib import Path
11
- from typing import Dict, List, Any
12
- import argparse
13
-
14
- # Load tool catalog
15
- def load_tools(catalog_path: str) -> List[Dict[str, Any]]:
16
- with open(catalog_path, 'r') as f:
17
- return json.load(f)
18
-
19
- # Template definitions for each tool
20
- def get_tool_templates(tool_name: str) -> List[Dict[str, Any]]:
21
- """Return list of template scenarios for a given tool."""
22
- templates = {
23
- "FileReadTool": [
24
- {
25
- "user": "Read the file {file_path}",
26
- "params": {"file_path": "{file_path}"},
27
- "result": "Contents of {file_path}:\n{file_content}"
28
- },
29
- {
30
- "user": "Show me what's in {file_path}",
31
- "params": {"file_path": "{file_path}"},
32
- "result": "Here's {file_path}:\n{file_content}"
33
- },
34
- {
35
- "user": "Can you open {file_path}?",
36
- "params": {"file_path": "{file_path}"},
37
- "result": "Opening {file_path}...\n{file_content}"
38
- }
39
- ],
40
- "FileWriteTool": [
41
- {
42
- "user": "Create a new file {file_path} with content: {content}",
43
- "params": {"file_path": "{file_path}", "content": "{content}"},
44
- "result": "File {file_path} created successfully"
45
- },
46
- {
47
- "user": "Write this to {file_path}: {content}",
48
- "params": {"file_path": "{file_path}", "content": "{content}"},
49
- "result": "Wrote to {file_path}"
50
- }
51
- ],
52
- "GlobTool": [
53
- {
54
- "user": "Find all {pattern} files",
55
- "params": {"pattern": "{pattern}"},
56
- "result": "Found {count} files:\n{files}"
57
- },
58
- {
59
- "user": "List files matching {pattern}",
60
- "params": {"pattern": "{pattern}"},
61
- "result": "Matches for {pattern}:\n{files}"
62
- }
63
- ],
64
- "GrepTool": [
65
- {
66
- "user": "Search for {pattern} in {directory}",
67
- "params": {"pattern": "{pattern}", "directory": "{directory}"},
68
- "result": "Found {count} matches:\n{matches}"
69
- },
70
- {
71
- "user": "Find all occurrences of {pattern}",
72
- "params": {"pattern": "{pattern}"},
73
- "result": "Search results:\n{matches}"
74
- }
75
- ],
76
- "BashTool": [
77
- {
78
- "user": "Run: {command}",
79
- "params": {"command": "{command}"},
80
- "result": "$ {command}\n{output}"
81
- },
82
- {
83
- "user": "Execute {command}",
84
- "params": {"command": "{command}"},
85
- "result": "Output:\n{output}"
86
- }
87
- ]
88
- }
89
-
90
- # Return templates or generate generic ones if not defined
91
- return templates.get(tool_name, [
92
- {
93
- "user": "Use {tool} with {params}",
94
- "params": {"arg": "value"},
95
- "result": "Operation completed"
96
- }
97
- ])
98
-
99
- def generate_variations(template: Dict[str, str], count: int, tool_name: str) -> List[Dict[str, Any]]:
100
- """Generate multiple variations of a template."""
101
- examples = []
102
-
103
- for _ in range(count):
104
- # Create parameter values
105
- params = generate_params(tool_name, template.get("params", {}))
106
-
107
- # Fill template placeholders
108
- user_prompt = fill_template(template.get("user", ""), params)
109
- tool_params = fill_params(template.get("params", {}), params)
110
- result = fill_template(template.get("result", ""), params)
111
-
112
- # Build conversation
113
- messages = [
114
- {"role": "user", "content": user_prompt},
115
- {
116
- "role": "assistant",
117
- "content": "I'll help with that.",
118
- "tool_use": {
119
- "name": tool_name,
120
- "input": tool_params
121
- }
122
- },
123
- {
124
- "role": "user",
125
- "content": "",
126
- "tool_result": {
127
- "tool_use_id": "tool_1",
128
- "content": result
129
- }
130
- },
131
- {"role": "assistant", "content": "Done!"}
132
- ]
133
-
134
- examples.append({
135
- "messages": messages,
136
- "source": "synthetic_template",
137
- "tool": tool_name
138
- })
139
-
140
- return examples
141
-
142
- def generate_params(tool_name: str, template_params: Dict[str, str]) -> Dict[str, Any]:
143
- """Generate realistic parameter values based on tool and template."""
144
- params = {}
145
-
146
- for key, placeholder in template_params.items():
147
- if placeholder == "{file_path}":
148
- params[key] = random.choice([
149
- "src/main.py", "package.json", "README.md", "src/utils.js",
150
- "tests/test_api.py", "config.yaml", "Dockerfile", "requirements.txt"
151
- ])
152
- elif placeholder == "{pattern}":
153
- params[key] = random.choice([
154
- "**/*.py", "**/*.js", "**/*.ts", "*.md", "src/**/*.test.js"
155
- ])
156
- elif placeholder == "{command}":
157
- params[key] = random.choice([
158
- "ls -la", "pwd", "git status", "npm run build",
159
- "python -m pytest", "make test", "docker ps"
160
- ])
161
- elif placeholder == "{content}":
162
- params[key] = random.choice([
163
- "console.log('Hello, World!');",
164
- "def hello():\n return 'Hello'",
165
- "# TODO: implement\npass",
166
- "import React from 'react';"
167
- ])
168
- elif placeholder == "{directory}":
169
- params[key] = random.choice([
170
- ".", "src", "tests", "lib", "app/components"
171
- ])
172
- else:
173
- # Generic placeholder
174
- params[key] = f"generated_value_{random.randint(1,1000)}"
175
-
176
- return params
177
-
178
- def fill_template(template: str, params: Dict[str, Any]) -> str:
179
- """Replace placeholders in template."""
180
- result = template
181
- for key, value in params.items():
182
- placeholder = f"{{{key}}}"
183
- result = result.replace(placeholder, str(value))
184
- return result
185
-
186
- def fill_params(template_params: Dict[str, str], params: Dict[str, Any]) -> Dict[str, Any]:
187
- """Fill parameter templates."""
188
- filled = {}
189
- for key, placeholder in template_params.items():
190
- if placeholder in [f"{{{k}}}" for k in params.keys()]:
191
- # Find matching param key
192
- param_key = placeholder.strip("{}")
193
- filled[key] = params.get(param_key, placeholder)
194
- else:
195
- filled[key] = params.get(key, placeholder)
196
- return filled
197
-
198
- def main():
199
- parser = argparse.ArgumentParser()
200
- parser.add_argument("--catalog", type=str, default="training-data/tools/catalog.json")
201
- parser.add_argument("--output", type=str, default="training-data/scaled/template_synthetic.jsonl")
202
- parser.add_argument("--examples-per-tool", type=int, default=500)
203
- parser.add_argument("--tools-limit", type=int, default=None, help="Limit number of tools to process")
204
- args = parser.parse_args()
205
-
206
- catalog_path = Path(args.catalog)
207
- output_path = Path(args.output)
208
-
209
- if not catalog_path.exists():
210
- print(f"❌ Tool catalog not found: {catalog_path}")
211
- return
212
-
213
- tools = load_tools(catalog_path)
214
- if args.tools_limit:
215
- tools = tools[:args.tools_limit]
216
-
217
- print(f"🔧 Generating synthetic examples for {len(tools)} tools")
218
- print(f" Target: {args.examples_per_tool} examples per tool")
219
- print(f" Total expected: ~{len(tools) * args.examples_per_tool} examples")
220
-
221
- output_path.parent.mkdir(parents=True, exist_ok=True)
222
-
223
- total_examples = 0
224
- with open(output_path, 'w') as f:
225
- for tool in tools:
226
- tool_name = tool.get("tool") or tool.get("name", "Unknown")
227
- templates = get_tool_templates(tool_name)
228
-
229
- if not templates:
230
- print(f"⚠️ No templates for {tool_name}, skipping")
231
- continue
232
-
233
- # Generate examples for each template
234
- examples_per_template = args.examples_per_tool // len(templates)
235
-
236
- for template in templates:
237
- examples = generate_variations(template, examples_per_template, tool_name)
238
- for ex in examples:
239
- f.write(json.dumps(ex) + "\n")
240
- total_examples += 1
241
-
242
- print(f"✅ {tool_name}: {examples_per_template * len(templates)} examples")
243
-
244
- print(f"\n✨ Generated {total_examples} synthetic examples")
245
- print(f" Saved to: {output_path}")
246
-
247
- # Create a sample
248
- print("\n📝 Sample example:")
249
- with open(output_path, 'r') as f:
250
- first_line = f.readline()
251
- if first_line:
252
- sample = json.loads(first_line)
253
- print(f" User: {sample['messages'][0]['content'][:80]}...")
254
-
255
- if __name__ == "__main__":
256
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/generate_synthetic_v2.py DELETED
@@ -1,316 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Generate high-quality synthetic training data using tool-specific templates.
4
- Each tool gets realistic scenarios with proper parameters.
5
- """
6
-
7
- import json
8
- import random
9
- from pathlib import Path
10
- from typing import Dict, List, Any
11
- import argparse
12
-
13
- # Comprehensive templates for all tools
14
- TOOL_TEMPLATES = {
15
- "AgentTool": [
16
- {"user": "Create an agent to help with testing", "params": {"name": "test_agent", "goal": "Write unit tests"}, "result": "Agent 'test_agent' created"},
17
- {"user": "Spawn a teammate to handle frontend tasks", "params": {"name": "frontend_dev", "skills": ["react", "typescript"]}, "result": "Teammate 'frontend_dev' added to team"}
18
- ],
19
- "AskUserQuestionTool": [
20
- {"user": "Ask the user which framework they prefer", "params": {"question": "Which framework do you want to use: React, Vue, or Angular?"}, "result": "User responded: React"},
21
- {"user": "I need clarification on the requirements", "params": {"question": "Should the API be REST or GraphQL?"}, "result": "User answered: REST"}
22
- ],
23
- "BashTool": [
24
- {"user": "Run tests", "params": {"command": "npm test"}, "result": "PASS src/index.test.js\nTests: 12 passed, 0 failed"},
25
- {"user": "Check git status", "params": {"command": "git status"}, "result": "On branch main\nModified: src/index.js"},
26
- {"user": "Install dependencies", "params": {"command": "pip install -r requirements.txt"}, "result": "Successfully installed Flask==2.0.0"}
27
- ],
28
- "BriefTool": [
29
- {"user": "Give me a brief on this project", "params": {"topic": "project_overview"}, "result": "This is a voice-enabled AI coding assistant built on Qwen2.5-Coder-32B"},
30
- {"user": "Brief me on the architecture", "params": {"topic": "architecture"}, "result": "Stack 2.9 uses vLLM for inference with LoRA fine-tuning"}
31
- ],
32
- "ConfigTool": [
33
- {"user": "Show configuration", "params": {"section": "model"}, "result": "model: Qwen2.5-Coder-32B\ncontext: 32768"},
34
- {"user": "Get settings", "params": {"key": "max_tokens"}, "result": "max_tokens: 4000"}
35
- ],
36
- "EnterPlanModeTool": [
37
- {"user": "Enter plan mode", "params": {"goal": "Refactor authentication module"}, "result": "Plan mode activated for: Refactor authentication module"},
38
- ],
39
- "EnterWorktreeTool": [
40
- {"user": "Create a worktree for feature branch", "params": {"branch": "feature/new-ui"}, "result": "Worktree created at .worktrees/feature_new_ui"},
41
- ],
42
- "ExitWorktreeTool": [
43
- {"user": "Exit current worktree", "params": {}, "result": "Exited worktree, returning to main"},
44
- ],
45
- "FileEditTool": [
46
- {"user": "Fix syntax error in main.py line 10", "params": {"file_path": "src/main.py", "old_string": "prin('hello')", "new_string": "print('hello')"}, "result": "File edited successfully"},
47
- {"user": "Add import statement to app.py", "params": {"file_path": "app.py", "insert_after": "import os", "new_string": "import sys"}, "result": "Import added"},
48
- ],
49
- "FileReadTool": [
50
- {"user": "Read package.json", "params": {"file_path": "package.json"}, "result": "{\n \"name\": \"myapp\",\n \"version\": \"1.0.0\"\n}"},
51
- {"user": "Show me README.md", "params": {"file_path": "README.md"}, "result": "# My Project\n\nDescription here..."},
52
- ],
53
- "FileWriteTool": [
54
- {"user": "Create a new file utils.py", "params": {"file_path": "src/utils.py", "content": "def helper():\n return 'help'"}, "result": "File src/utils.py created"},
55
- ],
56
- "GlobTool": [
57
- {"user": "Find all Python files", "params": {"pattern": "**/*.py"}, "result": "Found 15 files:\nsrc/main.py\nsrc/utils.py\ntests/test_main.py"},
58
- {"user": "List test files", "params": {"pattern": "**/*.test.js"}, "result": "Found 3 files:\ntests/unit.test.js\ntests/integration.test.js"},
59
- ],
60
- "GrepTool": [
61
- {"user": "Search for 'TODO' comments", "params": {"pattern": "TODO"}, "result": "src/main.py:15:# TODO: implement error handling\nsrc/utils.py:42:# TODO: add validation"},
62
- {"user": "Find all console.log statements", "params": {"pattern": "console.log"}, "result": "src/index.js:10:console.log('debug')\nsrc/app.js:25:console.log('start')"},
63
- ],
64
- "LSPTool": [
65
- {"user": "Get definition of function calculateTotal", "params": {"file_path": "src/math.js", "line": 10, "character": 15}, "result": "Definition at src/math.js:20-30\nfunction calculateTotal(items) {...}"},
66
- {"user": "Find references of MyClass", "params": {"file_path": "src/MyClass.ts", "line": 5, "character": 10}, "result": "References:\n- src/main.ts:15\n- tests/MyClass.test.ts:8"},
67
- ],
68
- "ListMcpResourcesTool": [
69
- {"user": "List available MCP resources", "params": {}, "result": "Resources:\n- server1.file_system\n- server2.database\n- server3.api"},
70
- ],
71
- "MCPTool": [
72
- {"user": "Connect to GitHub MCP server", "params": {"server_name": "github"}, "result": "Connected to GitHub MCP server"},
73
- ],
74
- "NotebookEditTool": [
75
- {"user": "Add markdown cell to notebook", "params": {"notebook_path": "analysis.ipynb", "cell_index": 0, "cell_type": "markdown", "content": "# Analysis"}, "result": "Cell added"},
76
- ],
77
- "ReadMcpResourceTool": [
78
- {"user": "Read resource file from MCP", "params": {"uri": "mcp://server1/file.txt"}, "result": "File content here..."},
79
- ],
80
- "RemoteTriggerTool": [
81
- {"user": "Trigger deployment on staging", "params": {"target": "staging-server", "action": "deploy"}, "result": "Deployment triggered, build ID: 12345"},
82
- ],
83
- "SendMessageTool": [
84
- {"user": "Message the design team about the mockups", "params": {"to": "design-team", "subject": "Mockups ready", "body": "Please review the new mockups in Figma"}, "result": "Message sent to design-team"},
85
- ],
86
- "SkillTool": [
87
- {"user": "Run code review skill", "params": {"skill": "code-review", "inputs": {"code": "function foo() { return 1; }"}}, "result": "Review: Use strict equality, add JSDoc"},
88
- ],
89
- "TaskCreateTool": [
90
- {"user": "Create task: Fix login bug", "params": {"title": "Fix login bug", "description": "Users can't log in with valid credentials"}, "result": "Task #123 created"},
91
- ],
92
- "TaskGetTool": [
93
- {"user": "Get details of task 123", "params": {"task_id": 123}, "result": "Task #123: Fix login bug\nStatus: in progress\nAssignee: @dev"},
94
- ],
95
- "TaskListTool": [
96
- {"user": "List all tasks", "params": {"status": "in_progress"}, "result": "Tasks:\n#123 Fix login bug\n#124 Update docs"},
97
- ],
98
- "TaskStopTool": [
99
- {"user": "Stop task 123", "params": {"task_id": 123}, "result": "Task #123 stopped"},
100
- ],
101
- "TaskUpdateTool": [
102
- {"user": "Mark task 123 as complete", "params": {"task_id": 123, "status": "completed"}, "result": "Task #123 marked complete"},
103
- ],
104
- "TeamCreateTool": [
105
- {"user": "Create a team for backend devs", "params": {"team_name": "backend", "members": ["@alice", "@bob"]}, "result": "Team 'backend' created with 2 members"},
106
- ],
107
- "TeamDeleteTool": [
108
- {"user": "Delete the temp team", "params": {"team_name": "temp"}, "result": "Team 'temp' deleted"},
109
- ],
110
- "TodoWriteTool": [
111
- {"user": "Add todo: update documentation", "params": {"text": "Update API documentation"}, "result": "Todo added"},
112
- ],
113
- "ToolSearchTool": [
114
- {"user": "Search for file search tools", "params": {"query": "find files"}, "result": "Found: GlobTool, GrepTool"},
115
- ],
116
- "WebFetchTool": [
117
- {"user": "Fetch the OpenRouter API docs", "params": {"url": "https://openrouter.ai/docs"}, "result": "Fetched 15KB from openrouter.ai/docs"},
118
- ],
119
- "WebSearchTool": [
120
- {"user": "Search for 'Node.js best practices 2024'", "params": {"query": "Node.js best practices 2024"}, "result": "Top results:\n1. Node.js Design Patterns\n2. 2024 Node.js Best Practices Guide"},
121
- ]
122
- }
123
-
124
- # Realistic value pools
125
- FILE_PATHS = ["src/main.py", "src/utils.js", "README.md", "package.json", "config.yaml",
126
- "Dockerfile", "requirements.txt", "tests/test_api.py", "src/components/Button.tsx",
127
- "lib/helpers.py", "app/models.py", "src/index.js", "Makefile"]
128
- COMMANDS = ["npm test", "pytest", "make build", "git status", "ls -la",
129
- "python -m pip install -r requirements.txt", "docker ps", "npm run lint"]
130
- PATTERNS = ["**/*.py", "**/*.js", "**/*.ts", "*.md", "**/*.test.js", "**/__tests__/**/*.py"]
131
- QUESTIONS = ["Which framework should we use?", "Is this a bug or a feature?",
132
- "What's the priority of this task?", "Should we refactor or rewrite?"]
133
-
134
- def fill_placeholders(text: str, params: Dict[str, Any]) -> str:
135
- """Replace all {key} placeholders in text with values from params."""
136
- for key, value in params.items():
137
- placeholder = f"{{{key}}}"
138
- if placeholder in text:
139
- text = text.replace(placeholder, str(value))
140
- return text
141
-
142
- def generate_variations(template: Dict[str, Any], count: int, tool_name: str) -> List[Dict[str, Any]]:
143
- """Generate multiple realistic variations of a template."""
144
- examples = []
145
-
146
- for i in range(count):
147
- params = {}
148
- user_text = template["user"]
149
- result_text = template["result"]
150
-
151
- # Build params by scanning template for placeholders
152
- template_str = user_text + json.dumps(template.get("params", {})) + result_text
153
-
154
- # Determine what placeholders exist
155
- for key, default_val in template.get("params", {}).items():
156
- if isinstance(default_val, str) and ("{" + key + "}") in template_str:
157
- # This is a placeholder - generate dynamic value
158
- if key == "file_path":
159
- params[key] = random.choice(FILE_PATHS)
160
- elif key == "command":
161
- params[key] = random.choice(COMMANDS)
162
- elif key == "pattern":
163
- params[key] = random.choice(PATTERNS)
164
- elif key == "question":
165
- params[key] = random.choice(QUESTIONS)
166
- elif key == "topic":
167
- params[key] = random.choice(["project_overview", "architecture", "team", "timeline"])
168
- elif key == "branch":
169
- params[key] = random.choice(["feature/new-ui", "bugfix/login", "hotfix/security"])
170
- elif key == "name":
171
- params[key] = random.choice(["test_agent", "code_reviewer", "deployment_bot", "test_suite"])
172
- elif key == "goal":
173
- params[key] = random.choice(["Write unit tests", "Refactor legacy code", "Add documentation"])
174
- elif key == "to":
175
- params[key] = random.choice(["team-backend", "design-team", "product-team"])
176
- elif key == "subject":
177
- params[key] = random.choice(["Review needed", "Update available", "Deployment status"])
178
- elif key == "body":
179
- params[key] = "Please review the attached documents."
180
- elif key == "server_name":
181
- params[key] = random.choice(["github", "jira", "slack", "postgres"])
182
- elif key == "action":
183
- params[key] = random.choice(["deploy", "restart", "backup", "migrate"])
184
- elif key == "target":
185
- params[key] = random.choice(["staging", "production", "dev"])
186
- elif key == "skill":
187
- params[key] = random.choice(["code-review", "security-scan", "performance-test"])
188
- elif key == "query":
189
- params[key] = random.choice(["find files", "search code", "list todos"])
190
- elif key == "url":
191
- params[key] = random.choice(["https://api.example.com/docs", "https://github.com/repo"])
192
- elif key == "task_id":
193
- params[key] = random.randint(100, 999)
194
- elif key == "title":
195
- params[key] = random.choice(["Fix bug", "Add feature", "Update docs", "Refactor code"])
196
- elif key == "description":
197
- params[key] = "Detailed description of the task..."
198
- elif key == "team_name":
199
- params[key] = random.choice(["backend", "frontend", "devops", "qa"])
200
- elif key == "members":
201
- params[key] = ["@user1", "@user2"]
202
- elif key == "status":
203
- params[key] = random.choice(["in_progress", "completed", "todo"])
204
- elif key == "text":
205
- params[key] = "Sample todo item"
206
- elif key == "cell_index":
207
- params[key] = random.randint(0, 10)
208
- elif key == "cell_type":
209
- params[key] = random.choice(["code", "markdown"])
210
- elif key == "content":
211
- params[key] = random.choice(["print('hello')", "# TODO", "import React", "def main():\n pass"])
212
- elif key == "uri":
213
- params[key] = "mcp://server/resource"
214
- elif key in ["line", "character"] and "file_path" in params:
215
- params[key] = random.randint(1, 100) if key == "line" else random.randint(1, 50)
216
- else:
217
- # Generic placeholder
218
- params[key] = f"value_{random.randint(1, 100)}"
219
- else:
220
- # Not a placeholder, use the static value as-is
221
- params[key] = default_val
222
-
223
- # Fill user prompt with params
224
- user_prompt = fill_placeholders(user_text, params)
225
- user_prompt = user_prompt.replace("{tool}", tool_name)
226
-
227
- # Build tool input from params
228
- tool_input = {}
229
- for key, template_val in template.get("params", {}).items():
230
- if key in params:
231
- tool_input[key] = params[key]
232
- else:
233
- tool_input[key] = template_val
234
-
235
- # Fill result
236
- result = fill_placeholders(result_text, params)
237
-
238
- # Build conversation
239
- messages = [
240
- {"role": "user", "content": user_prompt},
241
- {
242
- "role": "assistant",
243
- "content": random.choice([
244
- "I'll help with that.",
245
- "Sure, let me do that.",
246
- "Processing your request..."
247
- ]),
248
- "tool_use": {
249
- "name": tool_name,
250
- "input": tool_input
251
- }
252
- },
253
- {
254
- "role": "user",
255
- "content": "",
256
- "tool_result": {
257
- "tool_use_id": "tool_1",
258
- "content": result
259
- }
260
- },
261
- {"role": "assistant", "content": random.choice(["Done!", "Completed.", "All set!"])}
262
- ]
263
-
264
- examples.append({
265
- "messages": messages,
266
- "source": "synthetic_template",
267
- "tool": tool_name
268
- })
269
-
270
- return examples
271
-
272
- def main():
273
- parser = argparse.ArgumentParser()
274
- parser.add_argument("--output", type=str, default="training-data/scaled/template_synthetic.jsonl")
275
- parser.add_argument("--examples-per-tool", type=int, default=500)
276
- parser.add_argument("--tools-limit", type=int, default=None)
277
- args = parser.parse_args()
278
-
279
- output_path = Path(args.output)
280
- output_path.parent.mkdir(parents=True, exist_ok=True)
281
-
282
- tools = list(TOOL_TEMPLATES.keys())
283
- if args.tools_limit:
284
- tools = tools[:args.tools_limit]
285
-
286
- print(f"🔧 Generating synthetic examples for {len(tools)} tools")
287
- print(f" Target: {args.examples_per_tool} per tool")
288
- print(f" Total expected: ~{len(tools) * args.examples_per_tool}")
289
-
290
- total_examples = 0
291
- with open(output_path, 'w') as f:
292
- for tool_name in tools:
293
- templates = TOOL_TEMPLATES[tool_name]
294
- ex_per_template = max(1, args.examples_per_tool // len(templates))
295
-
296
- for template in templates:
297
- examples = generate_variations(template, ex_per_template, tool_name)
298
- for ex in examples:
299
- f.write(json.dumps(ex) + "\n")
300
- total_examples += 1
301
-
302
- print(f"✅ {tool_name}: {ex_per_template * len(templates)} examples")
303
-
304
- print(f"\n✨ Generated {total_examples} synthetic examples")
305
- print(f" Saved to: {output_path}")
306
-
307
- # Show sample
308
- print("\n📝 Sample example:")
309
- with open(output_path, 'r') as f:
310
- sample = json.loads(f.readline())
311
- print(f" Tool: {sample['tool']}")
312
- print(f" User: {sample['messages'][0]['content'][:60]}...")
313
- print(f" Assistant uses: {sample['messages'][1]['tool_use']['name']}")
314
-
315
- if __name__ == "__main__":
316
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/generate_tool_data.py DELETED
@@ -1,615 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Synthetic Tool-Calling Training Data Generator for Stack 2.9
4
- Generates training examples in Qwen2.5-Coder format with tool_calls.
5
- """
6
-
7
- import json
8
- import random
9
- import argparse
10
- from pathlib import Path
11
- from typing import Dict, List, Any
12
- from datetime import datetime
13
-
14
- # ============================================================================
15
- # Tool Definitions (Qwen2.5-Coder format)
16
- # ============================================================================
17
-
18
- TOOL_DEFINITIONS = [
19
- {
20
- "type": "function",
21
- "function": {
22
- "name": "Bash",
23
- "description": "Execute bash commands in the terminal. Use for running shell commands, scripts, git operations, package managers, and system commands.",
24
- "parameters": {
25
- "type": "object",
26
- "properties": {
27
- "command": {
28
- "type": "string",
29
- "description": "The bash command to execute"
30
- },
31
- "timeout": {
32
- "type": "integer",
33
- "description": "Timeout in seconds (default: 30)"
34
- }
35
- },
36
- "required": ["command"]
37
- }
38
- }
39
- },
40
- {
41
- "type": "function",
42
- "function": {
43
- "name": "FileRead",
44
- "description": "Read the contents of a file from the filesystem. Use for viewing source code, configuration files, documentation, or any text-based files.",
45
- "parameters": {
46
- "type": "object",
47
- "properties": {
48
- "path": {
49
- "type": "string",
50
- "description": "Path to the file to read"
51
- },
52
- "offset": {
53
- "type": "integer",
54
- "description": "Line number to start reading from (1-indexed)"
55
- },
56
- "limit": {
57
- "type": "integer",
58
- "description": "Maximum number of lines to read"
59
- }
60
- },
61
- "required": ["path"]
62
- }
63
- }
64
- },
65
- {
66
- "type": "function",
67
- "function": {
68
- "name": "FileWrite",
69
- "description": "Create or overwrite a file with content. Use for creating new files, updating existing files, or writing code, configuration, or documentation.",
70
- "parameters": {
71
- "type": "object",
72
- "properties": {
73
- "path": {
74
- "type": "string",
75
- "description": "Path where the file should be created or written"
76
- },
77
- "content": {
78
- "type": "string",
79
- "description": "The content to write to the file"
80
- },
81
- "append": {
82
- "type": "boolean",
83
- "description": "Append to existing file instead of overwriting (default: false)"
84
- }
85
- },
86
- "required": ["path", "content"]
87
- }
88
- }
89
- },
90
- {
91
- "type": "function",
92
- "function": {
93
- "name": "WebSearch",
94
- "description": "Search the web for information. Use for finding documentation, looking up error messages, researching libraries, or getting up-to-date information.",
95
- "parameters": {
96
- "type": "object",
97
- "properties": {
98
- "query": {
99
- "type": "string",
100
- "description": "The search query to look up on the web"
101
- },
102
- "count": {
103
- "type": "integer",
104
- "description": "Number of results to return (default: 5)"
105
- }
106
- },
107
- "required": ["query"]
108
- }
109
- }
110
- },
111
- {
112
- "type": "function",
113
- "function": {
114
- "name": "Grep",
115
- "description": "Search for patterns in files. Use for finding specific code, function definitions, imports, TODO comments, error patterns, or any text across the codebase.",
116
- "parameters": {
117
- "type": "object",
118
- "properties": {
119
- "pattern": {
120
- "type": "string",
121
- "description": "The search pattern or regex to match"
122
- },
123
- "path": {
124
- "type": "string",
125
- "description": "Directory or file path to search in (default: current directory)"
126
- },
127
- "recursive": {
128
- "type": "boolean",
129
- "description": "Search recursively in subdirectories (default: true)"
130
- },
131
- "file_pattern": {
132
- "type": "string",
133
- "description": "File pattern to filter results (e.g., '*.py', '*.js')"
134
- }
135
- },
136
- "required": ["pattern"]
137
- }
138
- }
139
- }
140
- ]
141
-
142
- # ============================================================================
143
- # Template Data for Generation
144
- # ============================================================================
145
-
146
- FILE_PATHS = [
147
- "src/main.py", "src/utils.py", "src/config.py", "src/models.py",
148
- "src/api.py", "src/handlers.py", "src/middleware.py",
149
- "tests/test_main.py", "tests/test_utils.py", "tests/conftest.py",
150
- "README.md", "LICENSE", "package.json", "requirements.txt",
151
- "config.yaml", "config.json", ".env.example",
152
- "src/components/Button.tsx", "src/components/Header.jsx",
153
- "src/styles.css", "src/index.js", "src/app.js",
154
- "docs/API.md", "docs/ARCHITECTURE.md", "docs/CONTRIBUTING.md",
155
- "scripts/setup.sh", "scripts/deploy.py", "Makefile"
156
- ]
157
-
158
- CODE_SNIPPETS = {
159
- "python": [
160
- "def hello():\n print('Hello, World!')",
161
- "class MyClass:\n def __init__(self):\n self.value = 42",
162
- "import os\nos.path.join('a', 'b')",
163
- "async def fetch_data():\n async with aiohttp.ClientSession() as session:\n return await session.get(url)",
164
- ],
165
- "javascript": [
166
- "const fetch = require('node-fetch');\nconst data = await fetch(url);",
167
- "function handleClick() {\n setCount(count + 1);\n}",
168
- "export default function App() {\n return <div>Hello</div>;\n}",
169
- "const [state, setState] = useState(null);",
170
- ],
171
- "bash": [
172
- "npm install",
173
- "git status",
174
- "pytest -v",
175
- "python -m pytest tests/",
176
- "make build",
177
- "docker build -t myapp .",
178
- "ls -la",
179
- "curl -X GET https://api.example.com",
180
- ]
181
- }
182
-
183
- WEB_SEARCH_QUERIES = [
184
- "python async await best practices",
185
- "javascript array methods map filter reduce",
186
- "TypeScript generics tutorial",
187
- "React hooks useEffect dependency array",
188
- "Node.js error handling best practices",
189
- "Docker vs Kubernetes differences",
190
- "Git rebase vs merge",
191
- "SQL join types explained",
192
- "REST API design principles",
193
- "Python list comprehension examples",
194
- "JavaScript promise async await",
195
- "CSS flexbox vs grid",
196
- "MongoDB vs PostgreSQL",
197
- "AWS Lambda cold start optimization",
198
- "Docker compose tutorial",
199
- ]
200
-
201
- GREP_PATTERNS = [
202
- ("TODO", "*.py", "Find TODO comments in Python files"),
203
- ("function ", "*.js", "Find function definitions in JS"),
204
- ("import ", "*.py", "Find imports in Python"),
205
- ("class ", "*.ts", "Find class definitions in TypeScript"),
206
- ("def ", "*.py", "Find function definitions in Python"),
207
- ("const ", "*.js", "Find const declarations in JS"),
208
- ("interface ", "*.ts", "Find interface definitions"),
209
- ("#pragma", "*.c", "Find pragma directives in C"),
210
- ("SELECT ", "*.sql", "Find SQL queries"),
211
- ("TODO", "*.md", "Find TODOs in documentation"),
212
- ]
213
-
214
- # ============================================================================
215
- # Scenario Templates
216
- # ============================================================================
217
-
218
- SCENARIOS = {
219
- "FileRead": [
220
- {
221
- "user": "Read the file at src/main.py and show me what's in it",
222
- "tool_call": {"name": "FileRead", "arguments": {"path": "src/main.py"}},
223
- "result": "Successfully read file: src/main.py\n```python\ndef main():\n print('Hello from Stack 2.9!')\n\nif __name__ == '__main__':\n main()\n```",
224
- "response": "Here's the contents of `src/main.py`:\n\n```python\ndef main():\n print('Hello from Stack 2.9!')\n\nif __name__ == '__main__':\n main()\n```\n\nThis is a simple Python script that prints a greeting when executed directly."
225
- },
226
- {
227
- "user": "Can you show me the README.md file?",
228
- "tool_call": {"name": "FileRead", "arguments": {"path": "README.md"}},
229
- "result": "Successfully read file: README.md\n```markdown\n# My Project\n\nA sample project for Stack 2.9.\n\n## Installation\n\npip install -r requirements.txt\n```",
230
- "response": "Here's the README.md:\n\n```markdown\n# My Project\n\nA sample project for Stack 2.9.\n\n## Installation\n\npip install -r requirements.txt\n```"
231
- },
232
- {
233
- "user": "What's in the config.yaml file?",
234
- "tool_call": {"name": "FileRead", "arguments": {"path": "config.yaml"}},
235
- "result": "Successfully read file: config.yaml\n```yaml\ndatabase:\n host: localhost\n port: 5432\n name: myapp\n\napp:\n port: 3000\n debug: true\n```",
236
- "response": "The config.yaml contains:\n\n```yaml\ndatabase:\n host: localhost\n port: 5432\n name: myapp\n\napp:\n port: 3000\n debug: true\n```\n\nIt has database connection settings and app configuration."
237
- },
238
- {
239
- "user": "Check the package.json to see the dependencies",
240
- "tool_call": {"name": "FileRead", "arguments": {"path": "package.json"}},
241
- "result": "Successfully read file: package.json\n```json\n{\n \"name\": \"my-app\",\n \"version\": \"1.0.0\",\n \"dependencies\": {\n \"express\": \"^4.18.0\",\n \"react\": \"^18.2.0\"\n }\n}\n```",
242
- "response": "The package.json shows these dependencies:\n\n```json\n{\n \"name\": \"my-app\",\n \"version\": \"1.0.0\",\n \"dependencies\": {\n \"express\": \"^4.18.0\",\n \"react\": \"^18.2.0\"\n }\n}\n```"
243
- },
244
- ],
245
- "FileWrite": [
246
- {
247
- "user": "Create a new file called hello.py with a hello world function",
248
- "tool_call": {"name": "FileWrite", "arguments": {"path": "hello.py", "content": "def hello():\n print('Hello, World!')\n\nif __name__ == '__main__':\n hello()\n"}},
249
- "result": "Successfully wrote file: hello.py",
250
- "response": "Created `hello.py` with a hello world function:\n\n```python\ndef hello():\n print('Hello, World!')\n\nif __name__ == '__main__':\n hello()\n```\n\nYou can run it with `python hello.py`."
251
- },
252
- {
253
- "user": "Write a simple React component to src/components/Button.jsx",
254
- "tool_call": {"name": "FileWrite", "arguments": {"path": "src/components/Button.jsx", "content": "import React from 'react';\n\nexport default function Button({ onClick, children }) {\n return (\n <button onClick={onClick} className=\"btn\">\n {children}\n </button>\n );\n}\n"}},
255
- "result": "Successfully wrote file: src/components/Button.jsx",
256
- "response": "Created `src/components/Button.jsx`:\n\n```jsx\nimport React from 'react';\n\nexport default function Button({ onClick, children }) {\n return (\n <button onClick={onClick} className=\"btn\">\n {children}\n </button>\n );\n}\n```"
257
- },
258
- {
259
- "user": "Create a config.json file with some settings",
260
- "tool_call": {"name": "FileWrite", "arguments": {"path": "config.json", "content": "{\n \"app\": {\n \"name\": \"MyApp\",\n \"version\": \"1.0.0\"\n },\n \"server\": {\n \"port\": 8080,\n \"host\": \"localhost\"\n }\n}\n"}},
261
- "result": "Successfully wrote file: config.json",
262
- "response": "Created `config.json`:\n\n```json\n{\n \"app\": {\n \"name\": \"MyApp\",\n \"version\": \"1.0.0\"\n },\n \"server\": {\n \"port\": 8080,\n \"host\": \"localhost\"\n }\n}\n```"
263
- },
264
- ],
265
- "Bash": [
266
- {
267
- "user": "Run npm install to install dependencies",
268
- "tool_call": {"name": "Bash", "arguments": {"command": "npm install"}},
269
- "result": "added 245 packages in 3.5s",
270
- "response": "Dependencies installed successfully! Added 245 packages."
271
- },
272
- {
273
- "user": "Check git status",
274
- "tool_call": {"name": "Bash", "arguments": {"command": "git status"}},
275
- "result": "On branch main\nYour branch is up to date with 'origin/main'.\n\nChanges not staged for commit:\n modified: README.md\n modified: src/main.py\n\nno changes added to commit (but untracked files present)",
276
- "response": "Git status shows:\n- Modified: README.md, src/main.py (not staged)\n- Untracked files present"
277
- },
278
- {
279
- "user": "Run the tests with pytest",
280
- "tool_call": {"name": "Bash", "arguments": {"command": "pytest -v"}},
281
- "result": "tests/test_main.py::test_hello PASSED\ntests/test_utils.py::test_add PASSED\n\n2 passed in 0.5s",
282
- "response": "All tests passed!\n- `tests/test_main.py::test_hello` ✓\n- `tests/test_utils.py::test_add` ✓"
283
- },
284
- {
285
- "user": "List all files in the current directory",
286
- "tool_call": {"name": "Bash", "arguments": {"command": "ls -la"}},
287
- "result": "total 64\ndrwxr-xr-x 12 user staff 384 Apr 6 00:58 .\ndrwxr-xr-x 9 user staff 288 Apr 6 00:50 ..\n-rw-r--r-- 1 user staff 220 Apr 6 00:50 README.md\ndrwxr-xr-x 3 user staff 96 Apr 6 00:51 src\ndrwxr-xr-x 3 user staff 96 Apr 6 00:51 tests\n...",
288
- "response": "Files in current directory:\n- README.md\n- src/\n- tests/\n- And more (permissions, dates shown)"
289
- },
290
- {
291
- "user": "Build the project with make",
292
- "tool_call": {"name": "Bash", "arguments": {"command": "make build"}},
293
- "result": "Building...\ngcc -o main main.c\nobuild successful!",
294
- "response": "Build completed successfully!"
295
- },
296
- ],
297
- "WebSearch": [
298
- {
299
- "user": "How do I use async/await in Python?",
300
- "tool_call": {"name": "WebSearch", "arguments": {"query": "python async await best practices", "count": 5}},
301
- "result": "Search results for 'python async await best practices':\n1. Real Python - Async IO in Python: A Complete Walkthrough\n2. Python Docs - asyncio - Asynchronous I/O\n3. Towards Data Science - Mastering Python Async/Await\n4. Stack Overflow - Common async/await mistakes\n5. Medium - Async/Await Patterns in Python",
302
- "response": "Here are some resources on Python async/await:\n\n1. **Real Python** - Async IO in Python: A Complete Walkthrough\n2. **Python Docs** - asyncio - Asynchronous I/O\n3. **Towards Data Science** - Mastering Python Async/Await\n\nKey points:\n- Use `async def` to define async functions\n- Use `await` to call other async functions\n- `asyncio.run()` to execute the main coroutine"
303
- },
304
- {
305
- "user": "What's the difference between Docker and Kubernetes?",
306
- "tool_call": {"name": "WebSearch", "arguments": {"query": "Docker vs Kubernetes differences", "count": 5}},
307
- "result": "Search results for 'Docker vs Kubernetes differences':\n1. Docker Blog - Containers 101\n2. Kubernetes.io - What is Kubernetes\n3. Red Hat - Docker vs Kubernetes explained\n4. DevOps.com - Container orchestration comparison\n5. Container Journal - When to use each",
308
- "response": "**Docker** is a container platform - it packages applications into containers.\n\n**Kubernetes** is a container orchestrator - it manages multiple containers across machines.\n\nThink of Docker as individual shipping containers, and Kubernetes as the port management system that coordinates all of them."
309
- },
310
- {
311
- "user": "How do I use React hooks properly?",
312
- "tool_call": {"name": "WebSearch", "arguments": {"query": "React hooks useEffect dependency array", "count": 5}},
313
- "result": "Search results for 'React hooks useEffect dependency array':\n1. React Docs - Synchronizing with Effects\n2. Kent C. Dodds - A Complete Guide to useEffect\n3. Robin Wieruch - React useEffect Hook Tutorial\n4. Stack Overflow - Common useEffect mistakes\n5. LogRocket - useEffect dependency array deep dive",
314
- "response": "**React Hooks Guide:**\n\n1. **useState** - For state management\n2. **useEffect** - For side effects, takes dependency array\n3. **useContext** - For global state\n\nThe dependency array controls when the effect runs. Empty `[]` = once on mount. Include variables to re-run when they change."
315
- },
316
- ],
317
- "Grep": [
318
- {
319
- "user": "Find all TODO comments in the Python files",
320
- "tool_call": {"name": "Grep", "arguments": {"pattern": "TODO", "path": ".", "recursive": True, "file_pattern": "*.py"}},
321
- "result": "Found 3 matches:\nsrc/main.py:14: # TODO: Add error handling\nsrc/utils.py:22: # TODO: Implement caching\ntests/test_main.py:8: # TODO: Add more tests",
322
- "response": "Found 3 TODO comments in Python files:\n- `src/main.py:14` - Add error handling\n- `src/utils.py:22` - Implement caching\n- `tests/test_main.py:8` - Add more tests"
323
- },
324
- {
325
- "user": "Search for function definitions in the JavaScript files",
326
- "tool_call": {"name": "Grep", "arguments": {"pattern": "function ", "path": "src", "recursive": True, "file_pattern": "*.js"}},
327
- "result": "Found 5 matches:\nsrc/index.js:3: function setup() {\nsrc/handlers.js:12: function handleClick(event) {\nsrc/handlers.js:25: function handleSubmit(form) {\nsrc/utils.js:8: function formatDate(date) {\nsrc/utils.js:15: function parseJSON(str) {",
328
- "response": "Found 5 function definitions in src/:\n- `src/index.js:3` - setup()\n- `src/handlers.js:12` - handleClick(event)\n- `src/handlers.js:15` - handleSubmit(form)\n- `src/utils.js:8` - formatDate(date)\n- `src/utils.js:15` - parseJSON(str)"
329
- },
330
- {
331
- "user": "Find all imports in the Python codebase",
332
- "tool_call": {"name": "Grep", "arguments": {"pattern": "import ", "path": ".", "recursive": True, "file_pattern": "*.py"}},
333
- "result": "Found 12 matches:\nsrc/main.py:1: import os\nsrc/main.py:2: import sys\nsrc/main.py:3: import json\nsrc/utils.py:1: from typing import Dict\nsrc/api.py:5: import requests\n...",
334
- "response": "Found 12 import statements across Python files. Main imports:\n- `os`, `sys`, `json` in src/main.py\n- `typing.Dict` in src/utils.py\n- `requests` in src/api.py"
335
- },
336
- {
337
- "user": "Look for class definitions in TypeScript files",
338
- "tool_call": {"name": "Grep", "arguments": {"pattern": "class ", "path": "src", "recursive": True, "file_pattern": "*.ts"}},
339
- "result": "Found 2 matches:\nsrc/models/User.ts:3: class User {\nsrc/models/Config.ts:8: class Config {",
340
- "response": "Found 2 class definitions:\n- `src/models/User.ts:3` - User class\n- `src/models/Config.ts:8` - Config class"
341
- },
342
- ],
343
- }
344
-
345
-
346
- # ============================================================================
347
- # Data Augmentation Helpers
348
- # ============================================================================
349
-
350
- def vary_file_paths(template: Dict) -> Dict:
351
- """Vary file paths in a scenario to create diversity."""
352
- import copy
353
- result = copy.deepcopy(template)
354
-
355
- # Vary the file paths
356
- if "path" in result["tool_call"]["arguments"]:
357
- original_path = result["tool_call"]["arguments"]["path"]
358
- for old_path in FILE_PATHS:
359
- if old_path in result["user"]:
360
- result["user"] = result["user"].replace(old_path, random.choice(FILE_PATHS))
361
- result["tool_call"]["arguments"]["path"] = random.choice(FILE_PATHS)
362
- break
363
-
364
- return result
365
-
366
-
367
- def vary_content(template: Dict) -> Dict:
368
- """Vary content in a scenario."""
369
- import copy
370
- result = copy.deepcopy(template)
371
-
372
- if "content" in result["tool_call"]["arguments"]:
373
- # Vary code snippets
374
- lang = random.choice(["python", "javascript"])
375
- result["tool_call"]["arguments"]["content"] = random.choice(CODE_SNIPPETS[lang])
376
-
377
- return result
378
-
379
-
380
- def vary_bash_command(template: Dict) -> Dict:
381
- """Vary bash commands."""
382
- import copy
383
- result = copy.deepcopy(template)
384
-
385
- if "command" in result["tool_call"]["arguments"]:
386
- original = result["tool_call"]["arguments"]["command"].split()[0] if result["tool_call"]["arguments"]["command"] else ""
387
-
388
- if "npm" in original:
389
- commands = ["npm install", "npm run build", "npm test", "npm start"]
390
- elif "git" in original:
391
- commands = ["git status", "git log --oneline -5", "git diff", "git branch -a"]
392
- elif "pytest" in original:
393
- commands = ["pytest -v", "pytest tests/", "pytest -xvs", "pytest --cov"]
394
- elif "ls" in original:
395
- commands = ["ls -la", "ls -1", "ls -lah"]
396
- elif "make" in original:
397
- commands = ["make build", "make clean", "make test", "make install"]
398
- else:
399
- commands = ["echo 'hello'", "pwd", "whoami", "date"]
400
-
401
- result["tool_call"]["arguments"]["command"] = random.choice(commands)
402
-
403
- return result
404
-
405
-
406
- def vary_search_query(template: Dict) -> Dict:
407
- """Vary web search queries."""
408
- import copy
409
- result = copy.deepcopy(template)
410
-
411
- if "query" in result["tool_call"]["arguments"]:
412
- result["tool_call"]["arguments"]["query"] = random.choice(WEB_SEARCH_QUERIES)
413
-
414
- return result
415
-
416
-
417
- def vary_grep_pattern(template: Dict) -> Dict:
418
- """Vary grep patterns."""
419
- import copy
420
- result = copy.deepcopy(template)
421
-
422
- pattern, file_pattern, _ = random.choice(GREP_PATTERNS)
423
- result["tool_call"]["arguments"]["pattern"] = pattern
424
- result["tool_call"]["arguments"]["file_pattern"] = file_pattern
425
-
426
- return result
427
-
428
-
429
- # ============================================================================
430
- # Main Generation Functions
431
- # ============================================================================
432
-
433
- def create_tool_call_message(tool_call: Dict, tool_call_id: str) -> Dict:
434
- """Create a tool_calls message in Qwen format."""
435
- return {
436
- "role": "assistant",
437
- "content": None,
438
- "tool_calls": [
439
- {
440
- "id": tool_call_id,
441
- "type": "function",
442
- "function": {
443
- "name": tool_call["name"],
444
- "arguments": json.dumps(tool_call["arguments"])
445
- }
446
- }
447
- ]
448
- }
449
-
450
-
451
- def create_tool_message(role: str, tool_call_id: str, tool_name: str, result: str) -> Dict:
452
- """Create a tool message (result of tool execution)."""
453
- return {
454
- "role": role, # typically "tool"
455
- "content": result,
456
- "tool_call_id": tool_call_id,
457
- "name": tool_name
458
- }
459
-
460
-
461
- def generate_example(scenario: Dict, system_prompt: str = None) -> Dict:
462
- """Generate a single training example in Qwen2.5-Coder format."""
463
- if system_prompt is None:
464
- system_prompt = "You are a helpful AI assistant that can use tools to help users solve problems. When you need to perform actions like reading files, running commands, searching the web, or searching code, use the appropriate tool."
465
-
466
- tool_call_id = f"call_${random.randint(1000, 9999)}"
467
-
468
- messages = [
469
- {"role": "system", "content": system_prompt},
470
- {"role": "user", "content": scenario["user"]},
471
- create_tool_call_message(scenario["tool_call"], tool_call_id),
472
- create_tool_message("tool", tool_call_id, scenario["tool_call"]["name"], scenario["result"]),
473
- {"role": "assistant", "content": scenario["response"]}
474
- ]
475
-
476
- return {
477
- "messages": messages,
478
- "tools": TOOL_DEFINITIONS
479
- }
480
-
481
-
482
- def augment_scenario(scenario: Dict, tool_name: str) -> Dict:
483
- """Apply random augmentations to a scenario."""
484
- import random
485
-
486
- augmented = scenario.copy()
487
-
488
- if tool_name == "FileRead":
489
- augmented = vary_file_paths(augmented)
490
- elif tool_name == "FileWrite":
491
- augmented = vary_file_paths(augmented)
492
- augmented = vary_content(augmented)
493
- elif tool_name == "Bash":
494
- augmented = vary_bash_command(augmented)
495
- elif tool_name == "WebSearch":
496
- augmented = vary_search_query(augmented)
497
- elif tool_name == "Grep":
498
- augmented = vary_grep_pattern(augmented)
499
-
500
- return augmented
501
-
502
-
503
- def generate_dataset(num_examples: int = 1000, output_path: str = None) -> List[Dict]:
504
- """Generate the complete dataset."""
505
- examples = []
506
- tools = list(SCENARIOS.keys())
507
-
508
- # Track counts for balance
509
- examples_per_tool = num_examples // len(tools)
510
- remainder = num_examples % len(tools)
511
-
512
- for i, tool_name in enumerate(tools):
513
- # Determine how many examples for this tool
514
- count = examples_per_tool + (1 if i < remainder else 0)
515
-
516
- base_scenarios = SCENARIOS[tool_name]
517
-
518
- for j in range(count):
519
- # Use base scenario and vary
520
- base = base_scenarios[j % len(base_scenarios)]
521
-
522
- # Apply augmentations for variety
523
- if j >= len(base_scenarios):
524
- scenario = augment_scenario(base, tool_name)
525
- else:
526
- scenario = base
527
-
528
- example = generate_example(scenario)
529
- examples.append(example)
530
-
531
- # Shuffle for better training
532
- random.shuffle(examples)
533
-
534
- return examples
535
-
536
-
537
- def save_jsonl(examples: List[Dict], output_path: str):
538
- """Save examples to JSONL format."""
539
- output_file = Path(output_path)
540
- output_file.parent.mkdir(parents=True, exist_ok=True)
541
-
542
- with open(output_file, 'w', encoding='utf-8') as f:
543
- for example in examples:
544
- f.write(json.dumps(example, ensure_ascii=False) + '\n')
545
-
546
-
547
- def save_json(examples: List[Dict], output_path: str):
548
- """Save examples to JSON format."""
549
- output_file = Path(output_path)
550
- output_file.parent.mkdir(parents=True, exist_ok=True)
551
-
552
- with open(output_file, 'w', encoding='utf-8') as f:
553
- json.dump(examples, f, ensure_ascii=False, indent=2)
554
-
555
-
556
- def main():
557
- parser = argparse.ArgumentParser(description="Generate synthetic tool-calling training data")
558
- parser.add_argument("--num-examples", type=int, default=1000, help="Number of examples to generate")
559
- parser.add_argument("--output-dir", type=str, default="training-data", help="Output directory")
560
- parser.add_argument("--output-format", choices=["jsonl", "json", "both"], default="jsonl", help="Output format")
561
- parser.add_argument("--seed", type=int, default=42, help="Random seed")
562
- args = parser.parse_args()
563
-
564
- # Set seed for reproducibility
565
- random.seed(args.seed)
566
-
567
- print(f"🎯 Generating {args.num_examples} tool-calling training examples...")
568
- print(f" Output directory: {args.output_dir}")
569
- print(f" Format: {args.output_format}")
570
- print()
571
-
572
- # Generate dataset
573
- examples = generate_dataset(args.num_examples)
574
-
575
- output_dir = Path(args.output_dir)
576
-
577
- # Save based on format
578
- if args.output_format in ["jsonl", "both"]:
579
- jsonl_path = output_dir / "tool_examples.jsonl"
580
- save_jsonl(examples, str(jsonl_path))
581
- print(f"✅ Saved JSONL: {jsonl_path}")
582
-
583
- if args.output_format in ["json", "both"]:
584
- json_path = output_dir / "tool_examples.json"
585
- save_json(examples, str(json_path))
586
- print(f"✅ Saved JSON: {json_path}")
587
-
588
- # Statistics
589
- print(f"\n📊 Statistics:")
590
- print(f" Total examples: {len(examples)}")
591
-
592
- # Count by tool
593
- tool_counts = {}
594
- for ex in examples:
595
- for msg in ex["messages"]:
596
- if msg.get("tool_calls"):
597
- tool_name = msg["tool_calls"][0]["function"]["name"]
598
- tool_counts[tool_name] = tool_counts.get(tool_name, 0) + 1
599
-
600
- print(f" Examples by tool:")
601
- for tool, count in sorted(tool_counts.items(), key=lambda x: x[1], reverse=True):
602
- print(f" - {tool}: {count}")
603
-
604
- # Show sample
605
- print(f"\n📝 Sample example (first in dataset):")
606
- sample = examples[0]
607
- print(f" Tools defined: {len(sample['tools'])}")
608
- print(f" Messages: {len(sample['messages'])}")
609
- print(f" First user message: {sample['messages'][1]['content'][:60]}...")
610
-
611
- print(f"\n✨ Generation complete!")
612
-
613
-
614
- if __name__ == "__main__":
615
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/generate_tool_use_tests.py DELETED
@@ -1,163 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Tool-Use Evaluation Framework for Stack 2.9.
4
- Generates test cases and evaluates model's tool selection accuracy.
5
- """
6
-
7
- import json
8
- import random
9
- import re
10
- from pathlib import Path
11
- from typing import Dict, List, Any
12
- import argparse
13
-
14
- def load_tool_catalog(path: str) -> List[Dict]:
15
- with open(path, 'r') as f:
16
- return json.load(f)
17
-
18
- def generate_test_case(tool: Dict[str, Any]) -> Dict[str, Any]:
19
- """Generate a single test case for a tool."""
20
- tool_name = tool["tool"]
21
-
22
- # Templates for each tool (simplified)
23
- user_prompts = {
24
- "FileReadTool": [
25
- "Read {file_path}",
26
- "Show me the contents of {file_path}",
27
- "What's in {file_path}?",
28
- "Open {file_path}"
29
- ],
30
- "FileWriteTool": [
31
- "Create a new file {file_path} with content: {content}",
32
- "Write this to {file_path}: {content}",
33
- "Save the following as {file_path}: {content}"
34
- ],
35
- "GlobTool": [
36
- "Find all {pattern} files",
37
- "List files matching {pattern}",
38
- "Show me every {pattern}",
39
- "Search for files like {pattern}"
40
- ],
41
- "GrepTool": [
42
- "Search for {pattern} in {directory}",
43
- "Find all {pattern}",
44
- "Grep for {pattern}",
45
- "Locate {pattern} in the codebase"
46
- ],
47
- "BashTool": [
48
- "Run: {command}",
49
- "Execute {command}",
50
- "Please run {command}",
51
- "Can you execute {command}?"
52
- ]
53
- # ... others use default fallback
54
- }
55
-
56
- prompts = user_prompts.get(tool_name, [
57
- "Use {tool} to do something",
58
- "Execute {tool}",
59
- "Call {tool}"
60
- ])
61
-
62
- # Choose random prompt template
63
- prompt_template = random.choice(prompts)
64
-
65
- # Extract placeholders from template
66
- placeholders = re.findall(r'{(.*?)}', prompt_template)
67
-
68
- # Generate parameter values for each placeholder
69
- params = {}
70
- for ph in placeholders:
71
- if ph == 'file_path':
72
- params[ph] = random.choice([
73
- "src/main.py", "README.md", "package.json",
74
- "config.yaml", "tests/test_api.py", "src/index.js"
75
- ])
76
- elif ph == 'pattern':
77
- params[ph] = random.choice([
78
- "**/*.py", "**/*.js", "**/*.md", "**/*.test.*",
79
- "src/**/*.ts", "lib/**/*.py"
80
- ])
81
- elif ph == 'command':
82
- params[ph] = random.choice([
83
- "npm test", "pytest", "git status", "ls -la",
84
- "make build", "python -m pip install -e ."
85
- ])
86
- elif ph == 'query':
87
- params[ph] = random.choice(["TODO", "FIXME", "BUG", "HACK"])
88
- elif ph == 'directory':
89
- params[ph] = random.choice([".", "src", "tests", "lib", "app"])
90
- elif ph == 'content':
91
- params[ph] = "console.log('test');"
92
- elif ph == 'tool':
93
- params[ph] = tool_name
94
- else:
95
- params[ph] = f"value_{random.randint(1,100)}"
96
-
97
- # Fill prompt template
98
- prompt = prompt_template.format(**params)
99
-
100
- # Build expected tool call
101
- expected_tool = tool_name
102
- # Remove 'tool' param if present (it's just for substitution)
103
- expected_params = {k: v for k, v in params.items() if k != 'tool'}
104
-
105
- return {
106
- "test_id": f"{tool_name}_{random.randint(1000,9999)}",
107
- "prompt": prompt,
108
- "expected_tool": expected_tool,
109
- "expected_params": expected_params,
110
- "tool_description": tool.get("description", ""),
111
- "difficulty": random.choice(["easy", "medium", "hard"])
112
- }
113
-
114
- def generate_test_suite(catalog: List[Dict], tests_per_tool: int = 10) -> List[Dict]:
115
- """Generate test suite for all tools."""
116
- suite = []
117
- for tool in catalog:
118
- for _ in range(tests_per_tool):
119
- test_case = generate_test_case(tool)
120
- suite.append(test_case)
121
- return suite
122
-
123
- def main():
124
- parser = argparse.ArgumentParser()
125
- parser.add_argument("--catalog", type=str, default="training-data/tools/catalog.json")
126
- parser.add_argument("--output", type=str, default="stack-2.9-eval/tool_use/test_cases.json")
127
- parser.add_argument("--tests-per-tool", type=int, default=10)
128
- args = parser.parse_args()
129
-
130
- catalog_path = Path(args.catalog)
131
- output_path = Path(args.output)
132
-
133
- if not catalog_path.exists():
134
- print(f"❌ Catalog not found: {catalog_path}")
135
- return
136
-
137
- tools = load_tool_catalog(catalog_path)
138
- print(f"🔧 Generating test cases for {len(tools)} tools")
139
-
140
- suite = generate_test_suite(tools, args.tests_per_tool)
141
-
142
- output_path.parent.mkdir(parents=True, exist_ok=True)
143
- with open(output_path, 'w') as f:
144
- json.dump(suite, f, indent=2)
145
-
146
- print(f"\n✨ Generated {len(suite)} test cases")
147
- print(f" Saved to: {output_path}")
148
-
149
- # Summary by tool
150
- by_tool = {}
151
- for tc in suite:
152
- tool = tc["expected_tool"]
153
- by_tool[tool] = by_tool.get(tool, 0) + 1
154
-
155
- print("\n📊 Test cases per tool (top 10):")
156
- for tool, count in sorted(by_tool.items(), key=lambda x: x[1], reverse=True)[:10]:
157
- print(f" {tool}: {count}")
158
-
159
- print("\n✅ Test suite ready!")
160
- print(" To evaluate: run tool_use_evaluator.py with a trained model")
161
-
162
- if __name__ == "__main__":
163
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/mine_sessions.py DELETED
@@ -1,233 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Mine OpenClaw/Claude sessions for training data.
4
- Extracts conversations with tool use into JSONL format.
5
- """
6
-
7
- import os
8
- import json
9
- import glob
10
- from pathlib import Path
11
- from typing import Dict, List, Any
12
- import argparse
13
-
14
- def find_session_logs() -> List[Path]:
15
- """Find potential session log files in common locations."""
16
- possible_locations = [
17
- # OpenClaw sessions
18
- Path.home() / ".openclaw" / "sessions",
19
- Path.home() / ".openclaw" / "history",
20
- # Claude Code sessions
21
- Path.home() / ".claude" / "sessions",
22
- Path.home() / ".anthropic" / "sessions",
23
- # Generic
24
- Path.home() / "Documents" / "claude_sessions",
25
- Path.cwd() / "sessions",
26
- Path.cwd() / ".sessions",
27
- ]
28
-
29
- log_files = []
30
- for location in possible_locations:
31
- if location.exists():
32
- # Look for JSON, JSONL, or MD files
33
- for pattern in ["*.json", "*.jsonl", "*.md"]:
34
- log_files.extend(location.glob(pattern))
35
-
36
- return log_files
37
-
38
- def parse_json_conversation(data: Dict[str, Any]) -> List[Dict[str, Any]]:
39
- """Parse a JSON conversation into training examples."""
40
- examples = []
41
-
42
- # Try different known formats
43
- if "messages" in data:
44
- # OpenAI format
45
- messages = data["messages"]
46
- if is_valid_conversation(messages):
47
- examples.append({"messages": messages, "source": "openai_format"})
48
-
49
- elif "conversation" in data:
50
- # Custom format with conversation array
51
- messages = data["conversation"]
52
- if is_valid_conversation(messages):
53
- examples.append({"messages": messages, "source": "custom"})
54
-
55
- elif "turns" in data:
56
- # Turn-based format
57
- turns = data["turns"]
58
- messages = []
59
- for turn in turns:
60
- if "role" in turn and "content" in turn:
61
- messages.append({
62
- "role": turn["role"],
63
- "content": turn["content"],
64
- "tool_use": turn.get("tool_use"),
65
- "tool_result": turn.get("tool_result")
66
- })
67
- if is_valid_conversation(messages):
68
- examples.append({"messages": messages, "source": "turn_based"})
69
-
70
- return examples
71
-
72
- def is_valid_conversation(messages: List[Dict[str, Any]]) -> bool:
73
- """Check if message list is a valid conversation with tool use."""
74
- if not isinstance(messages, list) or len(messages) < 2:
75
- return False
76
-
77
- # Must have at least one user and one assistant message
78
- roles = [m.get("role") for m in messages if "role" in m]
79
- if "user" not in roles or "assistant" not in roles:
80
- return False
81
-
82
- return True
83
-
84
- def parse_markdown_conversation(text: str) -> List[Dict[str, Any]]:
85
- """Parse Markdown logs (Claude Code format typically)."""
86
- examples = []
87
-
88
- # Claude Code / chat format often has blocks like:
89
- # User: ...
90
- # Assistant: ...
91
- # or with tool use in special blocks
92
-
93
- lines = text.split("\n")
94
- current_role = None
95
- current_content = []
96
- messages = []
97
-
98
- for line in lines:
99
- line = line.rstrip()
100
-
101
- # Detect role changes
102
- if line.startswith("**User:**") or line.startswith("User:"):
103
- if current_role:
104
- messages.append({
105
- "role": current_role,
106
- "content": "\n".join(current_content).strip()
107
- })
108
- current_role = "user"
109
- current_content = [line.split(":", 1)[1].strip()] if ":" in line else []
110
- elif line.startswith("**Assistant:**") or line.startswith("Assistant:"):
111
- if current_role:
112
- messages.append({
113
- "role": current_role,
114
- "content": "\n".join(current_content).strip()
115
- })
116
- current_role = "assistant"
117
- current_content = [line.split(":", 1)[1].strip()] if ":" in line else []
118
- elif line.startswith("**Tool:**") or line.startswith("Tool Use:"):
119
- if current_role:
120
- messages.append({
121
- "role": current_role,
122
- "content": "\n".join(current_content).strip()
123
- })
124
- current_role = "assistant"
125
- # Start tool use block
126
- current_content = []
127
- # Could parse tool name and parameters
128
- else:
129
- if current_role:
130
- current_content.append(line)
131
-
132
- # Don't forget last message
133
- if current_role and current_content:
134
- messages.append({
135
- "role": current_role,
136
- "content": "\n".join(current_content).strip()
137
- })
138
-
139
- if is_valid_conversation(messages):
140
- examples.append({"messages": messages, "source": "markdown"})
141
-
142
- return examples
143
-
144
- def save_examples(examples: List[Dict[str, Any]], output_path: Path):
145
- """Save examples to JSONL file."""
146
- output_path.parent.mkdir(parents=True, exist_ok=True)
147
-
148
- with open(output_path, 'a') as f:
149
- for ex in examples:
150
- f.write(json.dumps(ex) + "\n")
151
-
152
- def main():
153
- parser = argparse.ArgumentParser()
154
- parser.add_argument("--output", type=str, default="training-data/scaled/sessions.jsonl")
155
- parser.add_argument("--dry-run", action="store_true", help="Just list files, don't parse")
156
- args = parser.parse_args()
157
-
158
- output_path = Path(args.output)
159
-
160
- print(f"🔍 Searching for session logs...")
161
- log_files = find_session_logs()
162
-
163
- if not log_files:
164
- print("⚠️ No session logs found in standard locations.")
165
- print(" Expected locations: ~/.openclaw/sessions, ~/.claude/sessions, ~/.anthropic/sessions")
166
- return
167
-
168
- print(f"📁 Found {len(log_files)} log files")
169
-
170
- if args.dry_run:
171
- for f in log_files[:10]:
172
- print(f" - {f}")
173
- if len(log_files) > 10:
174
- print(f" ... and {len(log_files)-10} more")
175
- return
176
-
177
- total_examples = 0
178
- for log_file in log_files:
179
- try:
180
- with open(log_file, 'r', encoding='utf-8', errors='ignore') as f:
181
- content = f.read()
182
-
183
- examples = []
184
-
185
- # Try JSON first
186
- if log_file.suffix in ['.json', '.jsonl']:
187
- if log_file.suffix == '.jsonl':
188
- # Multiple JSON objects per line
189
- for line in content.split('\n'):
190
- line = line.strip()
191
- if line:
192
- try:
193
- data = json.loads(line)
194
- examples.extend(parse_json_conversation(data))
195
- except json.JSONDecodeError:
196
- pass
197
- else:
198
- # Single JSON object
199
- try:
200
- data = json.loads(content)
201
- examples.extend(parse_json_conversation(data))
202
- except json.JSONDecodeError:
203
- # Maybe it's a JSON array
204
- try:
205
- data_list = json.loads(content)
206
- if isinstance(data_list, list):
207
- for data in data_list:
208
- examples.extend(parse_json_conversation(data))
209
- except:
210
- pass
211
- else:
212
- # Markdown or text
213
- examples.extend(parse_markdown_conversation(content))
214
-
215
- if examples:
216
- save_examples(examples, output_path)
217
- total_examples += len(examples)
218
- print(f"✅ {log_file.name}: {len(examples)} examples")
219
-
220
- except Exception as e:
221
- print(f"❌ Error processing {log_file}: {e}")
222
-
223
- print(f"\n✨ Extracted {total_examples} examples from session logs")
224
- print(f" Saved to: {output_path}")
225
-
226
- if total_examples == 0:
227
- print("\n⚠️ No valid conversations found. Consider:")
228
- print(" 1. Check if you have session logs in non-standard locations")
229
- print(" 2. Your logs may be in a different format")
230
- print(" 3. You may need to export conversations from your tools")
231
-
232
- if __name__ == "__main__":
233
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/quality_validate.py DELETED
@@ -1,158 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Quality validation for Stack 2.9 training dataset.
4
- Checks: message structure, tool format, schema compliance.
5
- """
6
-
7
- import json
8
- from pathlib import Path
9
- from typing import Dict, List, Any
10
- import argparse
11
- from collections import Counter
12
-
13
- def load_tool_catalog(path: str) -> Dict[str, Any]:
14
- with open(path, 'r') as f:
15
- return {tool["tool"]: tool for tool in json.load(f)}
16
-
17
- def validate_example(example: Dict[str, Any], tool_catalog: Dict[str, Any]) -> List[str]:
18
- """Validate a single example. Returns list of errors (empty if valid)."""
19
- errors = []
20
-
21
- if "messages" not in example:
22
- errors.append("Missing 'messages' field")
23
- return errors
24
-
25
- messages = example["messages"]
26
- if not isinstance(messages, list) or len(messages) < 2:
27
- errors.append("Invalid messages: must be list with at least 2 messages")
28
- return errors
29
-
30
- # Check roles sequence
31
- roles = [msg.get("role") for msg in messages]
32
- valid_roles = {"system", "user", "assistant"}
33
- if not all(r in valid_roles for r in roles):
34
- errors.append(f"Invalid roles: {roles}")
35
-
36
- # Tool use validation
37
- for msg in messages:
38
- if msg.get("role") == "assistant" and "tool_use" in msg:
39
- tool_use = msg["tool_use"]
40
- if "name" not in tool_use:
41
- errors.append("Tool use missing 'name'")
42
- else:
43
- tool_name = tool_use["name"]
44
- if tool_name not in tool_catalog:
45
- errors.append(f"Unknown tool: {tool_name}")
46
- if "input" not in tool_use:
47
- errors.append(f"Tool use missing 'input' for {tool_name}")
48
-
49
- if msg.get("role") == "user" and "tool_result" in msg:
50
- tool_result = msg["tool_result"]
51
- if "tool_use_id" not in tool_result:
52
- errors.append("Tool result missing 'tool_use_id'")
53
- if "content" not in tool_result:
54
- errors.append("Tool result missing 'content'")
55
-
56
- # Check message content is non-empty (except user with tool_result can be empty)
57
- for i, msg in enumerate(messages):
58
- role = msg.get("role")
59
- content = msg.get("content")
60
- if role == "user" and "tool_result" in msg:
61
- continue # Tool result user message can have empty content
62
- if content is not None and not isinstance(content, str):
63
- errors.append(f"Message content must be string, got {type(content)}")
64
- if content is not None and len(content.strip()) == 0:
65
- errors.append(f"Empty content in {role} message")
66
-
67
- return errors
68
-
69
- def main():
70
- parser = argparse.ArgumentParser()
71
- parser.add_argument("--input", type=str, default="training-data/final/train.jsonl")
72
- parser.add_argument("--catalog", type=str, default="training-data/tools/catalog.json")
73
- parser.add_argument("--output-report", type=str, default="training-data/final/quality_report.json")
74
- args = parser.parse_args()
75
-
76
- input_path = Path(args.input)
77
- catalog_path = Path(args.catalog)
78
-
79
- if not input_path.exists():
80
- print(f"❌ Input not found: {input_path}")
81
- return
82
-
83
- if not catalog_path.exists():
84
- print(f"⚠️ Catalog not found: {catalog_path}, skipping tool validation")
85
- tool_catalog = {}
86
- else:
87
- tool_catalog = load_tool_catalog(catalog_path)
88
- print(f"✅ Loaded tool catalog with {len(tool_catalog)} tools")
89
-
90
- print(f"🔍 Validating {input_path}...")
91
-
92
- total_examples = 0
93
- valid_examples = 0
94
- error_distribution = Counter()
95
- tool_usage = Counter()
96
-
97
- with open(input_path, 'r') as f:
98
- for line in f:
99
- total_examples += 1
100
- try:
101
- example = json.loads(line)
102
- errors = validate_example(example, tool_catalog)
103
-
104
- if errors:
105
- for err in errors:
106
- error_distribution[err] += 1
107
- else:
108
- valid_examples += 1
109
-
110
- # Track tool usage regardless of validation
111
- for msg in example.get("messages", []):
112
- if "tool_use" in msg:
113
- tool_name = msg["tool_use"]["name"]
114
- tool_usage[tool_name] += 1
115
-
116
- except json.JSONDecodeError:
117
- error_distribution["JSON decode error"] += 1
118
-
119
- print(f"\n📊 Validation Results:")
120
- print(f" Total examples: {total_examples}")
121
- print(f" Valid: {valid_examples} ({valid_examples/total_examples*100:.1f}%)")
122
- print(f" Invalid: {total_examples - valid_examples}")
123
-
124
- if error_distribution:
125
- print("\n Error breakdown:")
126
- for err, count in error_distribution.most_common(10):
127
- print(f" - {err}: {count}")
128
-
129
- print("\n Tool usage (top 10):")
130
- for tool, count in tool_usage.most_common(10):
131
- print(f" - {tool}: {count}")
132
-
133
- # Write report
134
- report = {
135
- "total_examples": total_examples,
136
- "valid_examples": valid_examples,
137
- "invalid_examples": total_examples - valid_examples,
138
- "validity_rate": valid_examples / total_examples if total_examples > 0 else 0,
139
- "error_distribution": dict(error_distribution),
140
- "tool_usage": dict(tool_usage),
141
- "generated_at": datetime.datetime.now().isoformat()
142
- }
143
-
144
- output_report = Path(args.output_report)
145
- output_report.parent.mkdir(parents=True, exist_ok=True)
146
- with open(output_report, 'w') as f:
147
- json.dump(report, f, indent=2)
148
-
149
- print(f"\n✅ Report saved: {output_report}")
150
-
151
- if valid_examples / total_examples < 0.9:
152
- print("\n⚠️ Quality below 90%. Consider filtering invalid examples before training.")
153
- else:
154
- print("\n✅ Dataset quality looks good for training!")
155
-
156
- if __name__ == "__main__":
157
- import json, datetime
158
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/training-data-extractor.js DELETED
@@ -1,1098 +0,0 @@
1
- #!/usr/bin/env node
2
- /**
3
- * Stack 2.9 - Enhanced Training Data Extractor
4
- * Extracts training examples from OpenClaw codebase
5
- *
6
- * Features:
7
- * 1. Parse code patterns: function+comment pairs, error messages, test files
8
- * 2. Real conversation parsing (JSON, JSONL, Markdown formats)
9
- * 3. Synthetic examples (50+ per tool)
10
- * 4. JSONL output
11
- */
12
-
13
- import fs from 'fs';
14
- import path from 'path';
15
- import { fileURLToPath } from 'url';
16
- import os from 'os';
17
-
18
- const __filename = fileURLToPath(import.meta.url);
19
- const __dirname = path.dirname(__filename);
20
-
21
- // Paths
22
- const SRC_DIR = path.join(__dirname, 'src');
23
- const OUTPUT_DIR = path.join(__dirname, 'training-data');
24
- const SYNTHETIC_DIR = path.join(OUTPUT_DIR, 'synthetic');
25
- const TOOLS_SCHEMA_DIR = path.join(OUTPUT_DIR, 'tools');
26
- const CODE_PAIRS_DIR = path.join(OUTPUT_DIR, 'code-pairs');
27
- const CONVERSATIONS_DIR = path.join(OUTPUT_DIR, 'conversations');
28
-
29
- // Ensure directories exist
30
- for (const dir of [OUTPUT_DIR, SYNTHETIC_DIR, TOOLS_SCHEMA_DIR, CODE_PAIRS_DIR, CONVERSATIONS_DIR]) {
31
- fs.mkdirSync(dir, { recursive: true });
32
- }
33
-
34
- // ============================================================================
35
- // 1. EXTRACT TOOL SCHEMAS FROM src/tools/
36
- // ============================================================================
37
-
38
- function extractToolSchemas() {
39
- const toolsDir = path.join(SRC_DIR, 'tools');
40
- if (!fs.existsSync(toolsDir)) {
41
- console.log('⚠️ Tools directory not found, skipping...');
42
- return [];
43
- }
44
-
45
- const schemas = [];
46
- const toolDirs = fs.readdirSync(toolsDir).filter(name => {
47
- const stat = fs.statSync(path.join(toolsDir, name));
48
- return stat.isDirectory();
49
- });
50
-
51
- for (const toolDir of toolDirs) {
52
- const toolPath = path.join(toolsDir, toolDir);
53
- const promptFile = path.join(toolPath, 'prompt.ts');
54
- const toolFile = path.join(toolPath, toolDir + '.tsx') || path.join(toolPath, toolDir + '.ts');
55
-
56
- if (fs.existsSync(promptFile) || fs.existsSync(toolFile)) {
57
- try {
58
- const promptContent = fs.existsSync(promptFile) ? fs.readFileSync(promptFile, 'utf-8') : '';
59
- const toolContent = fs.existsSync(toolFile) ? fs.readFileSync(toolFile, 'utf-8') : '';
60
-
61
- // Extract tool description from JSDoc
62
- const descMatch = promptContent.match(/\/\*\*([\s\S]*?)\*\//);
63
- let description = '';
64
- if (descMatch) {
65
- description = descMatch[1]
66
- .replace(/^\s*\* ?/gm, '')
67
- .replace(/^\s*\*/g, '')
68
- .replace(/\*\/$/, '')
69
- .trim()
70
- .substring(0, 300);
71
- }
72
-
73
- // Extract input interface from tool file
74
- let inputSchema = {};
75
- const interfaceMatch = toolContent.match(/interface\s+(\w+Input\w*)\s*\{([\s\S]*?)\}/);
76
- if (interfaceMatch) {
77
- const fields = interfaceMatch[2].match(/(\w+)(\??):\s*([^;]+);/g) || [];
78
- for (const field of fields) {
79
- const match = field.match(/(\w+)(\??):\s*([^;]+);/);
80
- if (match) {
81
- inputSchema[match[1]] = { type: match[3].trim(), optional: match[2] === '?' };
82
- }
83
- }
84
- }
85
-
86
- schemas.push({
87
- tool: toolDir,
88
- description,
89
- hasPrompt: !!promptContent,
90
- hasImplementation: !!toolContent,
91
- inputSchema
92
- });
93
- } catch (e) {
94
- console.log(`⚠️ Error parsing ${toolDir}: ${e.message}`);
95
- }
96
- }
97
- }
98
-
99
- // Write tools catalog
100
- fs.writeFileSync(
101
- path.join(TOOLS_SCHEMA_DIR, 'catalog.json'),
102
- JSON.stringify(schemas, null, 2)
103
- );
104
-
105
- console.log(`✅ Extracted ${schemas.length} tool schemas`);
106
- return schemas;
107
- }
108
-
109
- // ============================================================================
110
- // 2. EXTRACT CODE-COMMENT PAIRS FROM src/
111
- // ============================================================================
112
-
113
- function extractCodeCommentPairs() {
114
- console.log('🔍 Extracting code-comment pairs...');
115
- const pairs = [];
116
-
117
- // Patterns for JSDoc comments
118
- const jsdocPattern = /\/\*\*([\s\S]*?)\*\/\s*\n(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)\s*(?::\s*([^{]+))?\{([\s\S]*?)\n\}/g;
119
- const methodPattern = /\/\*\*([\s\S]*?)\*\/\s*\n\s*(?:async\s+)?(\w+)\s*\([^)]*\)[^:]*\{([\s\S]*?)\n\s*\}/g;
120
-
121
- // Error message patterns
122
- const errorPattern = /(?:throw\s+new\s+Error|logger\.error|console\.error)\s*\(\s*[`"']([^`'"]+)[`'"]/g;
123
- const errorClassPattern = /class\s+(\w+Error\w*)\s+extends\s+Error\s*\{([^}]*)\}/g;
124
-
125
- function processFile(filePath) {
126
- try {
127
- const content = fs.readFileSync(filePath, 'utf-8');
128
- const relativePath = path.relative(SRC_DIR, filePath);
129
-
130
- // Skip test files and mock files for now
131
- if (filePath.includes('__tests__') || filePath.includes('mocks')) return;
132
-
133
- // Extract function + JSDoc pairs
134
- let match;
135
- const funcRegex = /\/\*\*([\s\S]*?)\*\/\s*\n\s*(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)\s*(?::\s*([^;\n]+))?[^{]*\{([\s\S]*?)\n\}/g;
136
-
137
- while ((match = funcRegex.exec(content)) !== null) {
138
- const jsdoc = match[1].replace(/^\s*\*\s*/gm, '').trim();
139
- const funcName = match[2];
140
- const params = match[3].trim();
141
- const returnType = match[4]?.trim() || 'void';
142
- const body = match[5].trim();
143
-
144
- // Only include if meaningful (not too short, has actual logic)
145
- if (body.length > 50 && jsdoc.length > 10) {
146
- pairs.push({
147
- type: 'function',
148
- name: funcName,
149
- path: relativePath,
150
- code: `function ${funcName}(${params})${returnType ? `: ${returnType}` : ''} { ... }`,
151
- fullBody: body.substring(0, 500),
152
- comment: jsdoc.substring(0, 300),
153
- commentType: 'jsdoc'
154
- });
155
- }
156
- }
157
-
158
- // Extract error messages and patterns
159
- const errorRegex = /(?:throw\s+new\s+Error|logger\.error|console\.error)\s*\(\s*[`"']([^`'"]+)[`'"]/g;
160
- let errorMatch;
161
- while ((errorMatch = errorRegex.exec(content)) !== null) {
162
- const errorMsg = errorMatch[1];
163
- // Categorize error type
164
- let category = 'general';
165
- if (errorMsg.includes('not found') || errorMsg.includes('No such')) category = 'not_found';
166
- else if (errorMsg.includes('permission') || errorMsg.includes('denied')) category = 'permission';
167
- else if (errorMsg.includes('invalid') || errorMsg.includes('malformed')) category = 'validation';
168
- else if (errorMsg.includes('timeout')) category = 'timeout';
169
- else if (errorMsg.includes('already')) category = 'conflict';
170
-
171
- pairs.push({
172
- type: 'error_message',
173
- path: relativePath,
174
- message: errorMsg,
175
- category,
176
- fixSuggestion: generateFixSuggestion(errorMsg, category)
177
- });
178
- }
179
-
180
- // Extract class with error handling
181
- const classRegex = /class\s+(\w+)\s*\{([^}]*(?:\{[^}]*\}[^}]*)*)\}/g;
182
- let classMatch;
183
- while ((classMatch = classRegex.exec(content)) !== null) {
184
- const className = match[1];
185
- const classBody = match[2];
186
- // Look for try-catch patterns
187
- if (classBody.includes('try') && classBody.includes('catch')) {
188
- pairs.push({
189
- type: 'error_handling_class',
190
- name: className,
191
- path: relativePath,
192
- pattern: 'try-catch',
193
- example: classBody.substring(0, 400)
194
- });
195
- }
196
- }
197
-
198
- } catch (e) {
199
- // Skip files that can't be read
200
- }
201
- }
202
-
203
- function walkDir(dir, extensions = ['.ts', '.tsx']) {
204
- if (!fs.existsSync(dir)) return;
205
-
206
- const entries = fs.readdirSync(dir, { withFileTypes: true });
207
- for (const entry of entries) {
208
- const fullPath = path.join(dir, entry.name);
209
- if (entry.isDirectory() && !entry.name.startsWith('.') && entry.name !== 'node_modules') {
210
- walkDir(fullPath, extensions);
211
- } else if (entry.isFile() && extensions.some(ext => entry.name.endsWith(ext))) {
212
- processFile(fullPath);
213
- }
214
- }
215
- }
216
-
217
- walkDir(SRC_DIR);
218
-
219
- // Save code-comment pairs
220
- fs.writeFileSync(
221
- path.join(CODE_PAIRS_DIR, 'pairs.json'),
222
- JSON.stringify(pairs, null, 2)
223
- );
224
-
225
- console.log(`✅ Extracted ${pairs.length} code-comment pairs`);
226
- return pairs;
227
- }
228
-
229
- function generateFixSuggestion(message, category) {
230
- const suggestions = {
231
- not_found: 'Check if the resource exists or provide the correct path',
232
- permission: 'Ensure you have the necessary permissions for this operation',
233
- validation: 'Verify the input format and required fields',
234
- timeout: 'Increase timeout duration or check network connectivity',
235
- conflict: 'Check if the resource already exists or needs to be deleted first',
236
- general: 'Review the error message and correct the underlying issue'
237
- };
238
- return suggestions[category] || suggestions.general;
239
- }
240
-
241
- // ============================================================================
242
- // 3. PARSE TEST FILES FOR TEST-GENERATION EXAMPLES
243
- // ============================================================================
244
-
245
- function extractTestExamples() {
246
- console.log('🧪 Extracting test examples...');
247
- const testExamples = [];
248
-
249
- const testPattern = /describe\s*\(\s*['"]([^'"]+)['"](?:\s*,\s*)?\(\s*\)\s*=>\s*\{([^}]*(?:\{[^}]*\}[^}]*)*)\}\s*\)/g;
250
- const itPattern = /it\s*\(\s*['"]([^'"]+)['"](?:\s*,\s*)?(?:async\s+)?\(\s*\)\s*(?:=>\s*)?\{([\s\S]*?)\n\s*\}/g;
251
- const expectPattern = /expect\s*\(([^)]+)\)\.(to[^;(]+)\s*\(([^)]+)\)/g;
252
-
253
- function processTestFile(filePath) {
254
- try {
255
- const content = fs.readFileSync(filePath, 'utf-8');
256
- const relativePath = path.relative(SRC_DIR, filePath);
257
-
258
- let match;
259
- while ((match = testPattern.exec(content)) !== null) {
260
- const testSuite = match[1];
261
- const testBody = match[2];
262
-
263
- // Extract individual it() blocks
264
- const itRegex = /it\s*\(\s*['"]([^'"]+)['"](?:\s*,\s*)?(?:async\s+)?\(\s*\)\s*(?:=>\s*)?\{([\s\S]*?)\n\s*\}/g;
265
- let itMatch;
266
-
267
- while ((itMatch = itRegex.exec(testBody)) !== null) {
268
- const testName = itMatch[1];
269
- const testCode = itMatch[2].trim();
270
-
271
- // Extract assertions
272
- const assertions = [];
273
- const expectRegex = /expect\s*\(([^)]+)\)\.(\w+)\s*\(([^)]*)\)/g;
274
- let expectMatch;
275
-
276
- while ((expectMatch = expectRegex.exec(testCode)) !== null) {
277
- assertions.push({
278
- actual: expectMatch[1],
279
- matcher: expectMatch[2],
280
- expected: expectMatch[3]
281
- });
282
- }
283
-
284
- if (assertions.length > 0) {
285
- testExamples.push({
286
- type: 'test_example',
287
- suite: testSuite,
288
- name: testName,
289
- path: relativePath,
290
- code: testCode.substring(0, 400),
291
- assertions,
292
- isAsync: testCode.includes('await')
293
- });
294
- }
295
- }
296
- }
297
- } catch (e) {
298
- // Skip files that can't be read
299
- }
300
- }
301
-
302
- function walkDir(dir) {
303
- if (!fs.existsSync(dir)) return;
304
-
305
- const entries = fs.readdirSync(dir, { withFileTypes: true });
306
- for (const entry of entries) {
307
- const fullPath = path.join(dir, entry.name);
308
- if (entry.isDirectory()) {
309
- walkDir(fullPath);
310
- } else if (entry.isFile() && (entry.name.endsWith('.test.ts') || entry.name.endsWith('.test.tsx'))) {
311
- processTestFile(fullPath);
312
- }
313
- }
314
- }
315
-
316
- // Look for test files in __tests__ directories
317
- walkDir(SRC_DIR);
318
-
319
- // Save test examples
320
- fs.writeFileSync(
321
- path.join(CODE_PAIRS_DIR, 'test-examples.json'),
322
- JSON.stringify(testExamples, null, 2)
323
- );
324
-
325
- console.log(`✅ Extracted ${testExamples.length} test examples`);
326
- return testExamples;
327
- }
328
-
329
- // ============================================================================
330
- // 4. PARSE REAL CONVERSATIONS FROM SESSION LOGS
331
- // ============================================================================
332
-
333
- function parseConversations() {
334
- console.log('💬 Parsing conversations from session logs...');
335
- const conversations = [];
336
-
337
- // Common session log locations
338
- const sessionLogPaths = [
339
- path.join(os.homedir(), '.claude', 'sessions'),
340
- path.join(os.homedir(), '.openclaw', 'sessions'),
341
- path.join(os.homedir(), '.claude', 'conversations'),
342
- path.join(os.homedir(), '.openclaw', 'conversations'),
343
- path.join(os.homedir(), '.config', 'claude', 'sessions')
344
- ];
345
-
346
- function parseJsonFormat(content, source) {
347
- try {
348
- const data = JSON.parse(content);
349
- if (data.messages && Array.isArray(data.messages)) {
350
- return {
351
- format: 'json',
352
- source,
353
- messages: data.messages,
354
- metadata: data.metadata || {}
355
- };
356
- }
357
- if (data.conversation && data.conversation.messages) {
358
- return {
359
- format: 'json',
360
- source,
361
- messages: data.conversation.messages,
362
- metadata: data.metadata || {}
363
- };
364
- }
365
- } catch (e) {}
366
- return null;
367
- }
368
-
369
- function parseJsonlFormat(content, source) {
370
- const lines = content.trim().split('\n');
371
- const conversations = [];
372
-
373
- for (const line of lines) {
374
- try {
375
- const obj = JSON.parse(line);
376
- if (obj.messages || obj.conversation) {
377
- conversations.push({
378
- format: 'jsonl',
379
- source,
380
- messages: obj.messages || obj.conversation?.messages || [],
381
- metadata: obj.metadata || {}
382
- });
383
- }
384
- } catch (e) {}
385
- }
386
-
387
- return conversations;
388
- }
389
-
390
- function parseMarkdownFormat(content, source) {
391
- const messages = [];
392
- const blocks = content.split(/(?=^##?\s+(?:User|Assistant|System|Human|AI))/m);
393
-
394
- let currentRole = null;
395
- let currentContent = [];
396
-
397
- for (const block of blocks) {
398
- const roleMatch = block.match(/^##?\s+(User|Assistant|System|Human|AI|Assistant \(tool\))/im);
399
- if (roleMatch) {
400
- if (currentRole && currentContent.length > 0) {
401
- messages.push({
402
- role: currentRole,
403
- content: currentContent.join('\n').trim()
404
- });
405
- }
406
- currentRole = roleMatch[1].toLowerCase().replace('assistant (tool)', 'tool');
407
- currentContent = [block.replace(/^##?\s+.*$/m, '').trim()];
408
- } else if (currentRole) {
409
- currentContent.push(block.trim());
410
- }
411
- }
412
-
413
- if (currentRole && currentContent.length > 0) {
414
- messages.push({
415
- role: currentRole,
416
- content: currentContent.join('\n').trim()
417
- });
418
- }
419
-
420
- if (messages.length > 0) {
421
- return {
422
- format: 'markdown',
423
- source,
424
- messages,
425
- metadata: {}
426
- };
427
- }
428
- return null;
429
- }
430
-
431
- function processLogFile(filePath) {
432
- try {
433
- const content = fs.readFileSync(filePath, 'utf-8');
434
- const source = path.relative(os.homedir(), filePath);
435
-
436
- // Try JSON format
437
- if (filePath.endsWith('.json')) {
438
- const parsed = parseJsonFormat(content, source);
439
- if (parsed) {
440
- conversations.push(parsed);
441
- return;
442
- }
443
- }
444
-
445
- // Try JSONL format
446
- if (filePath.endsWith('.jsonl')) {
447
- const parsed = parseJsonlFormat(content, source);
448
- conversations.push(...parsed);
449
- return;
450
- }
451
-
452
- // Try Markdown format
453
- if (filePath.endsWith('.md') || filePath.endsWith('.mdx')) {
454
- const parsed = parseMarkdownFormat(content, source);
455
- if (parsed) {
456
- conversations.push(parsed);
457
- }
458
- }
459
- } catch (e) {
460
- // Skip files that can't be read
461
- }
462
- }
463
-
464
- function walkDir(dir) {
465
- if (!fs.existsSync(dir)) return;
466
-
467
- try {
468
- const entries = fs.readdirSync(dir, { withFileTypes: true });
469
- for (const entry of entries) {
470
- const fullPath = path.join(dir, entry.name);
471
- if (entry.isDirectory()) {
472
- walkDir(fullPath);
473
- } else if (entry.isFile() &&
474
- (entry.name.endsWith('.json') ||
475
- entry.name.endsWith('.jsonl') ||
476
- entry.name.endsWith('.md') ||
477
- entry.name.endsWith('.mdx'))) {
478
- processLogFile(fullPath);
479
- }
480
- }
481
- } catch (e) {
482
- // Skip directories that can't be accessed
483
- }
484
- }
485
-
486
- // Try all known session log locations
487
- for (const logPath of sessionLogPaths) {
488
- walkDir(logPath);
489
- }
490
-
491
- // Save parsed conversations
492
- fs.writeFileSync(
493
- path.join(CONVERSATIONS_DIR, 'parsed.json'),
494
- JSON.stringify(conversations, null, 2)
495
- );
496
-
497
- console.log(`✅ Parsed ${conversations.length} conversations`);
498
- return conversations;
499
- }
500
-
501
- // ============================================================================
502
- // 5. GENERATE SYNTHETIC EXAMPLES (50+ per tool)
503
- // ============================================================================
504
-
505
- function generateSyntheticExamples(tools) {
506
- console.log('🎭 Generating synthetic examples...');
507
- const examples = [];
508
-
509
- // Base system prompts
510
- const systemPrompts = [
511
- 'You are Stack, an AI coding assistant. You help with programming tasks, answer questions, and use tools when needed.',
512
- 'You are Stack, an expert programmer specializing in TypeScript, React, and Node.js.',
513
- 'You are Stack, a helpful AI assistant that can read and write files, run commands, and search code.',
514
- 'You are Stack, an AI coding assistant with access to file system tools, search tools, and more.'
515
- ];
516
-
517
- // =========================================================================
518
- // EXAMPLES: Basic coding questions and answers
519
- // =========================================================================
520
-
521
- const basicQAPairs = [
522
- {
523
- q: 'How do I read a file in Node.js?',
524
- a: 'You can use the `fs` module. Here are the main approaches:\n\n1. Synchronous (simple but blocks):\n```js\nconst fs = require(\'fs\');\nconst content = fs.readFileSync(\'file.txt\', \'utf8\');\n```\n\n2. Async with callbacks:\n```js\nfs.readFile(\'file.txt\', \'utf8\', (err, data) => {\n if (err) throw err;\n console.log(data);\n});\n```\n\n3. Promise-based (recommended):\n```js\nconst fs = require(\'fs\').promises;\nconst data = await fs.readFile(\'file.txt\', \'utf8\');\n```'
525
- },
526
- {
527
- q: 'How do I write to a file in Node.js?',
528
- a: 'Use the `fs` module:\n\n1. Sync (blocks):\n```js\nfs.writeFileSync(\'output.txt\', \'Hello!\', \'utf8\');\n```\n\n2. Async with promises:\n```js\nawait fs.writeFile(\'output.txt\', \'Hello!\', \'utf8\');\n```\n\n3. Append to existing:\n```js\nawait fs.appendFile(\'log.txt\', \'new entry\\n\', \'utf8\');\n```'
529
- },
530
- {
531
- q: 'How do I list files in a directory?',
532
- a: 'Use `fs.readdir()`:\n\n```js\nconst files = await fs.readdir(\'./src\', { withFileTypes: true });\n\nfor (const file of files) {\n if (file.isDirectory()) {\n console.log(`📁 ${file.name}/`);\n } else {\n console.log(`📄 ${file.name}`);\n }\n}\n```'
533
- },
534
- {
535
- q: 'What is the difference between let, const, and var?',
536
- a: 'Key differences:\n\n**const** - Block-scoped, cannot be reassigned (but object properties can change)\n**let** - Block-scoped, can be reassigned\n**var** - Function-scoped (or global), hoisted (avoid in modern JS)\n\n```js\nconst PI = 3.14; // Cannot reassign\nlet count = 0; // Can reassign\ncount = count + 1; // ✓\nvar oldWay = \'hoisted\'; // Avoid - function scoped\n```'
537
- },
538
- {
539
- q: 'How do I use async/await properly?',
540
- a: 'Async/await is syntactic sugar over promises:\n\n```js\n// Basic usage\nasync function fetchData() {\n try {\n const response = await fetch(url);\n const data = await response.json();\n return data;\n } catch (error) {\n console.error(\'Fetch failed:\', error);\n throw error;\n }\n}\n\n// Parallel execution\nconst [users, posts] = await Promise.all([\n fetch(\'/api/users\').then(r => r.json()),\n fetch(\'/api/posts\').then(r => r.json())\n]);\n```'
541
- }
542
- ];
543
-
544
- for (const pair of basicQAPairs) {
545
- examples.push({
546
- messages: [
547
- { role: 'system', content: systemPrompts[0] },
548
- { role: 'user', content: pair.q },
549
- { role: 'assistant', content: pair.a }
550
- ]
551
- });
552
- }
553
-
554
- // =========================================================================
555
- // EXAMPLES: Tool use patterns (per tool)
556
- // =========================================================================
557
-
558
- const toolUsePatterns = [
559
- {
560
- tool: 'FileReadTool',
561
- scenarios: [
562
- {
563
- user: 'Show me the contents of package.json',
564
- assistant: 'I\'ll read the package.json file for you.',
565
- toolName: 'FileReadTool',
566
- toolInput: { file_path: 'package.json' },
567
- resultContent: '{\n "name": "my-project",\n "version": "1.0.0",\n "scripts": {\n "dev": "next dev",\n "build": "next build"\n }\n}',
568
- summary: 'Here\'s what\'s in your package.json:\n\n- **Project name**: my-project\n- **Version**: 1.0.0\n- **Scripts**: dev (next dev), build (next build)'
569
- },
570
- {
571
- user: 'Read the config.ts file from the src directory',
572
- assistant: 'Let me read that config file.',
573
- toolName: 'FileReadTool',
574
- toolInput: { file_path: 'src/config.ts' },
575
- resultContent: 'export const config = {\n apiUrl: \'https://api.example.com\',\n timeout: 5000,\n retries: 3\n};',
576
- summary: 'The config file exports a configuration object with:\n\n- **API URL**: https://api.example.com\n- **Timeout**: 5000ms\n- **Retries**: 3'
577
- },
578
- {
579
- user: 'What\'s in the README.md?',
580
- assistant: 'Reading the README...',
581
- toolName: 'FileReadTool',
582
- toolInput: { file_path: 'README.md' },
583
- resultContent: '# My Project\n\nA brief description of the project.\n\n## Installation\n\nnpm install\n\n## Usage\n\nnpm start',
584
- summary: 'The README contains:\n\n- Project title and description\n- Installation instructions\n- Usage guide'
585
- }
586
- ]
587
- },
588
- {
589
- tool: 'GlobTool',
590
- scenarios: [
591
- {
592
- user: 'Find all TypeScript files in src',
593
- assistant: 'Searching for TypeScript files...',
594
- toolName: 'GlobTool',
595
- toolInput: { pattern: 'src/**/*.ts' },
596
- resultContent: 'src/index.ts\nsrc/utils.ts\nsrc/components/Button.tsx\nsrc/services/api.ts',
597
- summary: 'Found 4 TypeScript files in src/'
598
- },
599
- {
600
- user: 'List all test files',
601
- assistant: 'Looking for test files...',
602
- toolName: 'GlobTool',
603
- toolInput: { pattern: '**/*.test.{ts,tsx}' },
604
- resultContent: '__tests__/sample.test.ts\nsrc/components/__tests__/Button.test.tsx\nsrc/utils/__tests__/helpers.test.ts',
605
- summary: 'Found 3 test files'
606
- },
607
- {
608
- user: 'Find all JSON config files',
609
- assistant: 'Searching for JSON config files...',
610
- toolName: 'GlobTool',
611
- toolInput: { pattern: '**/*.json' },
612
- resultContent: 'package.json\ntsconfig.json\njest.config.json\nsrc/config.json',
613
- summary: 'Found 4 JSON files'
614
- }
615
- ]
616
- },
617
- {
618
- tool: 'GrepTool',
619
- scenarios: [
620
- {
621
- user: 'Find all uses of "TODO" in the code',
622
- assistant: 'Searching for TODO comments...',
623
- toolName: 'GrepTool',
624
- toolInput: { pattern: 'TODO', path: '.' },
625
- resultContent: 'src/utils.ts: // TODO: implement error handling\nsrc/components/Form.tsx: // TODO: add validation\nsrc/api/client.ts: // TODO: add retry logic',
626
- summary: 'Found 3 TODO comments in:\n- src/utils.ts\n- src/components/Form.tsx\n- src/api/client.ts'
627
- },
628
- {
629
- user: 'Search for "async function" in the src',
630
- assistant: 'Finding async functions...',
631
- toolName: 'GrepTool',
632
- toolInput: { pattern: 'async function', path: 'src' },
633
- resultContent: 'src/api/client.ts: async function fetchData() {\nsrc/utils/parser.ts: async function parseJSON() {',
634
- summary: 'Found 2 async functions'
635
- }
636
- ]
637
- },
638
- {
639
- tool: 'BashTool',
640
- scenarios: [
641
- {
642
- user: 'Run npm install',
643
- assistant: 'Installing dependencies...',
644
- toolName: 'BashTool',
645
- toolInput: { command: 'npm install' },
646
- resultContent: 'added 245 packages in 12s\n\n12 packages are looking for funding\n run `npm fund` for details',
647
- summary: 'Successfully installed 245 packages'
648
- },
649
- {
650
- user: 'Check git status',
651
- assistant: 'Checking git status...',
652
- toolName: 'BashTool',
653
- toolInput: { command: 'git status' },
654
- resultContent: 'On branch main\nChanges not staged for commit:\n modified: src/index.ts\n modified: package.json',
655
- summary: 'On branch main with 2 modified files'
656
- },
657
- {
658
- user: 'Run the tests',
659
- assistant: 'Running tests...',
660
- toolName: 'BashTool',
661
- toolInput: { command: 'npm test' },
662
- resultContent: 'PASS __tests__/sample.test.ts\nPASS src/components/__tests__/Button.test.tsx\n\nTest Suites: 2 passed, 2 total\nTests: 5 passed, 5 total',
663
- summary: 'All tests passed! 2 test suites, 5 tests total'
664
- }
665
- ]
666
- },
667
- {
668
- tool: 'FileWriteTool',
669
- scenarios: [
670
- {
671
- user: 'Create a new config file',
672
- assistant: 'Creating the config file...',
673
- toolName: 'FileWriteTool',
674
- toolInput: { file_path: 'config/local.json', content: '{\n "name": "local-config",\n "debug": true\n}' },
675
- resultContent: '',
676
- summary: 'Created config/local.json'
677
- },
678
- {
679
- user: 'Write a simple test file',
680
- assistant: 'Writing the test file...',
681
- toolName: 'FileWriteTool',
682
- toolInput: { file_path: 'test/example.test.ts', content: 'describe(\'Example\', () => {\n it(\'should pass\', () => {\n expect(1 + 1).toBe(2);\n });\n});' },
683
- resultContent: '',
684
- summary: 'Created test/example.test.ts'
685
- }
686
- ]
687
- },
688
- {
689
- tool: 'FileEditTool',
690
- scenarios: [
691
- {
692
- user: 'Add a new export to the index file',
693
- assistant: 'Editing the index file...',
694
- toolName: 'FileEditTool',
695
- toolInput: {
696
- file_path: 'src/index.ts',
697
- old_string: 'export { Config } from \'./config\';',
698
- new_string: 'export { Config } from \'./config\';\nexport { Logger } from \'./logger\';'
699
- },
700
- resultContent: '',
701
- summary: 'Added Logger export to src/index.ts'
702
- }
703
- ]
704
- },
705
- {
706
- tool: 'WebSearchTool',
707
- scenarios: [
708
- {
709
- user: 'Search for the latest React documentation',
710
- assistant: 'Searching the web...',
711
- toolName: 'WebSearchTool',
712
- toolInput: { query: 'React 18 documentation' },
713
- resultContent: 'Found results for React documentation...',
714
- summary: 'Found relevant documentation resources'
715
- }
716
- ]
717
- },
718
- {
719
- tool: 'WebFetchTool',
720
- scenarios: [
721
- {
722
- user: 'Fetch the content of a GitHub README',
723
- assistant: 'Fetching the README...',
724
- toolName: 'WebFetchTool',
725
- toolInput: { url: 'https://github.com/facebook/react' },
726
- resultContent: 'README content...',
727
- summary: 'Successfully fetched the README'
728
- }
729
- ]
730
- }
731
- ];
732
-
733
- // Generate tool use examples
734
- for (const toolPattern of toolUsePatterns) {
735
- for (const scenario of toolPattern.scenarios) {
736
- examples.push({
737
- messages: [
738
- { role: 'system', content: systemPrompts[0] },
739
- { role: 'user', content: scenario.user },
740
- { role: 'assistant', content: scenario.assistant, tool_use: { name: scenario.toolName, input: scenario.toolInput } },
741
- { role: 'user', content: '', tool_result: { tool_use_id: 'tool_1', content: scenario.resultContent } },
742
- { role: 'assistant', content: scenario.summary }
743
- ]
744
- });
745
- }
746
- }
747
-
748
- // =========================================================================
749
- // EXAMPLES: Error handling patterns
750
- // =========================================================================
751
-
752
- const errorHandlingExamples = [
753
- {
754
- user: 'Why did my file write fail?',
755
- context: 'Error: ENOENT: no such file or directory, open \'missing/dir/file.txt\'',
756
- response: 'The error indicates the directory doesn\'t exist. You need to create the parent directory first:\n\n```js\nimport fs from \'fs\'.promises;\nimport path from \'path\';\n\nasync function ensureDir(dirPath) {\n await fs.mkdir(path.dirname(dirPath), { recursive: true });\n}\n\n// Then write\nawait ensureDir(\'missing/dir/file.txt\');\nawait fs.writeFile(\'missing/dir/file.txt\', \'content\');\n```\n\nOr use `fs.mkdirSync(path.dirname(filePath), { recursive: true })` synchronously.'
757
- },
758
- {
759
- user: 'My async function is throwing an error',
760
- context: 'Error: Cannot read property \'map\' of undefined',
761
- response: 'This usually means the data you\'re trying to map over isn\'t defined yet. Here\'s how to handle it:\n\n```js\nasync function getItems() {\n try {\n const data = await fetchData();\n // Check before using\n return data?.items?.map(item => item.id) || [];\n } catch (error) {\n console.error(\'Failed to fetch:\', error);\n return []; // Return fallback\n }\n}\n```\n\nUse optional chaining (`?.`) to safely access nested properties.'
762
- }
763
- ];
764
-
765
- for (const error of errorHandlingExamples) {
766
- examples.push({
767
- messages: [
768
- { role: 'system', content: systemPrompts[0] },
769
- { role: 'user', content: error.user + '\n\nError context: ' + error.context },
770
- { role: 'assistant', content: error.response }
771
- ]
772
- });
773
- }
774
-
775
- // =========================================================================
776
- // EXAMPLES: Multi-step workflows
777
- // =========================================================================
778
-
779
- const multiStepWorkflows = [
780
- {
781
- name: 'Create and test a new component',
782
- steps: [
783
- { role: 'user', content: 'Create a new React component called UserCard' },
784
- { role: 'assistant', content: 'I\'ll create the UserCard component for you.', tool_use: { name: 'FileWriteTool', input: { file_path: 'src/components/UserCard.tsx', content: 'import React from \'react\';\n\ninterface UserCardProps {\n name: string;\n email: string;\n}\n\nexport const UserCard: React.FC<UserCardProps> = ({ name, email }) => {\n return (\n <div className="user-card">\n <h3>{name}</h3>\n <p>{email}</p>\n </div>\n );\n};' } } },
785
- { role: 'user', content: '', tool_result: { tool_use_id: 'tool_1', content: 'File created successfully' } },
786
- { role: 'assistant', content: 'Created the component. Now let me write a test for it.', tool_use: { name: 'FileWriteTool', input: { file_path: 'src/components/__tests__/UserCard.test.tsx', content: 'import React from \'react\';\nimport { render, screen } from \'@testing-library/react\';\nimport { UserCard } from \'../UserCard\';\n\ndescribe(\'UserCard\', () => {\n it(\'renders user name and email\', () => {\n render(<UserCard name="John" email="john@example.com" />);\n expect(screen.getByText(\'John\')).toBeInTheDocument();\n expect(screen.getByText(\'john@example.com\')).toBeInTheDocument();\n });\n});' } } },
787
- { role: 'user', content: '', tool_result: { tool_use_id: 'tool_2', content: 'Test file created' } },
788
- { role: 'assistant', content: 'Now let me run the tests to verify it works.', tool_use: { name: 'BashTool', input: { command: 'npm test -- --testPathPattern=UserCard' } } },
789
- { role: 'user', content: '', tool_result: { tool_use_id: 'tool_3', content: 'PASS src/components/__tests__/UserCard.test.tsx\n\n UserCard\n ✓ renders user name and email' } },
790
- { role: 'assistant', content: 'All tests pass! The UserCard component is ready to use.\n\n```tsx\nimport { UserCard } from \'./components/UserCard\';\n\n<UserCard name="John" email="john@example.com" />\n```' }
791
- ]
792
- },
793
- {
794
- name: 'Debug and fix a failing test',
795
- steps: [
796
- { role: 'user', content: 'Run the tests - I think something is broken' },
797
- { role: 'assistant', content: 'Running tests...', tool_use: { name: 'BashTool', input: { command: 'npm test 2>&1' } } },
798
- { role: 'user', content: '', tool_result: { tool_use_id: 'tool_1', content: 'FAIL src/utils/__tests__/format.test.ts\n\n Format Date\n ✕ should format date as YYYY-MM-DD (5ms)\n\n expect(received).toBe(expected)\n\n Expected: "2024-01-15"\n Received: "15-01-2024"' } },
799
- { role: 'assistant', content: 'I see the test is failing. Let me look at the formatDate function to understand the issue.', tool_use: { name: 'GrepTool', input: { pattern: 'formatDate', path: 'src/utils' } } },
800
- { role: 'user', content: '', tool_result: { tool_use_id: 'tool_2', content: 'src/utils/format.ts:export function formatDate(date: Date): string {\n const d = new Date(date);\n return `${d.getDate()}-${d.getMonth() + 1}-${d.getFullYear()}`;\n}' } },
801
- { role: 'assistant', content: 'Found it! The function returns DD-MM-YYYY but the test expects YYYY-MM-DD. Let me fix it.', tool_use: { name: 'FileEditTool', input: { file_path: 'src/utils/format.ts', old_string: 'return `${d.getDate()}-${d.getMonth() + 1}-${d.getFullYear()}`;', new_string: 'return `${d.getFullYear()}-${String(d.getMonth() + 1).padStart(2, \'0\')}-${String(d.getDate()).padStart(2, \'0\')}`;' } } },
802
- { role: 'user', content: '', tool_result: { tool_use_id: 'tool_3', content: 'File updated' } },
803
- { role: 'assistant', content: 'Fixed the format. Let me run the tests again to confirm.', tool_use: { name: 'BashTool', input: { command: 'npm test -- --testPathPattern=format.test' } } },
804
- { role: 'user', content: '', tool_result: { tool_use_id: 'tool_4', content: 'PASS src/utils/__tests__/format.test.ts\n\n Format Date\n ✓ should format date as YYYY-MM-DD' } },
805
- { role: 'assistant', content: 'All tests pass now! The date is correctly formatted as YYYY-MM-DD.' }
806
- ]
807
- }
808
- ];
809
-
810
- for (const workflow of multiStepWorkflows) {
811
- examples.push({
812
- messages: [
813
- { role: 'system', content: systemPrompts[2] },
814
- ...workflow.steps
815
- ]
816
- });
817
- }
818
-
819
- // =========================================================================
820
- // EXAMPLES: Edge cases
821
- // =========================================================================
822
-
823
- const edgeCases = [
824
- {
825
- user: 'Handle an empty array in my reduce function',
826
- context: 'My reduce throws when the array is empty',
827
- response: 'Here are ways to handle empty arrays with reduce:\n\n```js\n// 1. Provide initial value\nconst sum = [].reduce((acc, val) => acc + val, 0); // Returns 0\n\n// 2. Check first\nif (arr.length === 0) return defaultValue;\n\n// 3. Use reduce with optional initial value\nfunction safeReduce(arr, reducer, initial) {\n if (arr.length === 0 && initial === undefined) {\n throw new Error(\'Cannot reduce empty array without initial value\');\n }\n return arr.reduce(reducer, initial);\n}\n```'
828
- },
829
- {
830
- user: 'Deal with optional function parameters',
831
- context: 'How do I make some parameters optional?',
832
- response: 'You can use optional parameters in several ways:\n\n```ts\n// TypeScript optional parameter\nfunction greet(name: string, greeting?: string) {\n return `${greeting || \'Hello\'}, ${name}!`;\n}\n\n// Default parameter\nfunction createUser(name: string, role = \'user\') {\n return { name, role };\n}\n\n// Nullish coalescing for defaults\nfunction process(value: string | null) {\n const input = value ?? \'default\';\n}\n```'
833
- },
834
- {
835
- user: 'Handle API rate limiting',
836
- context: 'Getting 429 errors from the API',
837
- response: 'Here\'s a pattern for handling rate limits:\n\n```js\nasync function fetchWithRetry(url, retries = 3) {\n for (let i = 0; i < retries; i++) {\n const response = await fetch(url);\n \n if (response.status === 429) {\n const retryAfter = response.headers.get(\'Retry-After\') || 60;\n console.log(`Rate limited. Waiting ${retryAfter}s...`);\n await new Promise(r => setTimeout(r, retryAfter * 1000));\n continue;\n }\n \n return response.json();\n }\n throw new Error(\'Max retries exceeded\');\n}\n```\n\nUse exponential backoff for more aggressive retrying.'
838
- }
839
- ];
840
-
841
- for (const edge of edgeCases) {
842
- examples.push({
843
- messages: [
844
- { role: 'system', content: systemPrompts[1] },
845
- { role: 'user', content: edge.user + '\n\nContext: ' + edge.context },
846
- { role: 'assistant', content: edge.response }
847
- ]
848
- });
849
- }
850
-
851
- // =========================================================================
852
- // GENERATE 50+ EXAMPLES PER TOOL (tool-specific variations)
853
- // =========================================================================
854
-
855
- const toolNames = tools.map(t => t.tool);
856
- const variationsPerTool = {
857
- FileReadTool: [
858
- 'Read the first 100 lines of a large log file',
859
- 'Show me the contents of .env.example',
860
- 'What\'s in the tsconfig.json?',
861
- 'Read the package-lock.json to check versions',
862
- 'Show me the gitignore file'
863
- ],
864
- FileWriteTool: [
865
- 'Create a .gitignore file with common ignores',
866
- 'Write a new entry to the changelog',
867
- 'Create a simple JSON config file',
868
- 'Write the test results to output.txt'
869
- ],
870
- GlobTool: [
871
- 'Find all .test.ts files',
872
- 'List all files in src/ directory',
873
- 'Find all files with "helper" in the name',
874
- 'Search for *.config.js files',
875
- 'Find all files in any __tests__ directory'
876
- ],
877
- GrepTool: [
878
- 'Find all console.log statements',
879
- 'Search for "export default"',
880
- 'Find all imports from "react"',
881
- 'Search for password or secret patterns',
882
- 'Find all unused imports'
883
- ],
884
- BashTool: [
885
- 'Initialize a new git repository',
886
- 'Show the last 10 commits',
887
- 'List all npm scripts available',
888
- 'Check the current directory',
889
- 'Show the difference between branches'
890
- ]
891
- };
892
-
893
- // Generate 50+ examples by varying prompts for each tool
894
- let exampleCount = examples.length;
895
-
896
- for (const tool of tools) {
897
- const variations = variationsPerTool[tool.tool] || [];
898
-
899
- for (let i = 0; i < 5; i++) {
900
- const variation = variations[i % variations.length];
901
- const idx = i % variations.length;
902
-
903
- examples.push({
904
- messages: [
905
- { role: 'system', content: systemPrompts[i % systemPrompts.length] },
906
- { role: 'user', content: `${variation || 'process'} (variant ${i + 1})` },
907
- { role: 'assistant', content: `I'll help you with that using ${tool.tool}. This is a variant example showing different ways to phrase the same intent.`, tool_use: { name: tool.tool, input: generateMockInput(tool.tool, i) } },
908
- { role: 'user', content: '', tool_result: { tool_use_id: `tool_${i}`, content: getMockResult(tool.tool, i) } },
909
- { role: 'assistant', content: `Done! Here's the result for variant ${i + 1} of ${(variation || 'task').toLowerCase()}.` }
910
- ]
911
- });
912
- }
913
- }
914
-
915
- // Write examples to JSONL
916
- const outputPath = path.join(SYNTHETIC_DIR, 'examples.jsonl');
917
- const stream = fs.createWriteStream(outputPath);
918
- for (const ex of examples) {
919
- stream.write(JSON.stringify(ex) + '\n');
920
- }
921
- stream.end();
922
-
923
- console.log(`✅ Generated ${examples.length} synthetic examples`);
924
- return examples;
925
- }
926
-
927
- function generateMockInput(toolName, variant) {
928
- const inputs = {
929
- FileReadTool: [{ file_path: `example-${variant}.txt` }, { file_path: 'src/index.ts' }, { file_path: 'config.json' }],
930
- GlobTool: [{ pattern: `**/*.${variant === 0 ? 'ts' : 'js'}` }, { pattern: 'src/**/*.tsx' }],
931
- GrepTool: [{ pattern: 'TODO', path: 'src' }],
932
- BashTool: [{ command: 'ls -la' }, { command: 'git status' }],
933
- FileWriteTool: [{ file_path: 'output.txt', content: 'test' }]
934
- };
935
- return inputs[toolName]?.[variant % (inputs[toolName]?.length || 1)] || { query: `variant-${variant}` };
936
- }
937
-
938
- function getMockResult(toolName, variant) {
939
- const results = {
940
- FileReadTool: 'File contents here...',
941
- GlobTool: `file1.${variant === 0 ? 'ts' : 'js'}\nfile2.${variant === 0 ? 'ts' : 'js'}`,
942
- GrepTool: 'Found 3 matches',
943
- BashTool: 'Command output here',
944
- FileWriteTool: ''
945
- };
946
- return results[toolName] || 'Done';
947
- }
948
-
949
- // ============================================================================
950
- // 6. CREATE TRAINING MANIFEST
951
- // ============================================================================
952
-
953
- function createManifest(tools, stats) {
954
- const manifest = {
955
- dataset: {
956
- name: 'Stack 2.9 Training Data',
957
- version: '0.2.0',
958
- description: 'Training data for Stack 2.9, an open-source coding assistant based on Qwen2.5-Coder',
959
- source: 'OpenClaw architecture + synthetic examples + code analysis',
960
- license: 'Apache 2.0'
961
- },
962
- stats: {
963
- toolSchemas: tools.length,
964
- syntheticExamples: stats.syntheticExamples,
965
- codeCommentPairs: stats.codeCommentPairs,
966
- testExamples: stats.testExamples,
967
- conversations: stats.conversations,
968
- totalExamples: stats.syntheticExamples
969
- },
970
- model_config: {
971
- base_model: 'Qwen2.5-Coder-32B',
972
- fine_tuning_method: 'LoRA',
973
- lora_rank: 64,
974
- lora_alpha: 128,
975
- target_modules: [
976
- 'q_proj', 'k_proj', 'v_proj', 'o_proj',
977
- 'gate_proj', 'up_proj', 'down_proj'
978
- ],
979
- quantization: 'AWQ 4-bit (inference)',
980
- max_seq_length: 131072,
981
- template: 'chatml'
982
- },
983
- tokenizer: {
984
- family: 'Qwen2',
985
- pad_token: '<|endoftext|>',
986
- bos_token: '<|endoftext|>',
987
- eos_token: '<|endoftext|>'
988
- },
989
- training_data: {
990
- synthetic_examples: `${SYNTHETIC_DIR}/examples.jsonl`,
991
- tools_catalog: `${TOOLS_SCHEMA_DIR}/catalog.json`,
992
- code_pairs: `${CODE_PAIRS_DIR}/pairs.json`,
993
- test_examples: `${CODE_PAIRS_DIR}/test-examples.json`,
994
- conversations: `${CONVERSATIONS_DIR}/parsed.json`,
995
- estimated_tokens: '~50M tokens total',
996
- recommended_dataset_size: '100K - 1M examples'
997
- },
998
- deployment: {
999
- inference_engine: 'vLLM',
1000
- api_compatibility: 'OpenAI-compatible (chat/completions)',
1001
- expected_throughput: '~50 tokens/s on A100 80GB',
1002
- platforms: ['Hugging Face', 'OpenRouter', 'self-hosted']
1003
- }
1004
- };
1005
-
1006
- fs.writeFileSync(
1007
- path.join(OUTPUT_DIR, 'manifest.json'),
1008
- JSON.stringify(manifest, null, 2)
1009
- );
1010
-
1011
- console.log('✅ Created training manifest');
1012
- return manifest;
1013
- }
1014
-
1015
- // ============================================================================
1016
- // 7. CREATE TRAINING CONFIG
1017
- // ============================================================================
1018
-
1019
- function createTrainingConfig() {
1020
- const config = {
1021
- model_name: 'Qwen/Qwen2.5-Coder-32B',
1022
- dataset_path: './training-data/synthetic/examples.jsonl',
1023
- max_seq_length: 131072,
1024
- load_in_4bit: true,
1025
- bf16: true,
1026
- batch_size: 1,
1027
- gradient_accumulation_steps: 16,
1028
- learning_rate: 1e-4,
1029
- num_train_epochs: 3,
1030
- warmup_steps: 100,
1031
- save_steps: 1000,
1032
- eval_steps: 500,
1033
- logging_steps: 10,
1034
- output_dir: './stack-2.9-lora',
1035
- push_to_hub: false,
1036
- hub_model_id: 'your-username/stack-2.9',
1037
- lora_config: {
1038
- r: 64,
1039
- lora_alpha: 128,
1040
- target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
1041
- lora_dropout: 0.05,
1042
- bias: 'none'
1043
- }
1044
- };
1045
-
1046
- fs.writeFileSync(
1047
- path.join(OUTPUT_DIR, 'training-config.json'),
1048
- JSON.stringify(config, null, 2)
1049
- );
1050
-
1051
- console.log('✅ Created training config template');
1052
- return config;
1053
- }
1054
-
1055
- // ============================================================================
1056
- // MAIN
1057
- // ============================================================================
1058
-
1059
- console.log('🔧 Stack 2.9 - Enhanced Training Data Extractor\n');
1060
- console.log(`📂 Source: ${SRC_DIR}`);
1061
- console.log(`📁 Output: ${OUTPUT_DIR}\n`);
1062
-
1063
- // Run extraction pipeline
1064
- const tools = extractToolSchemas();
1065
- const codePairs = extractCodeCommentPairs();
1066
- const testExamples = extractTestExamples();
1067
- const conversations = parseConversations();
1068
- const syntheticExamples = generateSyntheticExamples(tools);
1069
- createManifest(tools, {
1070
- syntheticExamples: syntheticExamples.length,
1071
- codeCommentPairs: codePairs.length,
1072
- testExamples: testExamples.length,
1073
- conversations: conversations.length
1074
- });
1075
- createTrainingConfig();
1076
-
1077
- console.log('\n✨ Extraction complete!');
1078
- console.log('\n📋 Summary:');
1079
- console.log(` - Tool schemas: ${tools.length} tools`);
1080
- console.log(` - Synthetic examples: ${syntheticExamples.length}`);
1081
- console.log(` - Code-comment pairs: ${codePairs.length}`);
1082
- console.log(` - Test examples: ${testExamples.length}`);
1083
- console.log(` - Conversations: ${conversations.length}`);
1084
- console.log('\n📁 Output files:');
1085
- console.log(` - ${TOOLS_SCHEMA_DIR}/catalog.json`);
1086
- console.log(` - ${SYNTHETIC_DIR}/examples.jsonl`);
1087
- console.log(` - ${CODE_PAIRS_DIR}/pairs.json`);
1088
- console.log(` - ${CODE_PAIRS_DIR}/test-examples.json`);
1089
- console.log(` - ${CONVERSATIONS_DIR}/parsed.json`);
1090
- console.log(` - ${OUTPUT_DIR}/manifest.json`);
1091
- console.log(` - ${OUTPUT_DIR}/training-config.json`);
1092
- console.log('\n🚀 Next steps:');
1093
- console.log(' 1. Review extracted code-comment pairs for quality');
1094
- console.log(' 2. Add real conversation logs from ~/.claude/sessions');
1095
- console.log(' 3. Scale: aim for 50+ examples per tool');
1096
- console.log(' 4. Convert to Parquet for faster loading');
1097
- console.log(' 5. Launch LoRA fine-tuning on Qwen2.5-Coder-32B');
1098
- console.log(' 6. Deploy with vLLM and submit to OpenRouter');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/{fuse_lora_adapters.py → training/fuse_lora_adapters.py} RENAMED
File without changes
scripts/{merge_lora_adapters.py → training/merge_lora_adapters.py} RENAMED
File without changes
scripts/update_context_window.py DELETED
@@ -1,190 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Update all configuration files to use 128K context window.
4
- Updates: manifest, training config, prepare_dataset, vLLM server, deploy scripts, docs.
5
- """
6
-
7
- import json
8
- import re
9
- from pathlib import Path
10
- import argparse
11
-
12
- def update_json_file(path: Path, updates: dict):
13
- """Update JSON file with key->value updates."""
14
- if not path.exists():
15
- print(f" ⚠️ Not found: {path}")
16
- return False
17
-
18
- with open(path, 'r') as f:
19
- data = json.load(f)
20
-
21
- changed = False
22
- for key, value in updates.items():
23
- if key in data and data[key] != value:
24
- data[key] = value
25
- changed = True
26
-
27
- if changed:
28
- with open(path, 'w') as f:
29
- json.dump(data, f, indent=2)
30
- print(f" ✅ Updated {path.name}")
31
- else:
32
- print(f" ℹ️ {path.name} already up-to-date")
33
- return changed
34
-
35
- def update_python_file(path: Path, old_pattern: str, new_value: str):
36
- """Replace a constant in a Python file."""
37
- if not path.exists():
38
- print(f" ⚠️ Not found: {path}")
39
- return False
40
-
41
- content = path.read_text()
42
- if old_pattern in content:
43
- new_content = content.replace(old_pattern, new_value)
44
- path.write_text(new_content)
45
- print(f" ✅ Updated {path.name}")
46
- return True
47
- else:
48
- print(f" ℹ️ {path.name} - pattern not found, may be already updated")
49
- return False
50
-
51
- def update_shell_script(path: Path, old_var: str, new_value: str):
52
- """Update shell script variable."""
53
- if not path.exists():
54
- print(f" ⚠️ Not found: {path}")
55
- return False
56
-
57
- content = path.read_text()
58
- if old_var in content:
59
- new_content = re.sub(
60
- rf'{old_var}=.+',
61
- f'{old_var}={new_value}',
62
- content
63
- )
64
- path.write_text(new_content)
65
- print(f" ✅ Updated {path.name}")
66
- return True
67
- else:
68
- print(f" ℹ️ {path.name} - variable not found")
69
- return False
70
-
71
- def update_markdown_file(path: Path, old_text: str, new_text: str):
72
- """Update markdown documentation."""
73
- if not path.exists():
74
- print(f" ⚠️ Not found: {path}")
75
- return False
76
-
77
- content = path.read_text()
78
- if old_text in content:
79
- new_content = content.replace(old_text, new_text)
80
- path.write_text(new_content)
81
- print(f" ✅ Updated {path.name}")
82
- return True
83
- else:
84
- print(f" ℹ️ {path.name} - pattern not found")
85
- return False
86
-
87
- def main():
88
- parser = argparse.ArgumentParser()
89
- parser.add_argument("--workspace", type=str, default=".")
90
- args = parser.parse_args()
91
-
92
- root = Path(args.workspace)
93
-
94
- print("🚀 Updating context window to 128K (131072 tokens)")
95
-
96
- # 1. Training manifest
97
- manifest_path = root / "training-data/manifest.json"
98
- update_json_file(manifest_path, {
99
- "max_seq_length": 131072,
100
- "context_length": 131072
101
- })
102
-
103
- # 2. Training config
104
- training_config_path = root / "training-data/training-config.json"
105
- update_json_file(training_config_path, {
106
- "max_seq_length": 131072
107
- })
108
-
109
- # 3. Python scripts
110
- prepare_script = root / "stack-2.9-training/prepare_dataset.py"
111
- if prepare_script.exists():
112
- content = prepare_script.read_text()
113
- if "max_length=32768" in content:
114
- new_content = content.replace("max_length=32768", "max_length=131072")
115
- prepare_script.write_text(new_content)
116
- print(f" ✅ Updated prepare_dataset.py (max_length)")
117
- else:
118
- print(f" ℹ️ prepare_dataset.py - already 128K or pattern not found")
119
-
120
- # 4. vLLM server
121
- vllm_script = root / "stack-2.9-deploy/vllm_server.py"
122
- if vllm_script.exists():
123
- content = vllm_script.read_text()
124
- if "max_model_len" in content:
125
- # Update max_model_len parameter
126
- new_content = re.sub(
127
- r'--max-model-len\s+\d+',
128
- '--max-model-len 131072',
129
- content
130
- )
131
- vllm_script.write_text(new_content)
132
- print(f" ✅ Updated vllm_server.py (--max-model-len)")
133
- else:
134
- print(f" ℹ️ vllm_server.py - max_model_len not found directly, check manually")
135
-
136
- # 5. Local deploy script
137
- deploy_script = root / "stack-2.9-deploy/local_deploy.sh"
138
- if deploy_script.exists():
139
- content = deploy_script.read_text()
140
- # Update any context-related env var
141
- new_content = content.replace("MAX_MODEL_LEN=32768", "MAX_MODEL_LEN=131072") \
142
- .replace("max_model_len=32768", "max_model_len=131072")
143
- if new_content != content:
144
- deploy_script.write_text(new_content)
145
- print(f" ✅ Updated local_deploy.sh")
146
- else:
147
- print(f" ℹ️ local_deploy.sh - no changes needed")
148
-
149
- # 6. README.md performance table
150
- readme_path = root / "README.md"
151
- if readme_path.exists():
152
- content = readme_path.read_text()
153
- # Update context length from 32K to 128K
154
- new_content = content.replace("32,768 tokens", "131,072 tokens (128K)") \
155
- .replace("32K tokens", "128K tokens")
156
- if new_content != content:
157
- readme_path.write_text(new_content)
158
- print(f" ✅ Updated README.md (context length)")
159
- else:
160
- print(f" ℹ️ README.md - context length already correct")
161
-
162
- # 7. Create configuration note
163
- config_note = """# Context Window Configuration
164
-
165
- Stack 2.9 uses full 128K context window (131072 tokens) to provide complete repository awareness.
166
-
167
- ## Settings
168
- - max_model_len: 131072
169
- - max_seq_length: 131072
170
- - block_size: 16 or 32 (adjust for memory/performance tradeoff)
171
-
172
- ## Memory Requirements
173
- | Context | A100 80GB (4-bit) | H100 80GB (4-bit) |
174
- |---------|-------------------|-------------------|
175
- | 32K | ~20GB | ~18GB |
176
- | 64K | ~35GB | ~32GB |
177
- | 128K | ~60GB | ~55GB |
178
-
179
- Throughput decreases slightly at longer contexts (~30% slower at 128K vs 32K) but provides full repository context.
180
-
181
- """
182
- note_path = root / "stack-2.9-docs/CONTEXT_CONFIG.md"
183
- note_path.write_text(config_note)
184
- print(f" ✅ Created CONTEXT_CONFIG.md")
185
-
186
- print("\n✅ Context window update complete!")
187
- print(" All configs now set to 128K (131072 tokens)")
188
-
189
- if __name__ == "__main__":
190
- main()