sravanthib committed on
Commit
887ba99
·
verified ·
1 Parent(s): 737475b

Training completed

Browse files
Files changed (4) hide show
  1. README.md +2 -2
  2. all_results.json +7 -7
  3. train_results.json +7 -7
  4. trainer_state.json +23 -23
README.md CHANGED
@@ -8,14 +8,14 @@ tags:
8
  - transformers
9
  pipeline_tag: text-generation
10
  model-index:
11
- - name: new-300-steps-DeepSeek-R1-Distill-Qwen-7B
12
  results: []
13
  ---
14
 
15
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
16
  should probably proofread and complete it, then remove this comment. -->
17
 
18
- # new-300-steps-DeepSeek-R1-Distill-Qwen-7B
19
 
20
  This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on an unknown dataset.
21
 
 
8
  - transformers
9
  pipeline_tag: text-generation
10
  model-index:
11
+ - name: DeepSeek-R1-Distill-Qwen-7B-squad-nemo-replicaa
12
  results: []
13
  ---
14
 
15
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
16
  should probably proofread and complete it, then remove this comment. -->
17
 
18
+ # DeepSeek-R1-Distill-Qwen-7B-squad-nemo-replicaa
19
 
20
  This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on an unknown dataset.
21
 
all_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
- "avg_step_time": 6.437501907348633,
3
  "epoch": 0.00228310502283105,
4
  "total_flos": 2405455154380800.0,
5
- "total_training_time": 75.84518218040466,
6
- "total_training_time_mins": 1.264086369673411,
7
- "train_loss": 1.237103076837957,
8
- "train_runtime": 68.9179,
9
- "train_samples_per_second": 2.902,
10
- "train_steps_per_second": 0.145
11
  }
 
1
  {
2
+ "avg_step_time": 6.510569095611572,
3
  "epoch": 0.00228310502283105,
4
  "total_flos": 2405455154380800.0,
5
+ "total_training_time": 76.59795832633972,
6
+ "total_training_time_mins": 1.2766326387723288,
7
+ "train_loss": 1.1941861145198345,
8
+ "train_runtime": 71.1991,
9
+ "train_samples_per_second": 2.809,
10
+ "train_steps_per_second": 0.14
11
  }
train_results.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
- "avg_step_time": 6.437501907348633,
3
  "epoch": 0.00228310502283105,
4
  "total_flos": 2405455154380800.0,
5
- "total_training_time": 75.84518218040466,
6
- "total_training_time_mins": 1.264086369673411,
7
- "train_loss": 1.237103076837957,
8
- "train_runtime": 68.9179,
9
- "train_samples_per_second": 2.902,
10
- "train_steps_per_second": 0.145
11
  }
 
1
  {
2
+ "avg_step_time": 6.510569095611572,
3
  "epoch": 0.00228310502283105,
4
  "total_flos": 2405455154380800.0,
5
+ "total_training_time": 76.59795832633972,
6
+ "total_training_time_mins": 1.2766326387723288,
7
+ "train_loss": 1.1941861145198345,
8
+ "train_runtime": 71.1991,
9
+ "train_samples_per_second": 2.809,
10
+ "train_steps_per_second": 0.14
11
  }
trainer_state.json CHANGED
@@ -11,82 +11,82 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.00022831050228310502,
14
- "grad_norm": 58.639156341552734,
15
  "learning_rate": 0.0,
16
  "loss": 10.6752,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.00045662100456621003,
21
- "grad_norm": 19.770322799682617,
22
  "learning_rate": 0.001,
23
- "loss": 0.7099,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.0006849315068493151,
28
- "grad_norm": 20.064632415771484,
29
  "learning_rate": 0.001,
30
- "loss": 0.7537,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.0009132420091324201,
35
- "grad_norm": 0.2150338739156723,
36
  "learning_rate": 0.001,
37
- "loss": 0.034,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.001141552511415525,
42
- "grad_norm": 0.10391011089086533,
43
  "learning_rate": 0.001,
44
- "loss": 0.0311,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.0013698630136986301,
49
- "grad_norm": 0.13938532769680023,
50
  "learning_rate": 0.001,
51
- "loss": 0.0313,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.0015981735159817352,
56
- "grad_norm": 0.14156730473041534,
57
  "learning_rate": 0.001,
58
- "loss": 0.0308,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.0018264840182648401,
63
- "grad_norm": 0.16459645330905914,
64
  "learning_rate": 0.001,
65
- "loss": 0.0328,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.002054794520547945,
70
- "grad_norm": 0.23767752945423126,
71
  "learning_rate": 0.001,
72
- "loss": 0.0369,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.00228310502283105,
77
- "grad_norm": 0.22308476269245148,
78
  "learning_rate": 0.001,
79
- "loss": 0.0352,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.00228310502283105,
84
  "step": 10,
85
  "total_flos": 2405455154380800.0,
86
- "train_loss": 1.237103076837957,
87
- "train_runtime": 68.9179,
88
- "train_samples_per_second": 2.902,
89
- "train_steps_per_second": 0.145
90
  }
91
  ],
92
  "logging_steps": 1,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.00022831050228310502,
14
+ "grad_norm": 61.761863708496094,
15
  "learning_rate": 0.0,
16
  "loss": 10.6752,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.00045662100456621003,
21
+ "grad_norm": 14.540397644042969,
22
  "learning_rate": 0.001,
23
+ "loss": 0.4893,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.0006849315068493151,
28
+ "grad_norm": 15.115448951721191,
29
  "learning_rate": 0.001,
30
+ "loss": 0.533,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.0009132420091324201,
35
+ "grad_norm": 0.1035713478922844,
36
  "learning_rate": 0.001,
37
+ "loss": 0.0327,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.001141552511415525,
42
+ "grad_norm": 0.1258726716041565,
43
  "learning_rate": 0.001,
44
+ "loss": 0.0315,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.0013698630136986301,
49
+ "grad_norm": 0.13812150061130524,
50
  "learning_rate": 0.001,
51
+ "loss": 0.0314,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.0015981735159817352,
56
+ "grad_norm": 0.14780189096927643,
57
  "learning_rate": 0.001,
58
+ "loss": 0.0321,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.0018264840182648401,
63
+ "grad_norm": 0.18661947548389435,
64
  "learning_rate": 0.001,
65
+ "loss": 0.0352,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.002054794520547945,
70
+ "grad_norm": 0.2705967426300049,
71
  "learning_rate": 0.001,
72
+ "loss": 0.0417,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.00228310502283105,
77
+ "grad_norm": 0.24941541254520416,
78
  "learning_rate": 0.001,
79
+ "loss": 0.0397,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.00228310502283105,
84
  "step": 10,
85
  "total_flos": 2405455154380800.0,
86
+ "train_loss": 1.1941861145198345,
87
+ "train_runtime": 71.1991,
88
+ "train_samples_per_second": 2.809,
89
+ "train_steps_per_second": 0.14
90
  }
91
  ],
92
  "logging_steps": 1,