kishkath commited on
Commit
b33e29a
·
verified ·
1 Parent(s): df2b5b8

Upload 15 files

Browse files
phi2-qlora-finetuned/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
 
26
  "k_proj",
27
- "q_proj",
28
  "up_proj",
29
  "down_proj",
30
- "gate_proj",
31
- "v_proj",
32
- "o_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "v_proj",
27
+ "o_proj",
28
+ "gate_proj",
29
  "k_proj",
 
30
  "up_proj",
31
  "down_proj",
32
+ "q_proj"
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
phi2-qlora-finetuned/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:665ed085a1c6782b6f007a8a0d370c75d0888e3672e2dcd344c7f36751aafe53
3
  size 31483040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a86d55f6db299df6e4558dd6225c7091c936db0e172cf87d1fc313c14c618780
3
  size 31483040
phi2-qlora-finetuned/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:417c1cd07d456d14577d780ae5bcb4ce553d325313e89cc6ac7f81d51d7891dc
3
- size 63028090
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a54baedea51cab905e8a740367ae26f02ba6cda16dd03c7f98aa29114a2d159
3
+ size 63028282
phi2-qlora-finetuned/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e1e762d3d91354a1077502c44abb720ff6aaa5d1c35eed7a138a66648653703
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:706a257e793b9bbfce4b1e74573558c614c67421db28f53b2592e3cfc8dbfee7
3
  size 14244
phi2-qlora-finetuned/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:578927bb1c801cdd14af41be9b1907db16bd1c7b35d8fc1fc2779e79adb5109e
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18b984273ea2d45b7ffb1d047bb359d93111e41fcad70d16a1b453fd38f72636
3
  size 988
phi2-qlora-finetuned/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34bebae8dbb4044169aedad702183ea2a5f7688635af093bea630f24dc71f1ed
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f4fe8e328be6e10e053e14666b2e571c45c73d9a8291556b08910e3da67b3e6
3
  size 1064
phi2-qlora-finetuned/tokenizer.json CHANGED
@@ -1,7 +1,21 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 50256,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 512,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": {
11
+ "Fixed": 512
12
+ },
13
+ "direction": "Right",
14
+ "pad_to_multiple_of": null,
15
+ "pad_id": 50256,
16
+ "pad_type_id": 0,
17
+ "pad_token": "<|endoftext|>"
18
+ },
19
  "added_tokens": [
20
  {
21
  "id": 50256,
phi2-qlora-finetuned/tokenizer_config.json CHANGED
@@ -318,6 +318,7 @@
318
  "clean_up_tokenization_spaces": true,
319
  "eos_token": "<|endoftext|>",
320
  "extra_special_tokens": {},
 
321
  "model_max_length": 2048,
322
  "pad_token": "<|endoftext|>",
323
  "return_token_type_ids": false,
 
318
  "clean_up_tokenization_spaces": true,
319
  "eos_token": "<|endoftext|>",
320
  "extra_special_tokens": {},
321
+ "max_length": 512,
322
  "model_max_length": 2048,
323
  "pad_token": "<|endoftext|>",
324
  "return_token_type_ids": false,
phi2-qlora-finetuned/trainer_state.json CHANGED
@@ -1,114 +1,418 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.24439918533604887,
5
  "eval_steps": 500,
6
- "global_step": 120,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.020366598778004074,
13
- "grad_norm": 0.1955823004245758,
14
- "learning_rate": 0.00019868265225415265,
15
- "loss": 1.6792,
16
- "mean_token_accuracy": 0.6286436378955841,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.04073319755600815,
21
- "grad_norm": 0.43605607748031616,
22
- "learning_rate": 0.00019075754196709572,
23
- "loss": 1.5504,
24
- "mean_token_accuracy": 0.6437817469239235,
25
  "step": 20
26
  },
27
  {
28
  "epoch": 0.06109979633401222,
29
- "grad_norm": 0.5884078145027161,
30
- "learning_rate": 0.00017621620551276366,
31
- "loss": 1.317,
32
- "mean_token_accuracy": 0.6953216314315795,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.0814663951120163,
37
- "grad_norm": 0.36727696657180786,
38
- "learning_rate": 0.00015611870653623825,
39
- "loss": 1.054,
40
- "mean_token_accuracy": 0.7708626106381417,
41
  "step": 40
42
  },
43
  {
44
  "epoch": 0.10183299389002037,
45
- "grad_norm": 0.2848515510559082,
46
- "learning_rate": 0.000131930153013598,
47
- "loss": 0.6444,
48
- "mean_token_accuracy": 0.867109614610672,
49
  "step": 50
50
  },
51
  {
52
  "epoch": 0.12219959266802444,
53
- "grad_norm": 0.2064630389213562,
54
- "learning_rate": 0.00010541389085854176,
55
- "loss": 0.9981,
56
- "mean_token_accuracy": 0.756032009422779,
57
  "step": 60
58
  },
59
  {
60
  "epoch": 0.1425661914460285,
61
- "grad_norm": 0.18142254650592804,
62
- "learning_rate": 7.85029559788976e-05,
63
- "loss": 0.9005,
64
- "mean_token_accuracy": 0.775178787112236,
65
  "step": 70
66
  },
67
  {
68
  "epoch": 0.1629327902240326,
69
- "grad_norm": 0.21299591660499573,
70
- "learning_rate": 5.3159155930021e-05,
71
- "loss": 0.8883,
72
- "mean_token_accuracy": 0.7880317449569703,
73
  "step": 80
74
  },
75
  {
76
  "epoch": 0.18329938900203666,
77
- "grad_norm": 0.21353664994239807,
78
- "learning_rate": 3.123005411465766e-05,
79
- "loss": 0.8977,
80
- "mean_token_accuracy": 0.801851412653923,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 0.20366598778004075,
85
- "grad_norm": 0.2567966878414154,
86
- "learning_rate": 1.4314282383241096e-05,
87
- "loss": 0.5785,
88
- "mean_token_accuracy": 0.8794851988554001,
89
  "step": 100
90
  },
91
  {
92
  "epoch": 0.2240325865580448,
93
- "grad_norm": 0.23000894486904144,
94
- "learning_rate": 3.6450007480777093e-06,
95
- "loss": 1.0155,
96
- "mean_token_accuracy": 0.7514406576752662,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.24439918533604887,
101
- "grad_norm": 0.19669267535209656,
102
- "learning_rate": 0.0,
103
- "loss": 0.938,
104
- "mean_token_accuracy": 0.7690818622708321,
105
  "step": 120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  }
107
  ],
108
  "logging_steps": 10,
109
- "max_steps": 120,
110
  "num_input_tokens_seen": 0,
111
- "num_train_epochs": 1,
112
  "save_steps": 10,
113
  "stateful_callbacks": {
114
  "TrainerControl": {
@@ -122,7 +426,7 @@
122
  "attributes": {}
123
  }
124
  },
125
- "total_flos": 1.339699180744704e+16,
126
  "train_batch_size": 4,
127
  "trial_name": null,
128
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0183299389002036,
5
  "eval_steps": 500,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.020366598778004074,
13
+ "grad_norm": 0.12626299262046814,
14
+ "learning_rate": 0.00013333333333333334,
15
+ "loss": 1.7006,
16
+ "mean_token_accuracy": 0.6053715996444226,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.04073319755600815,
21
+ "grad_norm": 0.2351449579000473,
22
+ "learning_rate": 0.00019994755690455152,
23
+ "loss": 1.5214,
24
+ "mean_token_accuracy": 0.6368309155106544,
25
  "step": 20
26
  },
27
  {
28
  "epoch": 0.06109979633401222,
29
+ "grad_norm": 0.32846614718437195,
30
+ "learning_rate": 0.0001995283421166614,
31
+ "loss": 1.5662,
32
+ "mean_token_accuracy": 0.6361105382442475,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.0814663951120163,
37
+ "grad_norm": 0.24607931077480316,
38
+ "learning_rate": 0.00019869167087338907,
39
+ "loss": 1.4452,
40
+ "mean_token_accuracy": 0.6645107805728913,
41
  "step": 40
42
  },
43
  {
44
  "epoch": 0.10183299389002037,
45
+ "grad_norm": 0.21988731622695923,
46
+ "learning_rate": 0.00019744105246469263,
47
+ "loss": 1.3939,
48
+ "mean_token_accuracy": 0.6644507594406605,
49
  "step": 50
50
  },
51
  {
52
  "epoch": 0.12219959266802444,
53
+ "grad_norm": 0.18629378080368042,
54
+ "learning_rate": 0.00019578173241879872,
55
+ "loss": 1.3307,
56
+ "mean_token_accuracy": 0.671517875790596,
57
  "step": 60
58
  },
59
  {
60
  "epoch": 0.1425661914460285,
61
+ "grad_norm": 0.19982416927814484,
62
+ "learning_rate": 0.00019372067050063438,
63
+ "loss": 1.3089,
64
+ "mean_token_accuracy": 0.6763716876506806,
65
  "step": 70
66
  },
67
  {
68
  "epoch": 0.1629327902240326,
69
+ "grad_norm": 0.1744842529296875,
70
+ "learning_rate": 0.00019126651152015403,
71
+ "loss": 1.3482,
72
+ "mean_token_accuracy": 0.6714572340250016,
73
  "step": 80
74
  },
75
  {
76
  "epoch": 0.18329938900203666,
77
+ "grad_norm": 0.22721126675605774,
78
+ "learning_rate": 0.00018842954907300236,
79
+ "loss": 1.4167,
80
+ "mean_token_accuracy": 0.6588860973715782,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 0.20366598778004075,
85
+ "grad_norm": 0.17914065718650818,
86
+ "learning_rate": 0.00018522168236559695,
87
+ "loss": 1.3608,
88
+ "mean_token_accuracy": 0.6660649880766869,
89
  "step": 100
90
  },
91
  {
92
  "epoch": 0.2240325865580448,
93
+ "grad_norm": 0.1730283796787262,
94
+ "learning_rate": 0.0001816563663057211,
95
+ "loss": 1.4223,
96
+ "mean_token_accuracy": 0.6599490791559219,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.24439918533604887,
101
+ "grad_norm": 0.17649827897548676,
102
+ "learning_rate": 0.00017774855506796496,
103
+ "loss": 1.3737,
104
+ "mean_token_accuracy": 0.6688225455582142,
105
  "step": 120
106
+ },
107
+ {
108
+ "epoch": 0.26476578411405294,
109
+ "grad_norm": 0.17069195210933685,
110
+ "learning_rate": 0.00017351463937072004,
111
+ "loss": 1.3767,
112
+ "mean_token_accuracy": 0.6619200393557548,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 0.285132382892057,
117
+ "grad_norm": 0.1913885921239853,
118
+ "learning_rate": 0.00016897237772781044,
119
+ "loss": 1.4375,
120
+ "mean_token_accuracy": 0.6498532116413116,
121
+ "step": 140
122
+ },
123
+ {
124
+ "epoch": 0.3054989816700611,
125
+ "grad_norm": 0.17547893524169922,
126
+ "learning_rate": 0.000164140821963114,
127
+ "loss": 1.3268,
128
+ "mean_token_accuracy": 0.6731642320752144,
129
+ "step": 150
130
+ },
131
+ {
132
+ "epoch": 0.3258655804480652,
133
+ "grad_norm": 0.16026997566223145,
134
+ "learning_rate": 0.00015904023730059228,
135
+ "loss": 1.4003,
136
+ "mean_token_accuracy": 0.663200007379055,
137
+ "step": 160
138
+ },
139
+ {
140
+ "epoch": 0.34623217922606925,
141
+ "grad_norm": 0.15250708162784576,
142
+ "learning_rate": 0.0001536920173648984,
143
+ "loss": 1.2856,
144
+ "mean_token_accuracy": 0.6764356568455696,
145
+ "step": 170
146
+ },
147
+ {
148
+ "epoch": 0.3665987780040733,
149
+ "grad_norm": 0.13922956585884094,
150
+ "learning_rate": 0.00014811859444908052,
151
+ "loss": 1.3469,
152
+ "mean_token_accuracy": 0.6688723161816597,
153
+ "step": 180
154
+ },
155
+ {
156
+ "epoch": 0.3869653767820774,
157
+ "grad_norm": 0.16509701311588287,
158
+ "learning_rate": 0.00014234334542574906,
159
+ "loss": 1.3352,
160
+ "mean_token_accuracy": 0.6676128759980202,
161
+ "step": 190
162
+ },
163
+ {
164
+ "epoch": 0.4073319755600815,
165
+ "grad_norm": 0.14618448913097382,
166
+ "learning_rate": 0.00013639049369634876,
167
+ "loss": 1.3593,
168
+ "mean_token_accuracy": 0.6654890060424805,
169
+ "step": 200
170
+ },
171
+ {
172
+ "epoch": 0.42769857433808556,
173
+ "grad_norm": 0.16653190553188324,
174
+ "learning_rate": 0.00013028500758979506,
175
+ "loss": 1.328,
176
+ "mean_token_accuracy": 0.6699673473834992,
177
+ "step": 210
178
+ },
179
+ {
180
+ "epoch": 0.4480651731160896,
181
+ "grad_norm": 0.146384596824646,
182
+ "learning_rate": 0.00012405249563662537,
183
+ "loss": 1.2914,
184
+ "mean_token_accuracy": 0.6747352227568626,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 0.4684317718940937,
189
+ "grad_norm": 0.16215957701206207,
190
+ "learning_rate": 0.0001177190991579223,
191
+ "loss": 1.3342,
192
+ "mean_token_accuracy": 0.6690372809767723,
193
+ "step": 230
194
+ },
195
+ {
196
+ "epoch": 0.48879837067209775,
197
+ "grad_norm": 0.17549686133861542,
198
+ "learning_rate": 0.00011131138261952845,
199
+ "loss": 1.2852,
200
+ "mean_token_accuracy": 0.6783039927482605,
201
+ "step": 240
202
+ },
203
+ {
204
+ "epoch": 0.5091649694501018,
205
+ "grad_norm": 0.14437216520309448,
206
+ "learning_rate": 0.00010485622221144484,
207
+ "loss": 1.3264,
208
+ "mean_token_accuracy": 0.6815439119935036,
209
+ "step": 250
210
+ },
211
+ {
212
+ "epoch": 0.5295315682281059,
213
+ "grad_norm": 0.139692023396492,
214
+ "learning_rate": 9.838069311974986e-05,
215
+ "loss": 1.3195,
216
+ "mean_token_accuracy": 0.6713656410574913,
217
+ "step": 260
218
+ },
219
+ {
220
+ "epoch": 0.5498981670061099,
221
+ "grad_norm": 0.16701075434684753,
222
+ "learning_rate": 9.19119559638596e-05,
223
+ "loss": 1.3618,
224
+ "mean_token_accuracy": 0.6655304417014122,
225
+ "step": 270
226
+ },
227
+ {
228
+ "epoch": 0.570264765784114,
229
+ "grad_norm": 0.15681137144565582,
230
+ "learning_rate": 8.5477142875451e-05,
231
+ "loss": 1.3549,
232
+ "mean_token_accuracy": 0.6620298072695732,
233
+ "step": 280
234
+ },
235
+ {
236
+ "epoch": 0.5906313645621182,
237
+ "grad_norm": 0.1511278599500656,
238
+ "learning_rate": 7.91032436968725e-05,
239
+ "loss": 1.3159,
240
+ "mean_token_accuracy": 0.6778326541185379,
241
+ "step": 290
242
+ },
243
+ {
244
+ "epoch": 0.6109979633401222,
245
+ "grad_norm": 0.16071146726608276,
246
+ "learning_rate": 7.281699277636572e-05,
247
+ "loss": 1.3977,
248
+ "mean_token_accuracy": 0.6576820626854897,
249
+ "step": 300
250
+ },
251
+ {
252
+ "epoch": 0.6313645621181263,
253
+ "grad_norm": 0.1634049415588379,
254
+ "learning_rate": 6.664475683491796e-05,
255
+ "loss": 1.3118,
256
+ "mean_token_accuracy": 0.6826698362827301,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.6517311608961304,
261
+ "grad_norm": 0.15096673369407654,
262
+ "learning_rate": 6.061242437507131e-05,
263
+ "loss": 1.3492,
264
+ "mean_token_accuracy": 0.6674635127186775,
265
+ "step": 320
266
+ },
267
+ {
268
+ "epoch": 0.6720977596741344,
269
+ "grad_norm": 0.16971102356910706,
270
+ "learning_rate": 5.474529709554612e-05,
271
+ "loss": 1.4399,
272
+ "mean_token_accuracy": 0.6552789464592934,
273
+ "step": 330
274
+ },
275
+ {
276
+ "epoch": 0.6924643584521385,
277
+ "grad_norm": 0.14999577403068542,
278
+ "learning_rate": 4.9067983767123736e-05,
279
+ "loss": 1.4486,
280
+ "mean_token_accuracy": 0.6491607405245304,
281
+ "step": 340
282
+ },
283
+ {
284
+ "epoch": 0.7128309572301426,
285
+ "grad_norm": 0.16720150411128998,
286
+ "learning_rate": 4.360429701490934e-05,
287
+ "loss": 1.3931,
288
+ "mean_token_accuracy": 0.6607658788561821,
289
+ "step": 350
290
+ },
291
+ {
292
+ "epoch": 0.7331975560081466,
293
+ "grad_norm": 0.138802170753479,
294
+ "learning_rate": 3.8377153439907266e-05,
295
+ "loss": 1.2131,
296
+ "mean_token_accuracy": 0.6939047828316689,
297
+ "step": 360
298
+ },
299
+ {
300
+ "epoch": 0.7535641547861507,
301
+ "grad_norm": 0.18726502358913422,
302
+ "learning_rate": 3.340847749883191e-05,
303
+ "loss": 1.2142,
304
+ "mean_token_accuracy": 0.6881774321198464,
305
+ "step": 370
306
+ },
307
+ {
308
+ "epoch": 0.7739307535641547,
309
+ "grad_norm": 0.190961092710495,
310
+ "learning_rate": 2.8719109545317103e-05,
311
+ "loss": 1.3967,
312
+ "mean_token_accuracy": 0.6626075744628906,
313
+ "step": 380
314
+ },
315
+ {
316
+ "epoch": 0.7942973523421588,
317
+ "grad_norm": 0.14274843037128448,
318
+ "learning_rate": 2.432871841823047e-05,
319
+ "loss": 1.3339,
320
+ "mean_token_accuracy": 0.671070359647274,
321
+ "step": 390
322
+ },
323
+ {
324
+ "epoch": 0.814663951120163,
325
+ "grad_norm": 0.1557278335094452,
326
+ "learning_rate": 2.025571894372794e-05,
327
+ "loss": 1.296,
328
+ "mean_token_accuracy": 0.6788829267024994,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 0.835030549898167,
333
+ "grad_norm": 0.17218472063541412,
334
+ "learning_rate": 1.65171946970729e-05,
335
+ "loss": 1.2488,
336
+ "mean_token_accuracy": 0.6864925757050514,
337
+ "step": 410
338
+ },
339
+ {
340
+ "epoch": 0.8553971486761711,
341
+ "grad_norm": 0.14736278355121613,
342
+ "learning_rate": 1.3128826348184887e-05,
343
+ "loss": 1.3984,
344
+ "mean_token_accuracy": 0.6586415357887745,
345
+ "step": 420
346
+ },
347
+ {
348
+ "epoch": 0.8757637474541752,
349
+ "grad_norm": 0.16238394379615784,
350
+ "learning_rate": 1.010482589146048e-05,
351
+ "loss": 1.2756,
352
+ "mean_token_accuracy": 0.6823231220245362,
353
+ "step": 430
354
+ },
355
+ {
356
+ "epoch": 0.8961303462321792,
357
+ "grad_norm": 0.1799972802400589,
358
+ "learning_rate": 7.457877035729588e-06,
359
+ "loss": 1.3597,
360
+ "mean_token_accuracy": 0.6700396433472633,
361
+ "step": 440
362
+ },
363
+ {
364
+ "epoch": 0.9164969450101833,
365
+ "grad_norm": 0.18641585111618042,
366
+ "learning_rate": 5.199082004372957e-06,
367
+ "loss": 1.2676,
368
+ "mean_token_accuracy": 0.6873666003346444,
369
+ "step": 450
370
+ },
371
+ {
372
+ "epoch": 0.9368635437881874,
373
+ "grad_norm": 0.15163969993591309,
374
+ "learning_rate": 3.3379149687388867e-06,
375
+ "loss": 1.3899,
376
+ "mean_token_accuracy": 0.6645716562867164,
377
+ "step": 460
378
+ },
379
+ {
380
+ "epoch": 0.9572301425661914,
381
+ "grad_norm": 0.15963105857372284,
382
+ "learning_rate": 1.882182310176095e-06,
383
+ "loss": 1.342,
384
+ "mean_token_accuracy": 0.6667100310325622,
385
+ "step": 470
386
+ },
387
+ {
388
+ "epoch": 0.9775967413441955,
389
+ "grad_norm": 0.14211098849773407,
390
+ "learning_rate": 8.379898773574924e-07,
391
+ "loss": 1.3966,
392
+ "mean_token_accuracy": 0.6606692716479301,
393
+ "step": 480
394
+ },
395
+ {
396
+ "epoch": 0.9979633401221996,
397
+ "grad_norm": 0.16811959445476532,
398
+ "learning_rate": 2.0971737622883515e-07,
399
+ "loss": 1.4444,
400
+ "mean_token_accuracy": 0.655492453277111,
401
+ "step": 490
402
+ },
403
+ {
404
+ "epoch": 1.0183299389002036,
405
+ "grad_norm": 0.17285843193531036,
406
+ "learning_rate": 0.0,
407
+ "loss": 1.3099,
408
+ "mean_token_accuracy": 0.6747938022017479,
409
+ "step": 500
410
  }
411
  ],
412
  "logging_steps": 10,
413
+ "max_steps": 500,
414
  "num_input_tokens_seen": 0,
415
+ "num_train_epochs": 2,
416
  "save_steps": 10,
417
  "stateful_callbacks": {
418
  "TrainerControl": {
 
426
  "attributes": {}
427
  }
428
  },
429
+ "total_flos": 6.528555810816e+16,
430
  "train_batch_size": 4,
431
  "trial_name": null,
432
  "trial_params": null
phi2-qlora-finetuned/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:064fa4c2e54fa8a50b7b4d9697c1ac99d08654a2dc35cecd028c114e5f16ce98
3
  size 5560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2222e57016c4ac8210ba59089f39992c759a360a06d2f33c716bdd5ea3bed7f8
3
  size 5560