Upload 4 files
- app.py +24 -0
- fine_tuned_model (1).zip +3 -0
- requirements.txt +5 -0
- text_generation_finetunning_notebook.ipynb +290 -0
app.py
ADDED
@@ -0,0 +1,24 @@
+import streamlit as st
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import torch
+
+st.title("Text Generator: Fine-Tuned Model")
+
+# Load the fine-tuned model and tokenizer
+# ("fine_tuned_model (1).zip" must be extracted to this directory first; see the sketch below)
+model_dir = "fine_tuned_model (1)"
+tokenizer = AutoTokenizer.from_pretrained(model_dir)
+model = AutoModelForCausalLM.from_pretrained(model_dir)
+
+text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+inputs_text = st.text_input("Please enter the text", value="I think I really like this place. Ayesha and I had a chance to visit Cheuvront on a Monday night. It wasn't terribly busy when we arrived and we were warmly greeted. Unfortunately we were seated next to a loud group of young children that thought they knew something of the world")
+
+if st.button("Submit"):
+    generated_text = text_generator(inputs_text, max_length=200, num_return_sequences=1)
+    st.write(generated_text[0]["generated_text"])
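Note that `from_pretrained` expects an extracted directory, while the Space ships the model as "fine_tuned_model (1).zip". A minimal sketch of the one-time extraction step, assuming the zip sits next to app.py and contains the model files at its root (which is how `shutil.make_archive` packs it in the notebook below):

import os
import zipfile

MODEL_ZIP = "fine_tuned_model (1).zip"
MODEL_DIR = "fine_tuned_model (1)"

# Extract once; subsequent Streamlit reruns reuse the directory.
if not os.path.isdir(MODEL_DIR):
    with zipfile.ZipFile(MODEL_ZIP) as zf:
        zf.extractall(MODEL_DIR)

Wrapping the model load in `@st.cache_resource` would also keep Streamlit from reloading the weights on every widget interaction.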
fine_tuned_model (1).zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e571aaa5e03efcbab67c083fb3884631f00fec87b86aeef60a6dbc298b4ed31a
+size 463917526
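This is a Git LFS pointer file: the repository stores only the blob's sha256 digest and byte size, and `git lfs pull` fetches the 464 MB archive itself. A sketch for verifying a fetched blob against the pointer (the local path is an assumption):

import hashlib

EXPECTED_OID = "e571aaa5e03efcbab67c083fb3884631f00fec87b86aeef60a6dbc298b4ed31a"
EXPECTED_SIZE = 463917526

# Hash the file in 1 MiB chunks so the whole archive never sits in memory.
h = hashlib.sha256()
size = 0
with open("fine_tuned_model (1).zip", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
        size += len(chunk)

assert size == EXPECTED_SIZE, f"size mismatch: {size}"
assert h.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("LFS blob matches the pointer")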
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+transformers
+pandas==2.2.2
+torch==2.5.1
+transformers==4.48.3
+streamlit==1.41.1
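The bare `transformers` on the first line is redundant next to the `transformers==4.48.3` pin below it; pip simply intersects the two requirements. A quick sanity check of an installed environment against these pins (a sketch, standard library only; local builds such as "2.5.1+cu121" will show as mismatches):

from importlib.metadata import version

# Package names mirror requirements.txt above.
pins = {"pandas": "2.2.2", "torch": "2.5.1", "transformers": "4.48.3", "streamlit": "1.41.1"}
for pkg, pinned in pins.items():
    installed = version(pkg)
    status = "OK" if installed == pinned else f"MISMATCH (pinned {pinned})"
    print(f"{pkg}: {installed} {status}")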
text_generation_finetunning_notebook.ipynb
ADDED
@@ -0,0 +1,290 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "9665f082-b1e2-4094-a9c4-f5fa4560e01f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "\n",
+    "model_name = \"gpt2\"\n",
+    "model = AutoModelForCausalLM.from_pretrained(model_name)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "\n",
+    "# Ensure the tokenizer uses padding if necessary\n",
+    "tokenizer.pad_token = tokenizer.eos_token\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "8c81406c-1335-4491-b8cd-67770e86e390",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "2fd0c7d7-1c01-416c-af00-2d11a51663f1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "833d3e6bacf94b4f83849b76e554c187",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/36718 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "def tokenize_function(examples):\n",
+    "    return tokenizer(examples[\"text\"], truncation=True, padding=\"max_length\", max_length=512)\n",
+    "\n",
+    "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "85a7f1be-a72d-4b94-b232-4942616810f9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/transformers/training_args.py:1594: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import TrainingArguments\n",
+    "\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir=\"./results\",\n",
+    "    evaluation_strategy=\"epoch\",\n",
+    "    save_strategy=\"epoch\",\n",
+    "    per_device_train_batch_size=8,  # Adjust based on your GPU\n",
+    "    per_device_eval_batch_size=8,\n",
+    "    logging_dir=\"./logs\",\n",
+    "    logging_steps=10,\n",
+    "    num_train_epochs=1,\n",
+    "    report_to=\"none\",  # Change to \"wandb\" or \"tensorboard\" if using logging\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "cb46a328-74ef-420a-b5d7-b3159cc8f5b0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='4590' max='4590' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [4590/4590 1:19:10, Epoch 1/1]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>3.239600</td>\n",
+       "      <td>3.291132</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "TrainOutput(global_step=4590, training_loss=3.347612351062251, metrics={'train_runtime': 4751.264, 'train_samples_per_second': 7.728, 'train_steps_per_second': 0.966, 'total_flos': 9594120830976000.0, 'train_loss': 3.347612351062251, 'epoch': 1.0})"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling\n",
+    "\n",
+    "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=tokenized_datasets[\"train\"],\n",
+    "    eval_dataset=tokenized_datasets[\"validation\"],\n",
+    "    data_collator=data_collator,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "d257f423-a9ea-4fe2-9fcf-bebcf1cd356d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('fine_tuned_model/tokenizer_config.json',\n",
+       " 'fine_tuned_model/special_tokens_map.json',\n",
+       " 'fine_tuned_model/vocab.json',\n",
+       " 'fine_tuned_model/merges.txt',\n",
+       " 'fine_tuned_model/added_tokens.json',\n",
+       " 'fine_tuned_model/tokenizer.json')"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.save_pretrained(\"fine_tuned_model\")\n",
+    "tokenizer.save_pretrained(\"fine_tuned_model\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "493e4e36-45a6-4cd2-b37d-2e8e534f1a39",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Folder 'fine_tuned_model' has been zipped as 'fine_tuned_model.zip'.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import shutil\n",
+    "\n",
+    "# Specify the folder to be zipped\n",
+    "folder_path = \"fine_tuned_model\"  # Replace with your actual folder name\n",
+    "zip_name = \"fine_tuned_model.zip\"  # Desired zip file name\n",
+    "\n",
+    "# Create a zip archive\n",
+    "shutil.make_archive(zip_name.replace('.zip', ''), 'zip', folder_path)\n",
+    "\n",
+    "print(f\"Folder '{folder_path}' has been zipped as '{zip_name}'.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "fda9cf8b-1e3c-47c2-8a60-11cccf2d608a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "d60aa595-6bff-4686-a9ba-3e9b993a54ed",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Device set to use cuda:0\n",
+      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "def quicksort(arr): \n",
+      "\n",
+      "Proscure = \n",
+      "\n",
+      "Faced with a choice between the current and previous values, an error's resolution in a new value is not necessarily in order, since the first one is the first one that does not change. Prof will have to return a retry call for all possible errors returned from the previous value, which is equivalent to a new retry ( q @-@ f ). A simple recursion will perform only one recursion on the results. \n",
+      "\n",
+      "A recursion in alliter @-@ ordered values is done if it's possible to reorder them at all. This means a recursion in the first function of an array's contents is done if it isn 't possible to reorder them at all. This means, for example, that an array would have to be returned the same number of times in order to work as an array is. \n",
+      "\n",
+      "A recursion in\n"
+     ]
+    }
+   ],
+   "source": [
+    "code_generator = pipeline(\"text-generation\", model=\"fine_tuned_model\", tokenizer=tokenizer)\n",
+    "\n",
+    "prompt = \"def quicksort(arr):\"\n",
+    "generated_code = code_generator(prompt, max_length=200, num_return_sequences=1)\n",
+    "\n",
+    "print(generated_code[0][\"generated_text\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c82d049-147d-49e0-bc87-b7793c01dba1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
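One mismatch worth noting: the model was fine-tuned on wikitext-2 (encyclopedic prose, with its characteristic `@-@` tokens), so the `def quicksort(arr):` prompt yields prose rather than code, as the final cell's output shows; a code corpus would be the fit for code generation. A minimal inference sketch that also passes `truncation=True` to silence the pipeline warning and loads the tokenizer from the saved directory (the sampling parameters are illustrative assumptions, not from the notebook):

from transformers import pipeline

# Load both model and tokenizer from the directory written by save_pretrained,
# so the pad-token setting travels with the model.
generator = pipeline("text-generation", model="fine_tuned_model", tokenizer="fine_tuned_model")

outputs = generator(
    "def quicksort(arr):",
    max_length=200,
    num_return_sequences=1,
    truncation=True,   # silences the implicit-truncation warning above
    do_sample=True,    # sampling instead of greedy decoding (assumption)
    temperature=0.7,
    top_p=0.9,
)
print(outputs[0]["generated_text"])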