irmchek committed on
Commit
9496a5f
·
1 Parent(s): 57d40ed

adding app to test on huggingface

Browse files
Files changed (1) hide show
  1. app.py +174 -0
app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nbformat
2
+ import spacy
3
+ import gradio as gr
4
+ from transformers import pipeline
5
+ from tokenize import tokenize
6
+ from transformers import (
7
+ AutoModelForSeq2SeqLM,
8
+ AutoTokenizer,
9
+ AutoConfig,
10
+ pipeline,
11
+ )
12
+ import re
13
+ import nltk
14
+
15
# Hugging Face model id loaded by NotebookEnhancer to generate summaries from Python code.
+ PYTHON_CODE_MODEL = "sagard21/python-code-explainer"
16
# Hugging Face model id loaded by NotebookEnhancer to generate titles from code summaries.
+ TITLE_SUMMARIZE_MODEL = "fabiochiu/t5-small-medium-title-generation"
17
+
18
+
19
class NotebookEnhancer:
    """Insert generated title and summary markdown cells before notebook code cells.

    A seq2seq "summarization" pipeline explains each code cell; spaCy splits
    the explanation into sentences, the first usable sentence becomes the
    title and the remaining ones become the summary.
    """

    # Matches list enumerators such as "1." or "12." while leaving decimals
    # and version numbers ("3.5", "v2.") untouched.
    _ENUMERATION_RE = re.compile(r"(?<![\w.])[0-9]+\.(?![0-9])")

    def __init__(self):
        """Load the title model, the code-explanation pipeline and spaCy."""
        # Tokenizer + model for generating titles from code summaries.
        self.title_tokenizer = AutoTokenizer.from_pretrained(TITLE_SUMMARIZE_MODEL)
        self.title_summarization_model = AutoModelForSeq2SeqLM.from_pretrained(
            TITLE_SUMMARIZE_MODEL
        )

        # Tokenizer + model for generating summaries from Python code.
        self.python_model = AutoModelForSeq2SeqLM.from_pretrained(PYTHON_CODE_MODEL)
        self.python_tokenizer = AutoTokenizer.from_pretrained(
            PYTHON_CODE_MODEL, padding=True
        )
        # Reuse the model instance loaded above instead of passing the model
        # name, which would load a second copy of the same weights.
        self.python_pipeline = pipeline(
            "summarization",
            model=self.python_model,
            tokenizer=self.python_tokenizer,
        )
        # spaCy language model used for sentence splitting and POS tagging.
        self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, summary: str):
        """Generate a concise markdown heading for a code-cell summary.

        Args:
            summary: Natural-language description of the code cell.

        Returns:
            The first sentence of the generated title, prefixed with "# ".
        """
        inputs = self.title_tokenizer.batch_encode_plus(
            ["summarize: " + summary],
            max_length=1024,
            truncation=True,  # make the max_length cut-off explicit
            return_tensors="pt",
            padding=True,
        )  # Batch size 1
        output = self.title_summarization_model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=10
        )
        decoded_output = self.title_tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0].strip()
        # sent_tokenize returns [] for empty text; fall back to the raw output.
        sentences = nltk.sent_tokenize(decoded_output)
        predicted_title = sentences[0] if sentences else decoded_output
        return f"# {predicted_title}"

    def _count_num_words(self, code):
        """Return the number of whitespace-separated tokens in ``code``."""
        # split() without arguments treats newlines/tabs as separators and
        # does not produce empty strings for runs of spaces.
        return len(code.split())

    def generate_summary(self, code):
        """Generate a title and a summary for the source of a code cell.

        Args:
            code: Source text of the code cell.

        Returns:
            A ``(title, summary)`` tuple of strings; the title is prefixed
            with "# " and the summary may be empty.
        """
        result = self.python_pipeline(
            code, min_length=5, max_length=64, truncation=True
        )
        summary = result[0]["summary_text"].strip()
        title, summary = self._postprocess_summary(summary)
        return f"# {title}", f"{summary}"

    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
        """Add title and summary markdown cells before each code cell.

        Args:
            notebook: The notebook to enhance. Its cells are reused in the
                result, and the outputs of its code cells are cleared in
                place.

        Returns:
            A new notebook sharing ``notebook.metadata`` in which every
            non-empty code cell is preceded by the generated markdown cells.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
        for cell in notebook.cells:
            is_code = cell.cell_type == "code"
            if is_code and cell.source.strip():
                title, summary = self.generate_summary(cell.source)
                for text in (title, summary):
                    # Skip cells with no content beyond the "# " heading
                    # marker (e.g. the summary of a one-sentence explanation).
                    if text.strip("# "):
                        # new_markdown_cell produces a schema-valid cell, so
                        # no manual id or outputs field is set here.
                        enhanced_notebook.cells.append(
                            nbformat.v4.new_markdown_cell(text)
                        )
            if is_code:
                # Only code cells may carry outputs in the notebook schema.
                cell.outputs = []
            enhanced_notebook.cells.append(cell)
        return enhanced_notebook

    def is_valid(self, words: "spacy.tokens.Span"):
        """Return True when ``words`` contains both a noun-like token and a verb.

        Args:
            words: Iterable of tokens exposing a ``pos_`` attribute.
        """
        has_noun = False
        has_verb = False
        for word in words:
            if word.pos_ in ("NOUN", "PROPN", "PRON"):
                has_noun = True
            elif word.pos_ == "VERB":
                has_verb = True
            if has_noun and has_verb:
                return True
        return False

    def _clean_sentence(self, text: str):
        """Strip list enumerators from ``text`` and normalise its whitespace."""
        return " ".join(self._ENUMERATION_RE.sub("", text).split())

    def _postprocess_summary(self, summary: str):
        """Split a raw model summary into a title and the remaining summary.

        Sentences without both a noun and a verb are dropped and list
        enumerators are removed.

        Args:
            summary: Raw text produced by the summarization pipeline.

        Returns:
            A ``(title, summary)`` tuple. When no sentence qualifies, the
            cleaned raw text is used as the title and the summary is empty.
        """
        doc = self.nlp(summary)
        cleaned = (
            self._clean_sentence(sentence.text)
            for sentence in doc.sents
            if self.is_valid(sentence)
        )
        postprocessed_sentences = [text for text in cleaned if text]
        if not postprocessed_sentences:
            # Nothing passed the noun+verb filter; keep the text rather than
            # failing on an empty list.
            return self._clean_sentence(summary), ""
        title, *rest = postprocessed_sentences
        return title, " ".join(rest)
124
+
125
# Shared NotebookEnhancer, created on first use so the models are loaded once
# per process instead of once per request.
_ENHANCER = None


def _get_enhancer():
    """Return the shared NotebookEnhancer, creating it on first use."""
    global _ENHANCER
    if _ENHANCER is None:
        _ENHANCER = NotebookEnhancer()
    return _ENHANCER


def process_notebook(file_path):
    """Enhance an uploaded notebook and return the path of the result.

    Args:
        file_path: Path of the uploaded ``.ipynb`` file, or an object
            exposing the path as ``.name``.

    Returns:
        Path of the enhanced notebook, written to a fresh temporary
        directory as ``enhanced_notebook.ipynb``.

    Raises:
        gr.Error: If no file was provided.
    """
    import os
    import tempfile

    if file_path is None:
        raise gr.Error("Please upload a Jupyter notebook (.ipynb) first.")
    # NOTE(review): some Gradio versions appear to pass a temp-file wrapper
    # rather than a plain path; accept both - confirm against the pinned version.
    source_path = getattr(file_path, "name", file_path)

    with open(source_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    enhanced_notebook = _get_enhancer().enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)

    # A per-request directory keeps the download name stable while preventing
    # concurrent requests from overwriting each other's output.
    output_dir = tempfile.mkdtemp(prefix="notebook_enhancer_")
    output_path = os.path.join(output_dir, "enhanced_notebook.ipynb")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)

    return output_path
140
+
141
+
142
+ def build_gradio_interface():
143
+ """Create and launch the Gradio interface"""
144
+ with gr.Blocks(title="Notebook Enhancer") as demo:
145
+ gr.Markdown("# Jupyter Notebook Enhancer")
146
+ gr.Markdown(
147
+ """
148
+ Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.
149
+
150
+ This tool uses Hugging Face models to:
151
+ 1. Generate concise titles for code cells
152
+ 2. Create explanatory summaries of what the code does
153
+ """
154
+ )
155
+
156
+ with gr.Row():
157
+ with gr.Column():
158
+ file_input = gr.File(label="Upload Jupyter Notebook (.ipynb)")
159
+ process_btn = gr.Button("Enhance Notebook")
160
+
161
+ with gr.Column():
162
+ output = gr.File(label="Enhanced Notebook")
163
+
164
+ process_btn.click(fn=process_notebook, inputs=file_input, outputs=output)
165
+
166
+ return demo
167
+
168
+
169
# Script entry point: build the Gradio interface and start serving it.
if __name__ == "__main__":
    demo = build_gradio_interface()
    demo.launch()