File size: 6,339 Bytes
3658694
462fea8
3658694
 
462fea8
 
 
 
 
 
 
 
57d40ed
462fea8
57d40ed
 
3658694
 
 
 
57d40ed
 
 
 
 
 
 
 
 
 
 
 
462fea8
57d40ed
 
 
3658694
57d40ed
462fea8
3658694
57d40ed
3658694
57d40ed
 
 
 
 
 
 
 
 
 
 
 
 
 
462fea8
 
 
 
3658694
 
 
57d40ed
462fea8
57d40ed
 
462fea8
 
3658694
 
 
 
 
 
462fea8
3658694
 
 
 
 
57d40ed
3658694
462fea8
 
 
 
 
 
 
 
 
3658694
 
462fea8
3658694
 
462fea8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57d40ed
 
 
 
 
 
462fea8
 
 
3658694
 
462fea8
 
 
3658694
462fea8
 
3658694
 
 
462fea8
3658694
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7949eb2
3658694
 
 
 
 
 
 
 
 
 
 
 
57d40ed
 
 
d062c90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import copy
import re
import tempfile
from tokenize import tokenize

import gradio as gr
import nbformat
import nltk
import spacy
from transformers import pipeline
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoConfig,
    pipeline,
)

# Model identifier passed to `from_pretrained` for the model/tokenizer that
# turn Python source code into a natural-language summary.
PYTHON_CODE_MODEL = "sagard21/python-code-explainer"
# Model identifier passed to `from_pretrained` for the model/tokenizer that
# turn a summary into a short title.
TITLE_SUMMARIZE_MODEL = "fabiochiu/t5-small-medium-title-generation"


class NotebookEnhancer:
    """Insert generated title/summary markdown cells before notebook code cells.

    Summaries come from a seq2seq "summarization" pipeline run on the cell
    source; spaCy sentence splitting and part-of-speech tags are then used to
    pick the title sentence and the remaining summary text.
    """

    # Part-of-speech tags that count as the "noun" of a sentence in `is_valid`.
    _NOUN_POS = frozenset({"NOUN", "PROPN", "PRON"})
    # List enumerations such as "1." or "12."; the look-arounds keep decimals
    # ("0.5") and identifiers ("v2.") intact.
    _ENUMERATION_RE = re.compile(r"(?<![\w.])\d+\.(?!\d)")

    def __init__(self):
        """Load the title model, the code-summary pipeline and the spaCy model."""
        # models + tokenizer for generating titles from code summaries
        self.title_tokenizer = AutoTokenizer.from_pretrained(TITLE_SUMMARIZE_MODEL)
        self.title_summarization_model = AutoModelForSeq2SeqLM.from_pretrained(
            TITLE_SUMMARIZE_MODEL
        )

        # models + tokenizer for generating summaries from Python code
        self.python_model = AutoModelForSeq2SeqLM.from_pretrained(PYTHON_CODE_MODEL)
        self.python_tokenizer = AutoTokenizer.from_pretrained(
            PYTHON_CODE_MODEL, padding=True
        )
        # Reuse the model loaded above instead of passing the model name,
        # which would load a second copy of the same weights.
        self.python_pipeline = pipeline(
            "summarization",
            model=self.python_model,
            tokenizer=self.python_tokenizer,
        )
        # initiate the language model
        self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, summary: str):
        """Generate a concise markdown title (``# ...``) from a summary."""
        # `truncation=True` is required for `max_length` to actually limit
        # the encoded input.
        inputs = self.title_tokenizer(
            ["summarize: " + summary],
            max_length=1024,
            truncation=True,
            return_tensors="pt",
            padding=True,
        )  # Batch size 1
        output = self.title_summarization_model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=10
        )
        decoded_output = self.title_tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0].strip()
        sentences = nltk.sent_tokenize(decoded_output)
        # An empty decode yields no sentences; fall back to the raw text
        # instead of raising IndexError.
        predicted_title = sentences[0] if sentences else decoded_output
        return f"# {predicted_title}"

    def _count_num_words(self, code):
        """Return the number of whitespace-separated tokens in ``code``."""
        # split() treats newlines/tabs as separators and ignores repeated
        # whitespace, so "" counts as 0 words.
        return len(code.split())

    def generate_summary(self, code):
        """Summarize a code cell.

        Returns a ``(title, summary)`` tuple where ``title`` is a markdown
        heading (``"# ..."``) and ``summary`` is the remaining text, which
        may be empty.
        """
        # Truncate over-long cells to the tokenizer's limit rather than
        # feeding the model more tokens than it accepts.
        result = self.python_pipeline(
            code, min_length=5, max_length=64, truncation=True
        )
        summary = result[0]["summary_text"].strip()
        title, summary = self._postprocess_summary(summary)
        return f"# {title}", f"{summary}"

    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
        """Return a new notebook with title/summary cells before each code cell.

        Every non-empty code cell is preceded by a markdown title cell and,
        when there is summary text, a markdown summary cell. Outputs of code
        cells are cleared in the returned notebook. The input notebook is
        left unmodified.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = copy.deepcopy(notebook.metadata)
        for original_cell in notebook.cells:
            # Work on a copy so the caller's notebook is not mutated.
            cell = copy.deepcopy(original_cell)
            if cell.cell_type == "code":
                if cell.source.strip():
                    title, summary = self.generate_summary(cell.source)
                    # Ids are left to nbformat (integer ids are invalid) and
                    # markdown cells get no `outputs` field.
                    enhanced_notebook.cells.append(
                        nbformat.v4.new_markdown_cell(title)
                    )
                    if summary:
                        enhanced_notebook.cells.append(
                            nbformat.v4.new_markdown_cell(summary)
                        )
                # Only code cells carry outputs.
                cell.outputs = []
            enhanced_notebook.cells.append(cell)
        return enhanced_notebook

    def is_valid(self, words: "spacy.tokens.Span"):
        """Return True if ``words`` contains a noun-like token and a verb.

        ``words`` is any iterable of objects exposing a ``pos_`` attribute
        (here, the tokens of a spaCy sentence).
        """
        pos_tags = {word.pos_ for word in words}
        return "VERB" in pos_tags and not pos_tags.isdisjoint(self._NOUN_POS)

    def _postprocess_summary(self, summary: str):
        """Split a generated summary into ``(title, remaining_summary)``.

        Sentences without both a noun and a verb are dropped, list
        enumerations ("1.", "2.") are removed, and whitespace is collapsed.
        The first remaining sentence is the title; the rest are joined with
        spaces. If no sentence passes the noun/verb check, all sentences are
        used; if nothing is left at all, the stripped input is the title.
        """
        sentences = list(self.nlp(summary).sents)
        # Fall back to every sentence rather than failing on an empty list.
        kept = [s for s in sentences if self.is_valid(s)] or sentences
        cleaned = []
        for sentence in kept:
            text = self._ENUMERATION_RE.sub("", sentence.text)
            text = " ".join(text.split())
            if text:
                cleaned.append(text)
        if not cleaned:
            return summary.strip(), ""
        return cleaned[0], " ".join(cleaned[1:])


# Lazily created, process-wide enhancer so the models are loaded only once.
_enhancer_instance = None


def _get_enhancer():
    """Return the shared NotebookEnhancer, creating it on first use."""
    global _enhancer_instance
    if _enhancer_instance is None:
        _enhancer_instance = NotebookEnhancer()
    return _enhancer_instance


def process_notebook(file_path):
    """Process an uploaded notebook file and return the enhanced file's path.

    ``file_path`` is either a filesystem path or an object exposing the path
    as ``.name`` (file wrappers). Raises ``gr.Error`` when no file was
    provided. The result is written to a fresh temporary ``.ipynb`` file so
    concurrent requests cannot overwrite each other's output.
    """
    if file_path is None:
        raise gr.Error("Please upload a Jupyter notebook (.ipynb) first.")
    # Accept both plain paths and file-like wrappers that carry `.name`.
    source_path = getattr(file_path, "name", file_path)

    with open(source_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)
    # Process the notebook
    enhanced_notebook = _get_enhancer().enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)
    # Save to a unique temp file; delete=False keeps it for the caller.
    with tempfile.NamedTemporaryFile(
        "w",
        suffix=".ipynb",
        prefix="enhanced_notebook_",
        delete=False,
        encoding="utf-8",
    ) as f:
        f.write(enhanced_notebook_str)
        output_path = f.name

    return output_path


def build_gradio_interface():
    """Create the Gradio interface and return it without launching.

    The returned ``gr.Blocks`` has a notebook upload widget, a button wired
    to ``process_notebook``, and a file output for the result.
    """
    with gr.Blocks(title="Notebook Enhancer") as demo:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(
            """
        Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.
        
        This tool uses Hugging Face models to:
        1. Generate concise titles for code cells
        2. Create explanatory summaries of what the code does
        """
        )

        with gr.Row():
            with gr.Column():
                # Only offer notebook files in the upload dialog.
                file_input = gr.File(
                    label="Upload Jupyter Notebook (.ipynb)",
                    file_types=[".ipynb"],
                )
                process_btn = gr.Button("Enhance Notebook")

            with gr.Column():
                output = gr.File(label="Enhanced Notebook")

        process_btn.click(fn=process_notebook, inputs=file_input, outputs=output)

    return demo


# Entry point when the file is run as a script: build the UI and serve it.
if __name__ == "__main__":
    # To process a notebook without the UI, call process_notebook directly:
    # file_input = "my_notebook.json"
    # test = process_notebook(file_input)
    demo = build_gradio_interface()
    # NOTE(review): share=True presumably requests a public share link in
    # addition to the local server; confirm that exposure is intended.
    demo.launch(share=True)