Spaces:

irmchek
/

mynotebooksummary

Sleeping

App Files Files Community

irmchek committed 14 days ago

Commit

9496a5f

1 Parent(s): 57d40ed

adding app to test on huggingface

Browse files

Files changed (1) hide show

app.py +174 -0

app.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import nbformat
+import spacy
+import gradio as gr
+from transformers import pipeline
+from tokenize import tokenize
+from transformers import (
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    AutoConfig,
+    pipeline,
+)
+import re
+import nltk
+# Hugging Face checkpoint name loaded by NotebookEnhancer to summarize Python code.
+PYTHON_CODE_MODEL = "sagard21/python-code-explainer"
+# Hugging Face checkpoint name loaded by NotebookEnhancer to generate titles.
+TITLE_SUMMARIZE_MODEL = "fabiochiu/t5-small-medium-title-generation"
+class NotebookEnhancer:
+    """Annotate a notebook's code cells with generated markdown titles and summaries.
+    Constructing an instance loads two Hugging Face seq2seq checkpoints (title
+    generation and Python code explanation) and the spaCy ``en_core_web_sm`` pipeline.
+    """
+    # Stand-alone list-enumeration markers such as "1."; digits that are part of an
+    # identifier or a decimal ("x1.shape", "3.5") are deliberately left untouched.
+    _ENUMERATION_RE = re.compile(r"(?<![\w.])[0-9]+\.(?=\s|$)")
+    def __init__(self):
+        # models + tokenizer for generating titles from code summaries
+        self.title_tokenizer = AutoTokenizer.from_pretrained(TITLE_SUMMARIZE_MODEL)
+        self.title_summarization_model = AutoModelForSeq2SeqLM.from_pretrained(
+            TITLE_SUMMARIZE_MODEL
+        )
+        # models + tokenizer for generating summaries from Python code
+        # NOTE(review): the pipeline below is given the checkpoint *name*, so it loads
+        # its own copy of the model; self.python_model is not used by it.
+        self.python_model = AutoModelForSeq2SeqLM.from_pretrained(PYTHON_CODE_MODEL)
+        self.python_tokenizer = AutoTokenizer.from_pretrained(
+            PYTHON_CODE_MODEL, padding=True
+        )
+        self.python_pipeline = pipeline(
+            "summarization",
+            model=PYTHON_CODE_MODEL,
+            config=AutoConfig.from_pretrained(PYTHON_CODE_MODEL),
+            tokenizer=self.python_tokenizer,
+        )
+        # spaCy pipeline used for sentence splitting and part-of-speech tags
+        self.nlp = spacy.load("en_core_web_sm")
+    def generate_title(self, summary: str):
+        """Generate a markdown heading ("# ...") for a summary with the title model.
+        Generation samples (do_sample=True) and is pinned to exactly 10 tokens
+        (min_length == max_length), so the result can differ between calls.
+        NOTE(review): nltk.sent_tokenize needs NLTK's sentence tokenizer data, which
+        this file never downloads - confirm it is available before using this method.
+        """
+        inputs = self.title_tokenizer.batch_encode_plus(
+            ["summarize: " + summary],
+            max_length=1024,
+            return_tensors="pt",
+            padding=True,
+        )  # Batch size 1
+        output = self.title_summarization_model.generate(
+            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=10
+        )
+        decoded_output = self.title_tokenizer.batch_decode(
+            output, skip_special_tokens=True
+        )[0].strip()
+        # Keep only the first sentence; fall back to the raw text when the
+        # tokenizer finds no sentence (e.g. empty output) instead of raising.
+        sentences = nltk.sent_tokenize(decoded_output)
+        predicted_title = sentences[0] if sentences else decoded_output
+        return f"# {predicted_title}"
+    def _count_num_words(self, code):
+        """Return the number of pieces obtained by splitting *code* on single spaces."""
+        return len(code.split(" "))
+    def generate_summary(self, code):
+        """Explain a code cell and return ``(markdown_heading, remaining_summary)``.
+        The first usable sentence of the model output becomes the "# ..." heading;
+        the remaining sentences, joined by spaces, become the summary (may be "").
+        """
+        # truncation=True keeps cells longer than the model's input limit from
+        # overflowing it; short cells are unaffected.
+        result = self.python_pipeline(
+            code, min_length=5, max_length=64, truncation=True
+        )
+        raw_summary = result[0]["summary_text"].strip()
+        title, summary = self._postprocess_summary(raw_summary)
+        return f"# {title}", f"{summary}"
+    def enhance_notebook(self, notebook: nbformat.notebooknode.NotebookNode):
+        """Return a new notebook with a title and summary cell before each code cell.
+        Every non-empty code cell is preceded by a markdown title cell and, when the
+        summary is non-empty, a markdown summary cell. Outputs of code cells are
+        cleared. The original cell objects and metadata are reused, not copied, so
+        the input notebook's code cells lose their outputs as well.
+        """
+        enhanced_notebook = nbformat.v4.new_notebook()
+        enhanced_notebook.metadata = notebook.metadata
+        for cell in notebook.cells:
+            if cell.cell_type == "code":
+                if cell.source.strip():
+                    title, summary = self.generate_summary(cell.source)
+                    # Leave id handling to nbformat and do not add an "outputs"
+                    # field: the v4 schema allows neither integer ids nor outputs
+                    # on markdown cells.
+                    enhanced_notebook.cells.append(
+                        nbformat.v4.new_markdown_cell(title)
+                    )
+                    if summary.strip():
+                        enhanced_notebook.cells.append(
+                            nbformat.v4.new_markdown_cell(summary)
+                        )
+                # Only code cells carry outputs; markdown/raw cells stay untouched.
+                cell.outputs = []
+            enhanced_notebook.cells.append(cell)
+        return enhanced_notebook
+    def is_valid(self, words):
+        """Return True when *words* contains both a noun-like token and a verb.
+        *words* is any iterable of objects exposing ``pos_`` (spaCy tokens, e.g. a
+        sentence span). NOUN, PROPN and PRON count as noun-like.
+        """
+        tags = {word.pos_ for word in words}
+        has_noun = not tags.isdisjoint({"NOUN", "PROPN", "PRON"})
+        return has_noun and "VERB" in tags
+    def _postprocess_summary(self, summary: str):
+        """Split a raw model summary into ``(title, rest)``.
+        Sentences without both a noun and a verb are dropped and enumeration markers
+        ("1.") are removed. The first surviving sentence is the title and the others
+        are joined with spaces. If nothing survives, the cleaned raw text is used as
+        the title and the rest is empty, rather than raising IndexError.
+        """
+        doc = self.nlp(summary)
+        kept = []
+        for sentence in doc.sents:
+            if not self.is_valid(sentence):
+                continue
+            text = self._ENUMERATION_RE.sub("", sentence.text).strip()
+            if text:
+                kept.append(text)
+        if not kept:
+            return self._ENUMERATION_RE.sub("", summary).strip(), ""
+        return kept[0], " ".join(kept[1:])
+# Shared enhancer instance; created lazily so the models are loaded once
+# instead of on every button click.
+_ENHANCER = None
+def _get_enhancer():
+    """Return the shared NotebookEnhancer, constructing it on first use."""
+    global _ENHANCER
+    if _ENHANCER is None:
+        _ENHANCER = NotebookEnhancer()
+    return _ENHANCER
+def process_notebook(file_path):
+    """Enhance an uploaded notebook and return the path of the written result.
+    *file_path* is the value Gradio passes for the upload: a path string, or an
+    object whose ``name`` attribute is the path (depends on the Gradio version).
+    Raises gr.Error when nothing was uploaded. The enhanced notebook is written
+    as "enhanced_notebook.ipynb" inside a fresh temporary directory so that
+    concurrent requests cannot overwrite each other's output.
+    """
+    import os
+    import tempfile
+    if file_path is None:
+        raise gr.Error("Please upload a Jupyter notebook (.ipynb) first.")
+    source_path = getattr(file_path, "name", file_path)
+    with open(source_path, "r", encoding="utf-8") as f:
+        nb = nbformat.read(f, as_version=4)
+    # Process the notebook
+    enhanced_notebook = _get_enhancer().enhance_notebook(nb)
+    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)
+    # Save to a per-request temp directory, keeping the download file name
+    output_dir = tempfile.mkdtemp(prefix="enhanced_notebook_")
+    output_path = os.path.join(output_dir, "enhanced_notebook.ipynb")
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(enhanced_notebook_str)
+    return output_path
+def build_gradio_interface():
+    """Assemble the Gradio Blocks UI and return it; launching is left to the caller."""
+    intro_text = """
+        Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.
+        This tool uses Hugging Face models to:
+        1. Generate concise titles for code cells
+        2. Create explanatory summaries of what the code does
+        """
+    with gr.Blocks(title="Notebook Enhancer") as app:
+        # Page heading followed by the usage description
+        gr.Markdown("# Jupyter Notebook Enhancer")
+        gr.Markdown(intro_text)
+        with gr.Row():
+            # Left column: upload + trigger; right column: downloadable result
+            with gr.Column():
+                notebook_upload = gr.File(label="Upload Jupyter Notebook (.ipynb)")
+                enhance_button = gr.Button("Enhance Notebook")
+            with gr.Column():
+                enhanced_download = gr.File(label="Enhanced Notebook")
+        enhance_button.click(
+            fn=process_notebook,
+            inputs=notebook_upload,
+            outputs=enhanced_download,
+        )
+    return app
+# Script entry point: build the Gradio UI and start serving it.
+if __name__ == "__main__":
+    # Manual run without the UI, kept for reference:
+    # file_input = "my_notebook.json"
+    # test = process_notebook(file_input)
+    demo = build_gradio_interface()
+    demo.launch()