Spaces:

irmchek
/

mynotebooksummary

Sleeping

File size: 6,339 Bytes

import nbformat
import spacy
import gradio as gr
from transformers import pipeline
from tokenize import tokenize
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    AutoConfig,
    pipeline,
)
import re
import nltk

# Checkpoint name passed to `from_pretrained`/`pipeline` for the model that
# turns Python source code into a natural-language explanation.
PYTHON_CODE_MODEL = "sagard21/python-code-explainer"
# Checkpoint name passed to `from_pretrained` for the model that condenses a
# piece of text into a short title.
TITLE_SUMMARIZE_MODEL = "fabiochiu/t5-small-medium-title-generation"


class NotebookEnhancer:
    """Annotate Jupyter notebooks with generated titles and summaries.

    For every non-empty code cell, a code-explanation model produces a short
    text. Its first usable sentence becomes a markdown heading and the
    remaining sentences become a markdown summary; both are inserted directly
    before the code cell.
    """

    # A list-enumeration marker such as "1." or "12.": digits followed by a
    # dot, standing on their own (not part of "3.10", "v2.", "x1." ...).
    _ENUMERATION_RE = re.compile(r"(?<![\w.])[0-9]+\.(?!\S)")
    # spaCy coarse part-of-speech tags accepted as the "noun" of a sentence.
    _NOUN_TAGS = frozenset({"NOUN", "PROPN", "PRON"})

    def __init__(self):
        """Load the title model, the code-explanation pipeline and spaCy."""
        # models + tokenizer for generating titles from code summaries
        self.title_tokenizer = AutoTokenizer.from_pretrained(TITLE_SUMMARIZE_MODEL)
        self.title_summarization_model = AutoModelForSeq2SeqLM.from_pretrained(
            TITLE_SUMMARIZE_MODEL
        )

        # models + tokenizer for generating summaries from Python code
        self.python_model = AutoModelForSeq2SeqLM.from_pretrained(PYTHON_CODE_MODEL)
        self.python_tokenizer = AutoTokenizer.from_pretrained(
            PYTHON_CODE_MODEL, padding=True
        )
        # Hand the already-loaded model object to the pipeline; passing the
        # checkpoint name instead would load the same weights a second time.
        self.python_pipeline = pipeline(
            "summarization",
            model=self.python_model,
            config=AutoConfig.from_pretrained(PYTHON_CODE_MODEL),
            tokenizer=self.python_tokenizer,
        )
        # initiate the language model
        self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, summary: str) -> str:
        """Generate a concise markdown heading for a code summary.

        Args:
            summary: Natural-language description to condense into a title.

        Returns:
            The generated title prefixed with ``"# "``.
        """
        # `truncation=True` is required for `max_length` to take effect;
        # without it the tokenizer ignores the limit.
        inputs = self.title_tokenizer(
            ["summarize: " + summary],
            max_length=1024,
            truncation=True,
            return_tensors="pt",
            padding=True,
        )  # Batch size 1
        # NOTE(review): min_length == max_length pins the output to exactly
        # 10 tokens -- confirm this is intended.
        output = self.title_summarization_model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=10
        )
        decoded_output = self.title_tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0].strip()
        # Keep only the first sentence; fall back to the raw text when the
        # sentence splitter returns nothing (e.g. empty model output).
        sentences = nltk.sent_tokenize(decoded_output)
        predicted_title = sentences[0] if sentences else decoded_output
        return f"# {predicted_title}"

    def _count_num_words(self, code) -> int:
        """Return the number of whitespace-separated words in ``code``."""
        # split() without an argument collapses runs of whitespace, so empty
        # strings are never counted as words.
        return len(code.split())

    def generate_summary(self, code) -> tuple[str, str]:
        """Generate a heading and a detailed summary for a code cell.

        Args:
            code: Source text of the code cell.

        Returns:
            A ``(title, summary)`` pair: the title is prefixed with ``"# "``,
            the summary may be an empty string.
        """
        # truncation=True keeps over-long cells within the tokenizer's limit
        # instead of feeding the whole cell to the model.
        result = self.python_pipeline(
            code, min_length=5, max_length=64, truncation=True
        )
        summary = result[0]["summary_text"].strip()
        title, summary = self._postprocess_summary(summary)
        return f"# {title}", f"{summary}"

    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
        """Add title and summary markdown cells before each code cell.

        The cells of ``notebook`` are reused (not copied) and the outputs of
        its code cells are cleared in place.

        Args:
            notebook: The parsed notebook to annotate.

        Returns:
            A new notebook sharing ``notebook``'s metadata, with generated
            markdown cells interleaved before each non-empty code cell.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata
        for cell in notebook.cells:
            if cell.cell_type == "code":
                if cell.source.strip():
                    title, summary = self.generate_summary(cell.source)
                    # new_markdown_cell supplies a valid string id itself
                    # (where the installed nbformat supports cell ids), and
                    # markdown cells must not carry an `outputs` field.
                    enhanced_notebook.cells.append(
                        nbformat.v4.new_markdown_cell(title)
                    )
                    # Skip the summary cell when there is nothing to say
                    # beyond the title.
                    if summary.strip():
                        enhanced_notebook.cells.append(
                            nbformat.v4.new_markdown_cell(summary)
                        )
                # Only code cells have outputs; strip them from the result.
                cell.outputs = []
            # Add the original cell
            enhanced_notebook.cells.append(cell)
        return enhanced_notebook

    def is_valid(self, words: "Iterable[spacy.tokens.Token]") -> bool:
        """Return True if ``words`` contain both a noun-like token and a verb.

        Args:
            words: Iterable of objects exposing a ``pos_`` attribute (spaCy
                tokens, e.g. a sentence span).
        """
        tags = {word.pos_ for word in words}
        return "VERB" in tags and not tags.isdisjoint(self._NOUN_TAGS)

    def _postprocess_summary(self, summary: str) -> tuple[str, str]:
        """Split a raw model summary into a title and the remaining text.

        Sentences lacking a noun or a verb are dropped and list-enumeration
        markers ("1.", "2.", ...) are removed. The first remaining sentence
        is the title, the rest are joined with spaces.

        Args:
            summary: Raw text produced by the code-explanation model.

        Returns:
            A ``(title, rest)`` pair. When no sentence qualifies, the cleaned
            raw summary is returned as the title and ``rest`` is empty.
        """
        doc = self.nlp(summary)
        cleaned_sentences = []
        for sentence in doc.sents:
            if not self.is_valid(sentence):
                continue
            # remove the list enumeration and the whitespace it leaves behind
            sentence_text = self._ENUMERATION_RE.sub("", sentence.text).strip()
            if sentence_text:
                cleaned_sentences.append(sentence_text)
        if not cleaned_sentences:
            # Nothing passed the noun+verb filter: fall back to the whole
            # summary instead of failing on an empty list.
            return self._ENUMERATION_RE.sub("", summary).strip(), ""
        title, *rest = cleaned_sentences
        return title, " ".join(rest)


# Process-wide NotebookEnhancer, created on first use so that the models are
# loaded once instead of on every request.
_ENHANCER = None


def _get_enhancer():
    """Return the shared NotebookEnhancer, creating it on first use."""
    global _ENHANCER
    if _ENHANCER is None:
        _ENHANCER = NotebookEnhancer()
    return _ENHANCER


def process_notebook(file_path):
    """Process an uploaded notebook file.

    Args:
        file_path: Path of the ``.ipynb`` file to enhance.

    Returns:
        Path of the written, enhanced notebook. Each call writes into its own
        fresh temporary directory so concurrent requests cannot overwrite
        each other's result; the directory is not removed afterwards.

    Raises:
        ValueError: If no file path was supplied (e.g. the button was pressed
            before a notebook was uploaded).
    """
    import os
    import tempfile

    if not file_path:
        raise ValueError("No notebook file was provided.")

    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)
    # Process the notebook
    enhanced_notebook = _get_enhancer().enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)
    # Save to a per-request temp directory, keeping a stable file name
    output_dir = tempfile.mkdtemp(prefix="notebook_enhancer_")
    output_path = os.path.join(output_dir, "enhanced_notebook.ipynb")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)

    return output_path


def build_gradio_interface():
    """Create the Gradio interface (the caller is responsible for launching it).

    Returns:
        The ``gr.Blocks`` app: a notebook upload, an "Enhance Notebook" button
        wired to ``process_notebook``, and a file output for the result.
    """
    with gr.Blocks(title="Notebook Enhancer") as demo:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(
            """
        Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.
        
        This tool uses Hugging Face models to:
        1. Generate concise titles for code cells
        2. Create explanatory summaries of what the code does
        """
        )

        with gr.Row():
            with gr.Column():
                # Restrict the picker to notebooks so other file types are
                # rejected before they reach process_notebook.
                file_input = gr.File(
                    label="Upload Jupyter Notebook (.ipynb)",
                    file_types=[".ipynb"],
                )
                process_btn = gr.Button("Enhance Notebook")

            with gr.Column():
                output = gr.File(label="Enhanced Notebook")

        process_btn.click(fn=process_notebook, inputs=file_input, outputs=output)

    return demo


# Entry point when the file is run as a script: build the UI and serve it.
if __name__ == "__main__":
    # For local debugging without the UI, process_notebook(path) can be called
    # directly with the path of a notebook file.
    demo = build_gradio_interface()
    # NOTE(review): share=True requests a public Gradio share link -- confirm
    # this is wanted in the deployment environment.
    demo.launch(share=True)