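# Residue from the web file viewer this source was copied from, kept as
# comments so that the module parses:
#   irmchek's picture
#   adding app to test on huggingface
#   9496a5f
#   raw
#   history blame
#   6.3 kB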
import nbformat
import spacy
import gradio as gr
from transformers import pipeline
from tokenize import tokenize
from transformers import (
AutoModelForSeq2SeqLM,
AutoTokenizer,
AutoConfig,
pipeline,
)
import re
import nltk
# Model identifier passed to from_pretrained()/pipeline() for the model,
# tokenizer and "summarization" pipeline that summarize Python code.
PYTHON_CODE_MODEL = "sagard21/python-code-explainer"
# Model identifier passed to from_pretrained() for the tokenizer and seq2seq
# model that turn a summary into a short title.
TITLE_SUMMARIZE_MODEL = "fabiochiu/t5-small-medium-title-generation"
class NotebookEnhancer:
    """Insert generated markdown titles and summaries ahead of notebook code cells.

    Constructing an instance loads all models up front:

    * ``title_tokenizer`` / ``title_summarization_model`` -- seq2seq model used
      by :meth:`generate_title`.
    * ``python_tokenizer`` / ``python_pipeline`` -- "summarization" pipeline
      used by :meth:`generate_summary`.
    * ``nlp`` -- spaCy pipeline used to split a summary into sentences and to
      read part-of-speech tags.
    """

    # Enumeration markers such as "1." (digits followed by a dot).  Raw string:
    # "\." is not a valid escape sequence in a normal string literal.
    _ENUMERATION_RE = re.compile(r"[0-9]+\.")

    # Coarse part-of-speech tags accepted as the "noun" of a sentence.
    _NOUN_LIKE_TAGS = frozenset({"NOUN", "PROPN", "PRON"})

    def __init__(self):
        """Load the title model, the code-summary pipeline and the spaCy model."""
        # models + tokenizer for generating titles from code summaries
        self.title_tokenizer = AutoTokenizer.from_pretrained(TITLE_SUMMARIZE_MODEL)
        self.title_summarization_model = AutoModelForSeq2SeqLM.from_pretrained(
            TITLE_SUMMARIZE_MODEL
        )
        # models + tokenizer for generating summaries from Python code
        # NOTE(review): no method of this class reads ``python_model``; the
        # pipeline below is given the model *name*, not this instance.
        self.python_model = AutoModelForSeq2SeqLM.from_pretrained(PYTHON_CODE_MODEL)
        self.python_tokenizer = AutoTokenizer.from_pretrained(
            PYTHON_CODE_MODEL, padding=True
        )
        self.python_pipeline = pipeline(
            "summarization",
            model=PYTHON_CODE_MODEL,
            config=AutoConfig.from_pretrained(PYTHON_CODE_MODEL),
            tokenizer=self.python_tokenizer,
        )
        # language model used for sentence splitting and POS tagging
        self.nlp = spacy.load("en_core_web_sm")

    def generate_title(self, summary: str):
        """Generate a concise markdown heading (``"# ..."``) from a summary.

        Not called by the other methods of this class: ``generate_summary``
        takes its title from the first sentence of the summary instead.
        """
        inputs = self.title_tokenizer.batch_encode_plus(
            ["summarize: " + summary],
            max_length=1024,
            return_tensors="pt",
            padding=True,
        )  # Batch size 1
        # min_length and max_length are both 10.
        output = self.title_summarization_model.generate(
            **inputs, num_beams=8, do_sample=True, min_length=10, max_length=10
        )
        decoded_output = self.title_tokenizer.batch_decode(
            output, skip_special_tokens=True
        )[0].strip()
        # Keep only the first sentence; an empty decode yields no sentences,
        # so fall back to the decoded text instead of indexing an empty list.
        sentences = nltk.sent_tokenize(decoded_output)
        predicted_title = sentences[0] if sentences else decoded_output
        return f"# {predicted_title}"

    def _count_num_words(self, code):
        """Return the number of pieces obtained by splitting ``code`` on single spaces."""
        words = code.split(" ")
        return len(words)

    def generate_summary(self, code):
        """Summarize a code cell.

        Returns:
            A ``(title, summary)`` tuple: ``title`` is a markdown heading built
            from the first usable summary sentence, ``summary`` is the
            remaining sentences joined by spaces (possibly empty).
        """
        result = self.python_pipeline(code, min_length=5, max_length=64)
        raw_summary = result[0]["summary_text"].strip()
        title, summary = self._postprocess_summary(raw_summary)
        return f"# {title}", summary

    def enhance_notebook(self, notebook: "nbformat.notebooknode.NotebookNode"):
        """Return a new notebook with title/summary markdown cells before code cells.

        For every non-empty code cell a title cell and, when the summary is not
        empty, a summary cell are inserted ahead of it.  Outputs of code cells
        are cleared in place on the input cells, which are reused (not copied)
        in the result; the metadata object is shared with the input as well.
        """
        enhanced_notebook = nbformat.v4.new_notebook()
        enhanced_notebook.metadata = notebook.metadata

        for cell in notebook.cells:
            if cell.cell_type == "code":
                if cell.source.strip():
                    title, summary = self.generate_summary(cell.source)
                    # new_markdown_cell() builds a schema-conformant cell; do
                    # not attach integer ids or an ``outputs`` list, neither
                    # of which the v4 schema allows on a markdown cell.
                    enhanced_notebook.cells.append(
                        nbformat.v4.new_markdown_cell(title)
                    )
                    if summary:
                        enhanced_notebook.cells.append(
                            nbformat.v4.new_markdown_cell(summary)
                        )
                # Only code cells carry outputs; drop them from the result.
                cell.outputs = []
            enhanced_notebook.cells.append(cell)

        return enhanced_notebook

    def is_valid(self, words: "spacy.tokens.Span"):
        """Return True when ``words`` holds both a noun-like token and a verb.

        ``words`` may be any iterable of objects exposing a ``pos_`` attribute
        (here: the tokens of one spaCy sentence).
        """
        has_noun = False
        has_verb = False
        for word in words:
            tag = word.pos_
            if tag in self._NOUN_LIKE_TAGS:
                has_noun = True
            elif tag == "VERB":
                has_verb = True
            if has_noun and has_verb:
                return True
        return False

    def _postprocess_summary(self, summary: str):
        """Split a raw summary into ``(title, rest)``.

        Sentences lacking a noun-like token or a verb are discarded and
        enumeration markers ("1.", "2.", ...) are removed.  The first remaining
        sentence becomes the title; the others are joined with spaces.  When no
        sentence survives, the cleaned raw summary is used as the title and the
        rest is empty, instead of raising ``IndexError``.
        """
        doc = self.nlp(summary)
        cleaned_sentences = []
        for sentence in doc.sents:
            if not self.is_valid(sentence):
                continue
            text = self._ENUMERATION_RE.sub("", sentence.text).strip()
            if text:
                cleaned_sentences.append(text)

        if not cleaned_sentences:
            return self._ENUMERATION_RE.sub("", summary).strip(), ""

        title, *rest = cleaned_sentences
        return title, " ".join(rest)
def process_notebook(file_path):
    """Enhance the notebook stored at ``file_path`` and return the new file's path.

    The notebook is read as nbformat version 4, passed through
    ``NotebookEnhancer.enhance_notebook`` and serialized again as version 4.

    Args:
        file_path: Path of the uploaded ``.ipynb`` file.

    Returns:
        Path of the written ``enhanced_notebook.ipynb``.  The file is placed in
        a freshly created temporary directory so that overlapping calls do not
        overwrite each other's result; the directory is not removed here.
    """
    import os
    import tempfile

    # NOTE(review): a new enhancer (and therefore every model) is loaded on
    # each call.
    enhancer = NotebookEnhancer()

    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    # Process the notebook
    enhanced_notebook = enhancer.enhance_notebook(nb)
    enhanced_notebook_str = nbformat.writes(enhanced_notebook, version=4)

    # Save to a per-call temp directory, keeping the original file name.
    output_dir = tempfile.mkdtemp(prefix="notebook_enhancer_")
    output_path = os.path.join(output_dir, "enhanced_notebook.ipynb")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(enhanced_notebook_str)

    return output_path
def build_gradio_interface():
    """Build the Gradio Blocks UI and return it; the caller launches it.

    The layout is a heading, a short description, and a row with two columns:
    the notebook upload plus the trigger button, and the resulting file.
    Clicking the button runs ``process_notebook`` on the uploaded file.
    """
    description = (
        "\n"
        "Upload a Jupyter notebook to enhance it with automatically generated titles and summaries for each code cell.\n"
        "This tool uses Hugging Face models to:\n"
        "1. Generate concise titles for code cells\n"
        "2. Create explanatory summaries of what the code does\n"
    )

    with gr.Blocks(title="Notebook Enhancer") as app:
        gr.Markdown("# Jupyter Notebook Enhancer")
        gr.Markdown(description)

        with gr.Row():
            with gr.Column():
                notebook_upload = gr.File(label="Upload Jupyter Notebook (.ipynb)")
                enhance_button = gr.Button("Enhance Notebook")
            with gr.Column():
                enhanced_file = gr.File(label="Enhanced Notebook")

        # Wire the button: uploaded file in, enhanced notebook file out.
        enhance_button.click(
            fn=process_notebook,
            inputs=notebook_upload,
            outputs=enhanced_file,
        )

    return app
# Entry point when the file is executed as a script: build the UI, then launch it.
if __name__ == "__main__":
    # Disabled manual check that calls process_notebook() directly, without the UI:
    # file_input = "my_notebook.json"
    # test = process_notebook(file_input)
    demo = build_gradio_interface()
    demo.launch()