|
import random |
|
from typing import AnyStr |
|
|
|
import itertools |
|
import streamlit as st |
|
import torch.nn.parameter |
|
from bs4 import BeautifulSoup |
|
import numpy as np |
|
import base64 |
|
|
|
import validators |
|
from spacy_streamlit.util import get_svg |
|
from validators import ValidationFailure |
|
|
|
from custom_renderer import render_sentence_custom |
|
from flair.data import Sentence |
|
from flair.models import SequenceTagger |
|
|
|
import spacy |
|
from spacy import displacy |
|
from spacy_streamlit import visualize_parser |
|
|
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification |
|
from transformers import pipeline |
|
import os |
|
from transformers_interpret import SequenceClassificationExplainer |
|
|
|
|
|
# Maps each Hugging Face model identifier to the URL of its model page.
model_names_to_URLs = {
    model_id: f'https://huggingface.co./{model_id}'
    for model_id in (
        'ml6team/distilbert-base-dutch-cased-toxic-comments',
        'ml6team/robbert-dutch-base-toxic-comments',
    )
}
|
|
|
# Markdown shown under the "About" entry of the Streamlit page menu.
about_page_markdown = (
    "# 🤬 Dutch Toxic Comment Detection Space\n"
    "\n"
    "Made by [ML6](https://ml6.eu/).\n"
    "\n"
    "Token attribution is performed using "
    "[transformers-interpret](https://github.com/cdpierse/transformers-interpret).\n"
)
|
|
|
# Emoji pools from which classify_comment() draws a random reaction.
regular_emojis = '😐 🙂 👶 😇'.split()
undecided_emojis = '🤨 🧐 🥸 🥴 🤷'.split()
potty_mouth_emojis = '🤐 👿 😡 🤬 ☠️ ☣️ ☢️'.split()
|
|
|
|
|
# Page-level Streamlit settings: tab title, layout and the entries of the
# page menu (the "About" entry shows about_page_markdown).
st.set_page_config(
    layout="centered",
    initial_sidebar_state="auto",
    page_title="Post-processing summarization fact checker",
    page_icon="",
    menu_items={
        'Get help': None,
        'Report a bug': None,
        'About': about_page_markdown,
    },
)
|
|
|
|
|
|
|
@st.cache(show_spinner=False,
          suppress_st_warning=True,
          allow_output_mutation=True)
def load_pipeline(model_name):
    """Build the classifier and its attribution explainer for ``model_name``.

    A ``text-classification`` pipeline is created with ``model_name`` used as
    both model and tokenizer, and a ``SequenceClassificationExplainer`` is
    built around that pipeline's model and tokenizer. A spinner is shown while
    both are being constructed.

    Returns:
        A ``(pipeline, explainer)`` tuple.
    """
    with st.spinner('Loading model (this might take a while)...'):
        classifier = pipeline('text-classification',
                              model=model_name,
                              tokenizer=model_name)
        explainer = SequenceClassificationExplainer(classifier.model,
                                                    classifier.tokenizer)
    return classifier, explainer
|
|
|
|
|
|
|
def format_explainer_html(html_string):
    """Extract tokens with attribution-based background color.

    The ``<mark>`` elements of the last ``<td>`` in ``html_string`` are
    regrouped into one ``<span>`` per word: a token whose text starts with
    ``##`` is appended to the current word, any other token starts a new one.
    Words are separated by a single space inside a new ``<p>`` tag.

    Args:
        html_string: HTML produced by the explainer visualisation.

    Returns:
        The ``<p>`` tag holding the regrouped, padded tokens. It is empty when
        there are no tokens between the first and last ``<mark>``.
    """
    inside_token_prefix = '##'
    soup = BeautifulSoup(html_string, 'html.parser')
    p = soup.new_tag('p',
                     attrs={'style': 'color: black; background-color: white;'})

    # The first and last <mark> are skipped; presumably these are the
    # model-specific start/end tokens -- TODO confirm against the explainer.
    current_word = None
    for token in soup.find_all('td')[-1].find_all('mark')[1:-1]:
        text = token.font.text.strip()
        continues_word = text.startswith(inside_token_prefix)
        if continues_word:
            text = text[len(inside_token_prefix):]
        # Open a new word for a regular token, and also when a continuation
        # token arrives before any word exists (previously an AttributeError).
        if not continues_word or current_word is None:
            if current_word is not None:
                p.append(current_word)
                p.append(' ')
            current_word = soup.new_tag('span')
        token.string = text
        token.attrs['style'] = f"{token.attrs['style']}; padding: 0.2em 0em;"
        current_word.append(token)

    # Flush the last word; with no tokens at all there is nothing to append
    # (appending None makes BeautifulSoup raise).
    if current_word is not None:
        p.append(current_word)

    # Pad the outer edges of every word. For a single-token word both entries
    # are the same element, so it receives left and right padding.
    for span in p.find_all('span'):
        marks = span.find_all('mark')
        marks[0].attrs['style'] = (
            f"{marks[0].attrs['style']}; padding-left: 0.2em;")
        marks[-1].attrs['style'] = (
            f"{marks[-1].attrs['style']}; padding-right: 0.2em;")

    return p
|
|
|
|
|
def list_all_article_names() -> list:
    """Return the sorted names of the ``.txt`` files in ``./sample-articles/``.

    Only the trailing ``.txt`` is removed from each file name; a ``.txt`` that
    occurs earlier in the name is kept.

    Returns:
        File names without the ``.txt`` suffix, in sorted order.

    Raises:
        OSError: If ``./sample-articles/`` cannot be listed.
    """
    suffix = '.txt'
    return [
        entry[:-len(suffix)]
        for entry in sorted(os.listdir('./sample-articles/'))
        if entry.endswith(suffix)
    ]
|
|
|
|
|
def fetch_article_contents(filename: str) -> str:
    """Read ``./sample-articles/<filename>.txt`` and return its text.

    The file name is lower-cased before it is looked up. The file is decoded
    as UTF-8 explicitly so the result does not depend on the platform's
    default encoding.

    Args:
        filename: Article name without the ``.txt`` extension.

    Returns:
        The complete file contents.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(f'./sample-articles/{filename.lower()}.txt', 'r', encoding='utf-8') as f:
        return f.read()
|
|
|
|
|
def fetch_summary_contents(filename: str) -> str:
    """Read ``./sample-summaries/<filename>.txt`` and return its text.

    The file name is lower-cased before it is looked up. The file is decoded
    as UTF-8 explicitly so the result does not depend on the platform's
    default encoding.

    Args:
        filename: Article name without the ``.txt`` extension.

    Returns:
        The complete file contents.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(f'./sample-summaries/{filename.lower()}.txt', 'r', encoding='utf-8') as f:
        return f.read()
|
|
|
|
|
def fetch_entity_specific_contents(filename: str) -> str:
    """Read ``./entity-specific-text/<filename>.txt`` and return its text.

    The file name is lower-cased before it is looked up. The file is decoded
    as UTF-8 explicitly so the result does not depend on the platform's
    default encoding.

    Args:
        filename: Article name without the ``.txt`` extension.

    Returns:
        The complete file contents.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(f'./entity-specific-text/{filename.lower()}.txt', 'r', encoding='utf-8') as f:
        return f.read()
|
|
|
|
|
def fetch_dependency_specific_contents(filename: str) -> str:
    """Read ``./dependency-specific-text/<filename>.txt`` and return its text.

    The file name is lower-cased before it is looked up. The file is decoded
    as UTF-8 explicitly so the result does not depend on the platform's
    default encoding.

    Args:
        filename: Article name without the ``.txt`` extension.

    Returns:
        The complete file contents.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(f'./dependency-specific-text/{filename.lower()}.txt', 'r', encoding='utf-8') as f:
        return f.read()
|
|
|
|
|
def classify_comment(comment, selected_model):
    """Classify the given comment and augment with additional information.

    The first pipeline result for ``comment`` is extended with the model name,
    the word attributions for the ``non-toxic`` class, the explainer's HTML
    visualisation, the reformatted tokens, the comment text and a randomly
    chosen emoji. The finished result is appended to
    ``st.session_state.results``; nothing is returned.
    """
    classifier, explainer = load_pipeline(selected_model)
    result = classifier(comment)[0]
    result['model_name'] = selected_model

    # Attribution must be computed before the visualisation is requested.
    result['word_attribution'] = explainer(comment, class_name="non-toxic")
    explanation_html = explainer.visualize()._repr_html_()
    result['visualitsation_html'] = explanation_html
    result['tokens_with_background'] = format_explainer_html(explanation_html)

    # Pick the emoji pool from the predicted label; scores of 0.1 or lower
    # fall through to the "undecided" pool.
    label, score = result['label'], result['score']
    if label == 'toxic' and score > 0.1:
        emoji_pool = potty_mouth_emojis
    elif label in ('non_toxic', 'non-toxic') and score > 0.1:
        emoji_pool = regular_emojis
    else:
        emoji_pool = undecided_emojis
    result['text'] = comment
    result['emoji'] = random.choice(emoji_pool)

    st.session_state.results.append(result)
|
|
|
|
|
def display_summary(article_name: str):
    """Return the stored summary of ``article_name`` wrapped in a bordered box.

    The raw summary text is also saved in ``st.session_state.summary_output``.

    Args:
        article_name: Name passed on to ``fetch_summary_contents``.

    Returns:
        An HTML string: a styled ``<div>`` around the parsed summary.
    """
    summary_text = fetch_summary_contents(article_name)
    st.session_state.summary_output = summary_text
    parsed_summary = BeautifulSoup(summary_text, features="html.parser")
    return (
        '<div style="overflow-x: auto; border: 1px solid #e6e9ef; '
        'border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">'
        f'{parsed_summary}</div>'
    )
|
|
|
|
|
|
|
@st.cache(allow_output_mutation=True)
def get_spacy():
    """Return the spaCy ``en_core_web_lg`` pipeline, loaded once and cached.

    The function is called several times while the page is built (once per
    entity extraction); caching it the same way as ``get_flair_tagger`` avoids
    re-loading the model on each call. ``allow_output_mutation=True`` keeps
    Streamlit from hashing the returned pipeline object.

    Returns:
        The loaded spaCy ``Language`` object (shared between callers).
    """
    return spacy.load('en_core_web_lg')
|
|
|
|
|
|
|
@st.cache(allow_output_mutation=True,
          hash_funcs={torch.nn.parameter.Parameter: lambda _: None})
def get_flair_tagger():
    """Return the flair ``ner-english-ontonotes-fast`` sequence tagger.

    The result is cached by ``st.cache``; torch parameters are given a
    constant hash (``None``) through ``hash_funcs``.
    """
    return SequenceTagger.load("flair/ner-english-ontonotes-fast")
|
|
|
|
|
def get_all_entities_per_sentence(text):
    """Collect the named entities of every sentence in ``text``.

    Sentences come from the spaCy pipeline. For each sentence the spaCy
    entities are listed first, followed by the ``ner`` spans predicted by the
    flair tagger on the same sentence text. Entities found by both models
    therefore appear twice.

    Args:
        text: The text to analyse.

    Returns:
        A list with one list of entity strings per sentence.
    """
    nlp = get_spacy()
    tagger = get_flair_tagger()
    doc = nlp(text)

    per_sentence = []
    for sent in doc.sents:
        # spaCy entities
        found = [str(ent) for ent in sent.ents]

        # flair entities
        flair_sentence = Sentence(str(sent))
        tagger.predict(flair_sentence)
        found.extend(span.text for span in flair_sentence.get_spans('ner'))

        per_sentence.append(found)

    return per_sentence
|
|
|
|
|
def get_all_entities(text):
    """Return all entities of ``text`` as one flat list, in sentence order."""
    return [
        entity
        for sentence_entities in get_all_entities_per_sentence(text)
        for entity in sentence_entities
    ]
|
|
|
|
|
|
|
def get_and_compare_entities(article_name: str):
    """Split the summary's entities into those found in the article and not.

    A summary entity counts as matched when, ignoring case, it is a substring
    of at least one article entity.

    Args:
        article_name: Name used to fetch both the article and its summary.

    Returns:
        A ``(matched_entities, unmatched_entities)`` tuple of lists; summary
        entities keep their original order and duplicates.
    """
    article_text = fetch_article_contents(article_name)
    article_entities_lower = [
        entity.lower()
        for sentence_entities in get_all_entities_per_sentence(article_text)
        for entity in sentence_entities
    ]

    summary_text = fetch_summary_contents(article_name)
    summary_entities = [
        entity
        for sentence_entities in get_all_entities_per_sentence(summary_text)
        for entity in sentence_entities
    ]

    matched, unmatched = [], []
    for candidate in summary_entities:
        needle = candidate.lower()
        found = any(needle in article_entity
                    for article_entity in article_entities_lower)
        (matched if found else unmatched).append(candidate)
    return matched, unmatched
|
|
|
|
|
def highlight_entities(article_name: str):
    """Return the summary as HTML with its entities colour-marked.

    Entities that also occur in the article are wrapped in a green ``<mark>``,
    all other entities in a red one.

    All entities are substituted in a single pass over the original summary
    text. Replacing them one after another (as before) nested ``<mark>`` tags
    whenever an entity was listed twice or was contained in another entity,
    and could even rewrite markup inserted by an earlier replacement.

    Args:
        article_name: Name used to fetch the summary and to compute entities.

    Returns:
        An HTML string: a styled ``<div>`` around the highlighted summary.
    """
    # Local import: ``re`` is not among the imports visible at the top of the file.
    import re

    summary_content = fetch_summary_contents(article_name)

    markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
    markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
    markdown_end = "</mark>"

    matched_entities, unmatched_entities = get_and_compare_entities(article_name)
    matched_lookup = set(matched_entities)

    # Unique, non-empty entities; longest first so that e.g. "New York" is
    # preferred over "York" at the same position.
    candidates = sorted(
        {entity for entity in (*matched_entities, *unmatched_entities) if entity},
        key=lambda entity: (-len(entity), entity),
    )

    if candidates:
        pattern = re.compile("|".join(re.escape(entity) for entity in candidates))

        def wrap(match):
            entity = match.group(0)
            start = markdown_start_green if entity in matched_lookup else markdown_start_red
            return start + entity + markdown_end

        summary_content = pattern.sub(wrap, summary_content)

    soup = BeautifulSoup(summary_content, features="html.parser")

    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
    margin-bottom: 2.5rem">{}</div> """

    return HTML_WRAPPER.format(soup)
|
|
|
|
|
def render_dependency_parsing(text: dict):
    """Render one dependency drawing and write it to the page as SVG.

    Args:
        text: The item handed to ``render_sentence_custom``. Despite the
            parameter name, the only caller in this file passes one of the
            dicts built by ``check_dependency`` (keys ``dep``,
            ``cur_word_index``, ``target_word_index``, ``identifier``,
            ``sentence``), so it is annotated as a dict rather than ``str``.
    """
    html = render_sentence_custom(text)
    # Collapse blank lines in the generated markup before embedding it.
    html = html.replace("\n\n", "\n")
    st.write(get_svg(html), unsafe_allow_html=True)
|
|
|
|
|
|
|
def check_dependency(article: bool):
    """List the ``amod``/``pobj`` dependencies that involve an entity.

    The text is taken from ``st.session_state.article_text`` when ``article``
    is true, otherwise from ``st.session_state.summary_output``. Within each
    sentence, a token is reported when

    * its dependency label is ``amod``, or ``pobj`` with a head whose text is
      "in" (case-insensitive), and
    * the token text or its head text is one of that sentence's entities.

    Args:
        article: Select the article text (``True``) or the summary (``False``).

    Returns:
        A list of dicts with the keys ``dep``, ``cur_word_index`` and
        ``target_word_index`` (token positions relative to the sentence
        start), ``identifier`` (token text + label + head text) and
        ``sentence``.
    """
    # Same pipeline as get_all_entities_per_sentence(), so sentence ``i`` here
    # lines up with ``all_entities[i]``.
    nlp = get_spacy()
    text = st.session_state.article_text if article else st.session_state.summary_output
    all_entities = get_all_entities_per_sentence(text)

    doc = nlp(text)
    tok_l = doc.to_json()['tokens']

    test_list_dict_output = []
    for i, sentence in enumerate(doc.sents):
        sentence_entities = all_entities[i]
        # ``sentence.end`` is exclusive. The previous ``id > end`` test also
        # let the first token of the next sentence through, reporting it
        # against the wrong sentence. Token ids equal list positions (the head
        # lookup below relies on that too), so a slice selects the sentence.
        for t in tok_l[sentence.start:sentence.end]:
            dep = t['dep']
            if dep not in ('amod', 'pobj'):
                continue

            head = tok_l[t['head']]
            object_here = text[t['start']:t['end']]
            object_target = text[head['start']:head['end']]
            if dep == 'pobj' and object_target.lower() != 'in':
                continue

            # At least one side of the dependency has to be an entity.
            if object_here not in sentence_entities and object_target not in sentence_entities:
                continue

            test_list_dict_output.append({
                "dep": dep,
                "cur_word_index": t['id'] - sentence.start,
                "target_word_index": t['head'] - sentence.start,
                "identifier": object_here + dep + object_target,
                "sentence": str(sentence),
            })

    return test_list_dict_output
|
|
|
|
|
|
|
def is_valid_url(url: str) -> bool:
    """Return ``True`` unless ``validators.url`` reports a ``ValidationFailure``."""
    return not isinstance(validators.url(url), ValidationFailure)
|
|
|
|
|
|
|
# Create the results list in the session state the first time it is needed.
if 'results' not in st.session_state:
    st.session_state.results = []

# Page title
st.title('Summarization fact checker')

# INTRODUCTION
st.header("Introduction")
st.markdown("""Recent work using transformers on large text corpora has shown great success when fine-tuned on several
different downstream NLP tasks. One such task is that of text summarization. The goal of text summarization is to
generate concise and accurate summaries from input document(s). There are 2 types of summarization: extractive and
abstractive. **Extractive summarization** merely copies informative fragments from the input, whereas **abstractive
summarization** may generate novel words. A good abstractive summary should cover principal information in the input
and has to be linguistically fluent. This blogpost will focus on this more difficult task of abstractive summary
generation.""")

# No space is allowed between "]" and "(" or the markdown link is not rendered.
st.markdown("""To generate summaries we will use the [PEGASUS](https://huggingface.co./google/pegasus-cnn_dailymail)
model, producing abstractive summaries from large articles. These summaries often still contain sentences with
different kinds of errors. Rather than improving the core model, we will look at possible post-processing steps to
improve the generated summaries by detecting such possible errors. By comparing contents of the summary with the
source text, we can create some sort of factualness metric, indicating the trustworthiness of the generated
summary.""")
|
|
|
|
|
# GENERATING SUMMARIES: article selection
st.header("Generating summaries")
st.markdown(
    "Let’s start by selecting an article text for which we want to generate a summary, or you can provide "
    "text yourself. Note that it’s suggested to provide a sufficiently large text, as otherwise the summary "
    "generated might not be optimal to start from."
)

# The selected sample article is stored in the session state and shown in an
# editable text area.
selected_article = st.selectbox(
    'Select an article or provide your own:',
    list_all_article_names(),
)
st.session_state.article_text = fetch_article_contents(selected_article)
article_text = st.text_area(
    height=150,
    label='Full article text',
    value=st.session_state.article_text,
)

st.markdown(
    "Below you can find the generated summary for the article. The summaries of the example articles "
    "vary in quality, but are chosen as such. Based on some common errors, we will discuss possible "
    "methods to improve or rank the summaries in the following paragraphs. The idea is that in "
    "production, you could generate a set of summaries for the same article, with different "
    "parameters (or even different models). By using post-processing methods and metrics, "
    "we can detect some errors in summaries, and choose the best one to actually use."
)
|
# Show the summary of the selected article, or an error when there is no
# article text to work with.
if st.session_state.article_text:
    with st.spinner('Generating summary...'):
        summary_displayed = display_summary(selected_article)

        st.write("**Generated summary:**", summary_displayed, unsafe_allow_html=True)
else:
    # st.error() takes no ``help`` keyword; passing one raised a TypeError as
    # soon as this branch ran. The message now refers to the article text
    # instead of a "comment".
    st.error('**Error**: No article text to summarize. Please provide an article text.')

# TODO(review): debug output only -- the result of the URL check is printed to
# stdout and not used anywhere else in this section.
if is_valid_url(article_text):
    print("YES")
else:
    print("NO")
|
def render_svg(svg_file):
    """Return an ``<img>`` tag embedding ``svg_file`` as a base64 data URI.

    The file is read as UTF-8 explicitly (it is re-encoded as UTF-8 for the
    data URI), so the result does not depend on the platform's default
    encoding.

    Args:
        svg_file: Path of the SVG file to embed.

    Returns:
        The HTML ``<img>`` element as a string.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(svg_file, "r", encoding="utf-8") as f:
        svg = f.read()

    b64 = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
    return r'<img src="data:image/svg+xml;base64,%s"/>' % b64
|
|
|
|
|
|
|
# ENTITY MATCHING
st.header("Entity matching")
st.markdown(
    "**Named entity recognition** (NER) is the task of identifying and categorising key information ("
    "entities) in text. An entity can be a singular word or a series of words that consistently refers to the "
    "same thing. Common entity classes are person names, organisations, locations and so on. By applying NER "
    "to both the article and its summary, we can spot possible **hallucinations**. Hallucinations are words "
    "generated by the model that are not supported by the source input. "
)

# HTML snippets for the colour legend, plus the opening <mark> tags in the
# same two colours.
red_text = """<font color="black"><span style="background-color: rgb(238, 135, 135); opacity:
1;">red</span></font> """
green_text = """<font color="black">
<span style="background-color: rgb(121, 236, 121); opacity: 1;">green</span>
</font>"""
markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"

# Highlighted summary for the selected article.
with st.spinner("Calculating and matching entities..."):
    entity_match_html = highlight_entities(selected_article)
    st.write(entity_match_html, unsafe_allow_html=True)

st.markdown(
    "Here you can see what this looks like when we apply entity-matching on the summary (compared to the "
    f"original article). Entities in this summary are marked {green_text} when the entity also "
    f"exists in the article, while unmatched entities are marked {red_text}.",
    unsafe_allow_html=True,
)

# Article-specific commentary on the entity matches.
entity_specific_text = fetch_entity_specific_contents(selected_article)
st.markdown(entity_specific_text)
|
|
|
|
|
# DEPENDENCY COMPARISON: explanation and static example graph
st.header("Dependency comparison")
st.markdown("**Dependency parsing** is the process in which the grammatical structure in a sentence is analysed, "
            "to find out related words as well as the type of the relationship between them. For the sentence “Jan’s "
            "wife is called Sarah” you would get the following dependency graph:")

# Example dependency graph, embedded from a local SVG file.
st.write(render_svg('ExampleParsing.svg'), unsafe_allow_html=True)
st.markdown("Here, “Jan” is the “poss” (possession modifier) of “wife”. If suddenly the summary would read “Jan’s "
            "husband…”, there would be a dependency in the summary that is non-existent in the article itself. "
            "However, it could be that such a new dependency is not per se correct, “The borders of Ukraine” have a "
            "different dependency between “borders” and “Ukraine” than “Ukraine’s borders”, while this would also be "
            "correct. So general matching between summary and article won’t work.")
st.markdown("There is however a simple method that we found has potential in post-processing. Based on empirical "
            "results, we have found that when there are specific kinds of dependencies in the summary that are not in "
            "the article, these specific types are often an indication of a wrongly constructed sentence. Let’s take "
            "a look at an example:")
|
# Compare the entity-related dependencies of the summary with those of the
# article and draw every summary dependency that has no counterpart.
with st.spinner("Doing dependency parsing..."):
    summary_deps = check_dependency(False)
    article_deps = check_dependency(True)

    # A summary dependency is unmatched when its identifier is not contained
    # in the identifier of any article dependency.
    total_unmatched_deps = [
        summ_dep
        for summ_dep in summary_deps
        if all(summ_dep['identifier'] not in art_dep['identifier']
               for art_dep in article_deps)
    ]

    for current_drawing_list in total_unmatched_deps:
        render_dependency_parsing(current_drawing_list)

# Article-specific commentary on the dependency results.
dep_spec_text = fetch_dependency_specific_contents(selected_article)
st.markdown(dep_spec_text)

# Placeholder text rendered inside a bordered box.
soup = BeautifulSoup("Example text option with box", features="html.parser")
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
margin-bottom: 2.5rem">{}</div> """
st.write(HTML_WRAPPER.format(soup), unsafe_allow_html=True)
|
|
|
|
|
# OUTRO
st.header("Wrapping up")
# TODO(review): the closing sentences of this paragraph still contain draft
# placeholders ("(???)", "(something about ...)") that are shown to the user.
st.markdown(
    "We have presented 2 methods that try to improve summaries via post-processing steps. Entity matching can "
    "be used to solve hallucinations, while checking if specific dependencies are matched between summary and "
    "article can be used to filter out some bad sentences (and thus worse summaries). Of course these are "
    "only basic methods which were empirically tested, but they are a start at actually making something good "
    "(???). (something about that we tested also RE and maybe other things)."
)
# A markdown heading without text, written between the two paragraphs.
st.markdown("####")
st.markdown(
    "Now based on these methods you can check summaries and whether they are “good” or “bad”. Below you can "
    "generate 5 different kind of summaries for the starting article (based on different model params) in "
    "which their ranks are estimated, and hopefully the best summary (read: the one that a human would prefer "
    "or indicate as the best one) will be at the top."
)
|
|