import streamlit as st
import pandas as pd
import numpy as np

# Page header: title, author and a one-paragraph description of the app,
# each rendered as its own markdown element, in this order.
for header_line in (
    '# Semantic search and topic classification (v1)',
    ' - Author: hcontreras',
    ' - Description: We want to classify sentences into a predefined set of topics. We use semantic search with a pre-trained transformer and we embed the input sentences and find the score relative to each topic',
):
    st.markdown(header_line)

st.markdown('## A quick test')
st.markdown('As a test we can create an embedding for a sentence and explore the score for a given topic (Transportation, Health, Space) and the most likey topic. Have fun!')

from sentence_transformers import SentenceTransformer, util

# Single source of truth for the demo topics: the selectbox options, the
# embedded topic batch and the "most likely topic" lookup all index into it.
TOPICS = ('Space', 'Transportation', 'Health')


def _cache_resource(func):
    """Wrap *func* with whichever Streamlit resource cache is available.

    Newer Streamlit releases expose ``st.cache_resource``; older ones only
    have ``st.cache``, which needs ``allow_output_mutation=True`` for objects
    such as models that cannot be hashed after creation.
    """
    if hasattr(st, 'cache_resource'):
        return st.cache_resource(func)
    return st.cache(allow_output_mutation=True)(func)


@_cache_resource
def load_model(model_name):
    """Return the SentenceTransformer called *model_name*.

    Streamlit re-executes this script on every widget interaction; caching
    the loader avoids re-instantiating the model on each rerun.
    """
    return SentenceTransformer(model_name)


model = load_model('paraphrase-MiniLM-L6-v2')

input_sentence = st.text_input('Sentence', 'This is a test for a news article')
input_topic = st.selectbox('Topic', TOPICS)

# Encode the sentence once and all topics in a single batch.
embedding_sentence = model.encode(input_sentence)
embedding_topics = model.encode(list(TOPICS))

# cos_sim yields one row per topic and one column for the single sentence;
# flatten it to a plain list of floats aligned with TOPICS.
l_scores = util.cos_sim(embedding_topics, embedding_sentence)[:, 0].tolist()

selected_score = l_scores[TOPICS.index(input_topic)]
st.write('Score for topic', input_topic, ':', np.round(selected_score, 2))

st.write('Most likely topic:', TOPICS[int(np.argmax(l_scores))])

st.markdown('##  Adding bulk sentences and topics')
# The header names quoted here must match the columns the scoring step reads
# ("sentence" from the first file, "topic" from the second).
st.markdown('In this section we can upload a file with sentences (one column, header "sentence") and a file with topics (one column, header "topic") we can compute the most likely topic')

# Sentence list: parsed as CSV and previewed as soon as a file is provided.
# df1 is only defined once a file has been uploaded.
uploaded_file1 = st.file_uploader("Choose a file: sentence list")
if uploaded_file1 is not None:
    df1 = pd.read_csv(uploaded_file1)
    st.write(df1.head())

# Topic list: same handling; df2 is only defined once a file has been uploaded.
uploaded_file2 = st.file_uploader("Choose a file: topic list")
if uploaded_file2 is not None:
    df2 = pd.read_csv(uploaded_file2)
    st.write(df2.head())

if uploaded_file1 is not None and uploaded_file2 is not None:
    # Score every uploaded sentence against every uploaded topic and add one
    # score column per topic to df1.

    # Show a readable message instead of a KeyError traceback when an
    # uploaded file lacks the expected header.
    if 'sentence' not in df1.columns:
        st.error('The sentence file must have a "sentence" column.')
        st.stop()
    # Accept both the singular and the plural header for the topic file.
    topic_column = next(
        (name for name in ('topic', 'topics') if name in df2.columns), None
    )
    if topic_column is None:
        st.error('The topic file must have a "topic" (or "topics") column.')
        st.stop()

    # NOTE: the model is instantiated on every rerun while both files are
    # uploaded.
    embedder = SentenceTransformer('all-MiniLM-L6-v2')

    # Hand the encoder plain lists of strings rather than pandas Series, so
    # encoding does not depend on the frame's index and numeric-looking cells
    # are still treated as text.
    corpus = df1['sentence'].astype(str).tolist()
    topics = df2[topic_column].astype(str).tolist()

    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
    if topics:
        # Encode all topics in one batch; cos_sim then returns one row of
        # sentence scores per topic.
        topic_embeddings = embedder.encode(topics, convert_to_tensor=True)
        # Convert explicitly to a NumPy array on the CPU so the new columns
        # hold plain floats and the assignment also works when the model
        # runs on a GPU.
        score_matrix = util.cos_sim(topic_embeddings, corpus_embeddings).cpu().numpy()
        for topic, topic_scores in zip(topics, score_matrix):
            df1[topic] = topic_scores

    st.write(df1)

    @st.cache
    def convert_df_to_csv(df):
        """Serialize *df* to CSV text and return it encoded as UTF-8 bytes.

        Wrapped in ``st.cache`` so the conversion is not recomputed on every
        rerun of the script.
        """
        csv_text = df.to_csv()
        return csv_text.encode('utf-8')


    # Offer the scored table (df1 with its per-topic columns) as a CSV file.
    csv_payload = convert_df_to_csv(df1)
    st.download_button(
        label="Download data as CSV",
        data=csv_payload,
        file_name='output.csv',
        mime='text/csv',
    )