Spaces:

hacpdsae2023
/

test

Sleeping

File size: 3,438 Bytes

d727a16
4b1c6eb
ad3e5f6
d727a16
094d202
 
 
 
376e087
c9f5c49
376e087
c319aca
2430162
 
376e087
 
 
 
00e69cc
 
2430162
 
 
 
 
4b945ef
376e087
4b945ef
2430162
376e087
581cd9f
00e69cc
376e087
4b1c6eb
366746e
4b945ef
4d072c1
4b1c6eb
ad3e5f6
 
 
 
 
 
 
 
4d072c1
 
3dfdc35
 
5b6d5fb
d952d3e
ba3e19e
3dfdc35
 
 
 
ba3e19e
 
3dfdc35
e7c7d84
 
 
 
 
 
a3ce6a9
e7c7d84
 
 
 
3152fdc
e7c7d84
 
 
 
90a444c
 
 
 
 
 
94f8c57
90a444c
 
 
 
94f8c57
 
e7c7d84

import streamlit as st
import pandas as pd
import numpy as np

# --- Page header and single-sentence demo -----------------------------------
# Streamlit re-executes this script from the top on every widget interaction,
# so everything below runs again each time the sentence or topic changes.
st.markdown('# Semantic search and topic classification (v1)')
st.markdown(' - Author: hcontreras')
st.markdown(' - Description: We want to classify sentences into a predefined set of topics. We use semantic search with a pre-trained transformer and we embed the input sentences and find the score relative to each topic')

st.markdown('## A quick test')
st.markdown('As a test we can create an embedding for a sentence and explore the score for a given topic (Transportation, Health, Space) and the most likely topic. Have fun!')

from sentence_transformers import SentenceTransformer, util

# Candidate topics, declared once so the select box, the topic embeddings and
# the arg-max lookup below always refer to the same ordered collection.
TOPICS = ('Space', 'Transportation', 'Health')

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

input_sentence = st.text_input('Sentence', 'This is a test for a news article')
input_topic = st.selectbox('Topic', TOPICS)

# Encode the user's sentence once and all topics in a single batch, instead of
# re-encoding each topic separately for every score that is needed.
embedding_sentence = model.encode(input_sentence)
embedding_topics = model.encode(list(TOPICS))

# cos_sim yields one row per topic and one column for the single sentence;
# taking column 0 gives one similarity score per topic, in TOPICS order.
l_scores = util.cos_sim(embedding_topics, embedding_sentence)[:, 0].tolist()

# Score of the topic picked in the select box.
cos_scores = l_scores[TOPICS.index(input_topic)]
st.write('Score for topic', input_topic, ':', np.round(cos_scores, 2))

# Topic whose embedding is closest to the sentence embedding.
st.write('Most likely topic:', TOPICS[int(np.argmax(l_scores))])

# --- Bulk scoring: upload sentences and topics, download the score table ----
st.markdown('##  Adding bulk sentences and topics')
st.markdown('In this section we can upload a file with sentences (one column, header "sentence") and a file with topics (one column, header "topics") we can compute the most likely topic')

uploaded_file1 = st.file_uploader("Choose a file: sentence list")
if uploaded_file1 is not None:
    # Parse the uploaded sentence CSV and show a preview.
    df1 = pd.read_csv(uploaded_file1)
    st.write(df1.head())

uploaded_file2 = st.file_uploader("Choose a file: topic list")
if uploaded_file2 is not None:
    # Parse the uploaded topic CSV and show a preview.
    df2 = pd.read_csv(uploaded_file2)
    st.write(df2.head())

if uploaded_file1 is not None and uploaded_file2 is not None:
    from sentence_transformers import SentenceTransformer, util

    # The on-page instructions ask for a "topics" header while earlier versions
    # of this script read "topic"; accept either so both kinds of file work.
    topic_column = next(
        (name for name in ('topic', 'topics') if name in df2.columns),
        None,
    )

    if 'sentence' not in df1.columns:
        st.error('The sentence file must contain a column named "sentence".')
    elif topic_column is None:
        st.error('The topic file must contain a column named "topic" or "topics".')
    elif df1.empty or df2[topic_column].dropna().empty:
        st.warning('Both files need at least one row before scores can be computed.')
    else:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')

        # Hand the encoder plain lists of strings: blank sentence cells become
        # empty strings and blank topic cells are skipped.
        corpus = df1['sentence'].fillna('').astype(str).tolist()
        topics = df2[topic_column].dropna().astype(str).tolist()

        # Encode sentences and topics in one batch each.
        corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
        topic_embeddings = embedder.encode(topics, convert_to_tensor=True)

        # One row per topic, one column per sentence. Convert to nested Python
        # lists so the DataFrame columns hold plain floats rather than tensor
        # objects, independent of the device the tensors live on.
        score_matrix = util.cos_sim(topic_embeddings, corpus_embeddings).tolist()
        for topic, topic_scores in zip(topics, score_matrix):
            df1[topic] = topic_scores

        st.write(df1)

        # Prefer st.cache_data where the installed Streamlit provides it and
        # fall back to the legacy st.cache decorator otherwise.
        cache_csv = getattr(st, 'cache_data', None) or st.cache

        @cache_csv
        def convert_df_to_csv(df):
            """Return *df* serialised as UTF-8 encoded CSV bytes.

            The result is cached so the conversion is not recomputed on every
            script rerun for an unchanged DataFrame.
            """
            return df.to_csv().encode('utf-8')

        st.download_button(
            label="Download data as CSV",
            data=convert_df_to_csv(df1),
            file_name='output.csv',
            mime='text/csv',
        )