import streamlit as st
import pandas as pd
import numpy as np
st.markdown('# Semantic search and topic classification (v1)')
st.markdown(' - Author: hcontreras')
st.markdown(' - Description: We classify sentences into a predefined set of topics using semantic search: a pre-trained sentence transformer embeds the input sentence, which is then scored against each topic.')
st.markdown('## A quick test')
st.markdown('As a test, we can create an embedding for a sentence, explore its score for a given topic (Transportation, Health, Space), and find the most likely topic. Have fun!')
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
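# paraphrase-MiniLM-L6-v2 maps each sentence to a 384-dimensional dense vector;
# topics are scored by cosine similarity between sentence and topic embeddings.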
input_sentence = st.text_input('Sentence', 'This is a test for a news article')
input_topic = st.selectbox(
    'Topic',
    ('Space', 'Transportation', 'Health'))
# Encode the input sentence and the selected topic with the sentence transformer
embedding_sentence = model.encode(input_sentence)
embedding_input_topic = model.encode(input_topic)
cos_score = util.cos_sim(embedding_input_topic, embedding_sentence)[0][0]
st.write('Score for topic', input_topic, ':', np.round(cos_score.item(), 2))
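# util.cos_sim returns cosine similarities in [-1, 1]; higher means the sentence
# is semantically closer to the topic label.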
# Score the sentence against every topic and report the best match
l_scores = []
for topic in ['Space', 'Transportation', 'Health']:
    embedding_topic = model.encode(topic)
    cos_score = util.cos_sim(embedding_topic, embedding_sentence)[0][0]
    l_scores.append(cos_score.item())
st.write('Most likely topic:', ['Space', 'Transportation', 'Health'][np.argmax(l_scores)])
st.markdown('## Adding bulk sentences and topics')
st.markdown('In this section we can upload a file with sentences (one column, header "sentence") and a file with topics (one column, header "topic"), and compute the most likely topic for each sentence.')
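# Expected upload format (as read below):
#   sentence list -> CSV with a "sentence" column, one sentence per row
#   topic list    -> CSV with a "topic" column, one topic label per row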
uploaded_file1 = st.file_uploader("Choose a file: sentence list")
if uploaded_file1 is not None:
    # read csv
    df1 = pd.read_csv(uploaded_file1)
    st.write(df1.head())
uploaded_file2 = st.file_uploader("Choose a file: topic list")
if uploaded_file2 is not None:
    # read csv
    df2 = pd.read_csv(uploaded_file2)
    st.write(df2.head())
if uploaded_file1 is not None and uploaded_file2 is not None:
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    corpus = df1['sentence'].tolist()
    topics = df2['topic']
    # Embed all sentences once, then score them against each topic
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
    for topic in topics:
        topic_embedding = embedder.encode(topic, convert_to_tensor=True)
        cos_scores = util.cos_sim(topic_embedding, corpus_embeddings)[0]
        df1[str(topic)] = cos_scores.cpu().numpy()
    st.write(df1)
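    # df1 now holds one cosine-similarity column per topic; if desired, the
    # highest-scoring topic per sentence could be taken with
    # df1[list(topics)].idxmax(axis=1).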
    @st.cache
    def convert_df_to_csv(df):
        # IMPORTANT: Cache the conversion to prevent computation on every rerun
        return df.to_csv().encode('utf-8')

    st.download_button(
        label="Download data as CSV",
        data=convert_df_to_csv(df1),
        file_name='output.csv',
        mime='text/csv',
    )