File size: 7,786 Bytes
01b8e8e
f99d6db
 
b8acde7
 
f99d6db
 
b8acde7
 
01b8e8e
39503cb
01b8e8e
f456ef3
acb72cc
 
 
 
 
f99d6db
 
 
 
 
acb72cc
 
 
 
5692cb3
 
 
 
 
 
 
 
 
 
 
 
 
 
acb72cc
5692cb3
 
 
acb72cc
 
 
 
5692cb3
 
acb72cc
 
 
 
5692cb3
acb72cc
 
 
 
 
 
 
 
 
f99d6db
 
 
 
acb72cc
 
 
 
 
 
 
 
 
710a34d
 
 
 
f99d6db
 
 
 
710a34d
 
 
 
 
 
 
 
39503cb
01b8e8e
5634055
01b8e8e
843bc9e
 
 
 
5634055
 
 
dd7488f
39503cb
 
01b8e8e
 
 
 
 
 
5692cb3
6a6afbf
843bc9e
42468fb
101be32
42468fb
5692cb3
 
 
 
 
 
01b8e8e
 
39503cb
843bc9e
01b8e8e
 
 
 
 
 
 
6a6afbf
01b8e8e
 
 
 
6a6afbf
843bc9e
1b47089
 
843bc9e
1b47089
 
 
 
 
 
 
6a6afbf
1b47089
 
 
 
dbcf2e8
46323da
 
6a6afbf
dbcf2e8
6a6afbf
843bc9e
4107940
 
843bc9e
4107940
 
 
 
 
dbcf2e8
710a34d
dbcf2e8
f99d6db
4107940
f99d6db
6a6afbf
4107940
 
 
 
 
 
dbcf2e8
46323da
 
6a6afbf
dbcf2e8
6a6afbf
843bc9e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import streamlit as st

from interface.draw_pipelines import get_pipeline_graph
from interface.utils import (
    extract_text_from_file,
    extract_text_from_url,
    get_pipelines,
    reset_vars_data,
)


def _build_pipeline_state(name, pipeline_func, parameters):
    """Instantiate a pipeline and package it for ``st.session_state``.

    Calls ``pipeline_func(**parameters)`` and unpacks the result into a
    ``(search_pipeline, index_pipeline)`` pair.

    Returns:
        The dict stored under ``st.session_state["pipeline"]``, with the keys
        ``"name"``, ``"search_pipeline"``, ``"index_pipeline"`` and ``"doc"``
        (the factory function's docstring, possibly ``None``).
    """
    search_pipeline, index_pipeline = pipeline_func(**parameters)
    return {
        "name": name,
        "search_pipeline": search_pipeline,
        "index_pipeline": index_pipeline,
        "doc": pipeline_func.__doc__,
    }


def component_select_pipeline(container):
    """Draw the pipeline selector and its parameter widgets.

    Renders a select box with the pipeline names from ``get_pipelines()``
    (defaulting to "Keyword Search" when available), then one widget per
    parameter of the selected pipeline. The pipeline is (re)built and stored
    in ``st.session_state["pipeline"]`` when nothing is loaded yet, when the
    selection or any parameter value changed, or on every run while
    "Keyword Search" is selected.

    Args:
        container: Streamlit container (used as a context manager) in which
            the widgets are drawn.
    """
    pipeline_names, pipeline_funcs, pipeline_func_parameters = get_pipelines()
    with st.spinner("Loading Pipeline..."):
        with container:
            default_index = (
                pipeline_names.index("Keyword Search")
                if "Keyword Search" in pipeline_names
                else 0
            )
            selected_pipeline = st.selectbox(
                "Select pipeline",
                pipeline_names,
                index=default_index,
            )
            index_pipe = pipeline_names.index(selected_pipeline)
            parameters = pipeline_func_parameters[index_pipe]
            st.write("---")
            st.header("Pipeline Parameters")

            # Process audio_output first to ensure top_k is set correctly:
            # its checkbox is drawn before every other parameter widget and,
            # when ticked, top_k is forced to 3.
            audio_output_value = False
            if "audio_output" in parameters:
                audio_output_value = st.checkbox(
                    "audio_output", parameters["audio_output"]
                )
                parameters["audio_output"] = audio_output_value
                if audio_output_value:
                    parameters["top_k"] = 3

            # Then process all other parameters, choosing the widget from the
            # type of the current value.
            for parameter, value in parameters.items():
                if parameter == "audio_output":
                    continue
                if isinstance(value, str):
                    value = st.text_input(parameter, value)
                # bool must be tested before int: bool is a subclass of int.
                elif isinstance(value, bool):
                    value = st.checkbox(parameter, value)
                elif isinstance(value, int):
                    if parameter == "top_k" and audio_output_value:
                        value = 3
                    value = int(st.number_input(parameter, value=value))
                elif isinstance(value, float):
                    value = float(st.number_input(parameter, value=value))
                # Values of any other type are written back unchanged.
                parameters[parameter] = value

            current = st.session_state["pipeline"]
            # NOTE(review): the value comparison below can only detect edits
            # if get_pipelines() hands out fresh parameter dicts on each
            # rerun rather than the objects already stored in session state
            # -- confirm.
            changed = (
                current is None
                or current["name"] != selected_pipeline
                or list(
                    st.session_state["pipeline_func_parameters"][index_pipe].values()
                )
                != list(parameters.values())
            )
            # TODO: Use elasticsearch and remove this workaround for TFIDF
            # Reload on every run if Keyword Search is selected
            if changed or current["name"] == "Keyword Search":
                st.session_state["pipeline_func_parameters"] = pipeline_func_parameters
                st.session_state["pipeline"] = _build_pipeline_state(
                    selected_pipeline, pipeline_funcs[index_pipe], parameters
                )
                # Only a real change of pipeline/parameters resets the data;
                # the unconditional Keyword Search reload keeps it.
                if changed:
                    reset_vars_data()


def component_show_pipeline(pipeline, pipeline_name):
    """Render an expander holding the pipeline's docstring and its graph.

    ``pipeline`` is the session-state pipeline dict; ``pipeline_name`` is the
    key of the pipeline object inside it that should be drawn. A warning sign
    is appended to the expander label when the docstring mentions "BUG".
    """
    doc = pipeline["doc"]
    has_doc = doc is not None

    label = "Show pipeline"
    if has_doc and "BUG" in doc:
        label = label + "  ⚠️"

    with st.expander(label):
        if has_doc:
            st.markdown(doc)
        st.plotly_chart(
            get_pipeline_graph(pipeline[pipeline_name]),
            use_container_width=True,
        )


def component_show_search_result(container, results):
    """Write every search hit into ``container``.

    For each result this shows a numbered heading, the text, the document id
    and the raw result as JSON, followed by the chunk id, score and audio
    player when the corresponding fields are present.
    """
    with container:
        for rank, hit in enumerate(results, start=1):
            st.markdown(f"### Match {rank}")
            st.markdown(f"**Text**: {hit['text']}")
            st.markdown(f"**Document**: {hit['id']}")
            st.json(hit)

            meta = hit["meta"]
            if "_split_id" in meta:
                st.markdown(f"**Document Chunk**: {meta['_split_id']}")

            if "score" in hit:
                st.markdown(f"**Score**: {hit['score']:.3f}")

            if "content_audio" in hit:
                # Best effort: a missing or unreadable audio file is reported
                # in the UI instead of aborting the result list.
                try:
                    with open(hit["content_audio"], "rb") as audio_file:
                        st.audio(audio_file.read(), format="audio/wav")
                except Exception as e:
                    st.error(f"Error loading audio: {e!s}")

            st.markdown("---")


def component_text_input(container, doc_id):
    """Draw text boxes for typing documents in directly.

    A new box is drawn after each non-empty one, so the first empty box ends
    the sequence. Returns ``(corpus, doc_id)``: the entered documents as
    ``{"text", "id"}`` dicts and the next unused document id.
    """
    with container:
        corpus = []
        with st.expander("Enter documents"):
            while True:
                entered = st.text_input(f"Document {doc_id}", key=doc_id)
                if entered == "":
                    break
                corpus.append({"text": entered, "id": doc_id})
                doc_id += 1
                st.markdown("---")
        return corpus, doc_id


def component_article_url(container, doc_id):
    """Draw URL boxes and previews of the text extracted from each URL.

    A new box is drawn after each non-empty one, so the first empty box ends
    the sequence. Returns ``(corpus, doc_id)``: the extracted documents as
    ``{"text", "id"}`` dicts and the next unused document id.
    """
    with container:
        corpus = []
        with st.expander("Enter URLs"):
            while True:
                address = st.text_input(f"URL {doc_id}", key=doc_id)
                if address == "":
                    break
                corpus.append({"text": extract_text_from_url(address), "id": doc_id})
                doc_id += 1
                st.markdown("---")

        # One preview expander per URL, numbered from 0 in entry order.
        for position, entry in enumerate(corpus):
            with st.expander(f"Preview URL {position}"):
                st.write(entry["text"])

        return corpus, doc_id


def component_file_input(container, doc_id):
    """Draw file uploaders and previews of the text extracted from each file.

    Uploaders keep being added until one is left empty or a file yields no
    text (``extract_text_from_file`` returns ``None``). Returns
    ``(corpus, doc_id)``: the extracted documents as ``{"text", "id"}`` dicts
    and the next unused document id.
    """
    with container:
        corpus = []
        with st.expander("Enter Files"):
            while True:
                uploaded = st.file_uploader(
                    "Upload a .txt, .pdf, .csv, image file, audio file", key=doc_id
                )
                if uploaded is None:
                    break
                content = extract_text_from_file(uploaded)
                if content is None:
                    break
                corpus.append({"text": content, "id": doc_id})
                doc_id += 1
                st.markdown("---")

        # One preview expander per file, numbered from 0 in upload order.
        for position, entry in enumerate(corpus):
            with st.expander(f"Preview File {position}"):
                st.write(entry["text"])

        return corpus, doc_id