File size: 3,948 Bytes
cb35e87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655c971
cb35e87
 
 
 
 
 
 
 
 
 
 
 
 
655c971
 
 
cb35e87
655c971
cb35e87
655c971
 
 
 
 
 
cb35e87
655c971
cb35e87
655c971
 
 
cb35e87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655c971
 
cb35e87
 
 
 
 
 
 
01b1b14
cb35e87
 
 
 
 
 
 
01b1b14
 
cb35e87
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import html
import http.client as http_client
import json
import logging
import os
import re
import string

import gradio as gr
import requests


def process_results(results, highlight_terms):
    """Render search hits as an HTML fragment.

    Each hit is shown as an optional source link, its document ID, its
    language and its text. Every whitespace-separated token of the text that
    appears in ``highlight_terms`` is wrapped in ``<b>`` tags.

    All values taken from the hits are HTML-escaped before being inserted
    into the markup, so text containing ``<``, ``>``, ``&`` or quotes is
    displayed literally instead of being interpreted as markup.

    Args:
        results: Sequence of mappings with the keys ``"text"``, ``"docid"``
            and ``"lang"``. An optional ``"meta"`` mapping may carry a
            ``"url"`` entry, which is rendered as a link.
        highlight_terms: Iterable of hashable tokens to render in bold.

    Returns:
        The HTML for all hits followed by ``<hr>``, or a "No results
        retrieved." notice when ``results`` is empty.
    """
    if not results:
        return """<br><p style='font-family: Arial; color:Silver; text-align: center;'>
                No results retrieved.</p><br><hr>"""

    # Build the lookup set once so each token test is O(1).
    terms = set(highlight_terms)

    rendered = []
    for result in results:
        # Match on the raw token, but always emit the escaped form.
        tokens_html = " ".join(
            "<b>{}</b>".format(html.escape(token)) if token in terms else html.escape(token)
            for token in result["text"].split()
        )

        meta = result["meta"] if "meta" in result else None
        if meta is not None and "url" in meta:
            # quote=True (the default) also escapes the single quotes that
            # delimit the href attribute below.
            url = html.escape(str(meta["url"]))
            meta_html = """
                <p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
                <a href='{}' target='_blank'>{}</a></p>""".format(
                url, url
            )
        else:
            meta_html = ""

        rendered.append(
            """{}
            <p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>
            <p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>
            <p style='font-family: Arial;'>{}</p>
            <br>
        """.format(
                meta_html,
                html.escape(str(result["docid"])),
                html.escape(str(result["lang"])),
                tokens_html,
            )
        )
    return "".join(rendered) + "<hr>"


def scisearch(query, language, num_results=10):
    """Send a query to the search backend and return the hits as HTML.

    The backend URL is read from the ``address`` environment variable. The
    query is posted as JSON together with the requested number of results,
    and the response's ``results`` and ``highlight_terms`` entries are
    rendered with ``process_results``.

    Args:
        query: Query text. Runs of whitespace are collapsed to single
            spaces before the query is sent.
        language: Accepted for interface compatibility; it is not included
            in the request sent to the backend.
        num_results: Value sent as ``k`` in the request payload.

    Returns:
        The rendered HTML, or an empty string when ``query`` is ``None`` or
        contains only whitespace.

    Raises:
        requests.HTTPError: If the backend answers with an error status.
    """
    # Check for None first: calling .split() on it would raise.
    if query is None:
        return ""
    query = " ".join(query.split())
    if not query:
        return ""

    post_data = {"query": query, "k": num_results}

    output = requests.post(
        os.environ.get("address"),
        headers={"Content-type": "application/json"},
        data=json.dumps(post_data),
        timeout=60,
    )
    # Surface backend failures directly instead of failing later while
    # parsing an error page as JSON.
    output.raise_for_status()

    payload = output.json()
    return process_results(payload["results"], payload["highlight_terms"])


# Markdown shown at the top of the page: a centered title followed by a short
# introduction to the ROOTS corpus with links to further material.
description = """# <p style="text-align: center;"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). This tool allows
you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages included in
ROOTS. You can read more about the details of the tool design
[here](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf). For more
information and instructions on how to access the full corpus check [this form](https://forms.gle/qyYswbEL5kA23Wu99)."""


if __name__ == "__main__":
    # Build the page: description, query box, result-count slider, submit
    # button and an HTML area for the rendered hits.
    demo = gr.Blocks(
        css=".underline-on-hover:hover { text-decoration: underline; } .flagging { font-size:12px; color:Silver; }"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=1, max_lines=1, placeholder="Type your query here...", label="Query")

        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results")

        def submit(query, k, lang="en"):
            """Run a search for the textbox content and update the results area.

            Returns a mapping from the ``results`` component to its new HTML.
            An empty or missing query clears the results area.
            """
            # Check for None before calling str methods on the value.
            if query is None:
                return {results: ""}
            query = query.strip()
            if query == "":
                # Exactly one output component is registered, so return one
                # value (the original returned two here).
                return {results: ""}
            return {
                results: scisearch(query, lang, k),
            }

        # Pressing Enter in the textbox and clicking the button both search.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    demo.launch(enable_queue=True, debug=True)