# NOTE(review): the following header lines were scraped Hugging Face Spaces
# page chrome (status, file size, commit hashes, line-number gutter) and were
# not valid Python; converted to a comment so the file parses.
# Inspired by https://huggingface.co./spaces/asoria/duckdb-parquet-demo
from typing import List
import gradio as gr
import duckdb
import pandas as pd
import requests
DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co"
PARQUET_REVISION="refs/convert/parquet"
EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT"
def get_parquet_urls(dataset: str) -> List[str]:
    """Return the parquet file URLs for the first split of *dataset*.

    Queries the datasets-server API twice: ``/splits`` to pick the first
    config/split pair, then ``/parquet`` to list the converted parquet files
    for that config, keeping only files belonging to the chosen split.

    Raises:
        Exception: if either API call fails, the dataset has no splits, or
            no parquet files match the chosen split.
    """
    # BUG FIX: the original never checked the /splits response status and
    # called splits[0] on a possibly-None value, crashing with an opaque
    # TypeError when the API errored.
    splits_response = requests.get(
        f"{DATASETS_SERVER_ENDPOINT}/splits?dataset={dataset}", timeout=60
    )
    if splits_response.status_code != 200:
        raise Exception(
            f"/splits request failed for dataset {dataset!r}: "
            f"HTTP {splits_response.status_code}"
        )
    splits = splits_response.json().get("splits")
    if not splits:
        raise Exception(f"No splits found for dataset {dataset!r}")
    split = splits[0]
    response = requests.get(
        f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={split['config']}",
        timeout=60,
    )
    if response.status_code != 200:
        # BUG FIX: was `raise Exception(response)`, which stringifies as
        # "<Response [...]>" and hides the actual failure.
        raise Exception(
            f"/parquet request failed for dataset {dataset!r}: "
            f"HTTP {response.status_code}"
        )
    parquet_files = response.json()["parquet_files"]
    urls = [content["url"] for content in parquet_files if content["split"] == split["split"]]
    if not urls:
        raise Exception("No parquet files found for dataset")
    return urls
def run_command(query: str) -> pd.DataFrame:
    """Run a BM25 full-text search for *query* against the ``data`` table.

    Returns the matching rows (score, id, instruction, input, output)
    ordered by descending relevance. If the query fails for any reason,
    the error is printed and returned as a one-column ``Error`` DataFrame
    so the UI can display it instead of crashing.
    """
    sql = (
        "SELECT fts_main_data.match_bm25(id, ?) AS score, id, instruction, "
        "input, output FROM data WHERE score IS NOT NULL ORDER BY score DESC;"
    )
    try:
        relation = duckdb.execute(sql, [query])
        print("Ok")
    except Exception as error:
        print(f"Error: {str(error)}")
        return pd.DataFrame({"Error": [f"❌ {str(error)}"]})
    print(relation)
    return relation.df()
def import_data():
    """One-time startup setup: load DuckDB extensions, copy the example
    dataset's first parquet file into a ``data`` table, and build a
    full-text-search index over it.

    Must run before `run_command`, which queries the ``data`` table and the
    ``fts_main_data`` index created here. All state lives in the default
    in-process DuckDB connection (module-level side effects).
    """
    # httpfs lets DuckDB read the parquet file directly from its HTTPS URL;
    # fts provides PRAGMA create_fts_index / match_bm25 used below.
    duckdb.execute("INSTALL 'httpfs';")
    duckdb.execute("LOAD 'httpfs';")
    duckdb.execute("INSTALL 'fts';")
    duckdb.execute("LOAD 'fts';")
    # Debug output: confirm which extensions are installed/loaded.
    duckdb.sql("select * from duckdb_extensions();").show()
    # Import data + index — only the first parquet file of the first split.
    parquet_url = get_parquet_urls(EXAMPLE_DATASET_NAME)[0]
    print("parquet_url", parquet_url)
    duckdb.sql("CREATE SEQUENCE serial START 1;")
    # We need a sequence id column for Full text search
    # I'm very rusty in SQL so it's very possible there are simpler ways.
    # NOTE(review): parquet_url is interpolated directly into the SQL string —
    # assumes the datasets-server URL never contains a single quote; verify.
    duckdb.sql(f"CREATE TABLE data AS SELECT nextval('serial') AS id, * FROM '{parquet_url}';")
    # Index every column ('*') of `data`, keyed by the generated id.
    duckdb.sql("PRAGMA create_fts_index('data', 'id', '*');")
    # Debug output: show the resulting table schema.
    duckdb.sql("DESCRIBE SELECT * FROM data").show()
    print("foo foo")
# Gradio UI: a query textbox wired to `run_command`, with results rendered
# in a DataFrame table. `demo` is launched from the __main__ guard below.
with gr.Blocks() as demo:
    gr.Markdown("""
    ## Full-text search using DuckDB on top of datasets-server Parquet files 🐤
    Inspired by https://huggingface.co./spaces/asoria/duckdb-parquet-demo
    """)
    # BUG FIX: the original statement ended with a stray trailing comma,
    # turning it into a discarded one-element tuple. The widget is
    # display-only for now — only EXAMPLE_DATASET_NAME is ever queried.
    gr.CheckboxGroup(
        label="Dataset",
        choices=["LLMs/Alpaca-ShareGPT"],
        value="LLMs/Alpaca-ShareGPT",
        info="Dataset to query",
    )
    query = gr.Textbox(label="query", placeholder="Full-text search...")
    run_button = gr.Button("Run")
    gr.Markdown("### Result")
    cached_responses_table = gr.DataFrame()
    # Each click runs the BM25 search and shows the resulting DataFrame.
    run_button.click(run_command, inputs=[query], outputs=cached_responses_table)
if __name__ == "__main__":
    # Build the DuckDB table + FTS index once at startup, then serve the UI.
    import_data()
    demo.launch()