ICIJ_SQL_agent / app.py
Didier's picture
Update app.py
9a919aa verified
raw
history blame
4.75 kB
"""app.py
Smolagents agent given an SQL tool over a SQLite database built with data files
from the Internation Consortium of Investigative Journalism (ICIJ.org).
Agentic framework:
- smolagents
Database:
- SQLite
Generation:
- Mistral
:author: Didier Guillevic
:date: 2025-01-12
"""
import gradio as gr
import icij_utils
import smolagents
import os
import pathlib
#
# Init a SQLite database with the data files from ICIJ.org
#
ICIJ_LEAKS_DB_NAME = 'icij_leaks.db'
ICIJ_LEAKS_DATA_DIR = './icij_data'
# Remove existing database (if present), since we will recreate it below.
icij_db_path = pathlib.Path(ICIJ_LEAKS_DB_NAME)
icij_db_path.unlink(missing_ok=True)
# Load ICIJ data files into an SQLite database
loader = icij_utils.ICIJDataLoader(ICIJ_LEAKS_DB_NAME)
loader.load_all_files(ICIJ_LEAKS_DATA_DIR)
#
# Init an SQLAchemy instane (over the SQLite database)
#
db = icij_utils.ICIJDatabaseConnector(ICIJ_LEAKS_DB_NAME)
schema = db.get_full_database_schema()
#
# Build an SQL tool
#
schema = db.get_full_database_schema()
metadata = icij_utils.ICIJDatabaseMetadata()
tool_description = (
"Tool for querying the ICIJ offshore database containing financial data leaks. "
"This tool can execute SQL queries and return the results. "
"Beware that this tool's output is a string representation of the execution output.\n"
"It can use the following tables:"
)
# Add table documentation
for table, doc in metadata.TABLE_DOCS.items():
tool_description += f"\n\nTable: {table}\n"
tool_description += f"Description: {doc.strip()}\n"
tool_description += "Columns:\n"
# Add column documentation and types
if table in schema:
for col_name, col_type in schema[table].items():
col_doc = metadata.COLUMN_DOCS.get(table, {}).get(col_name, "No documentation available")
#tool_description += f" - {col_name}: {col_type}: {col_doc}\n"
tool_description += f" - {col_name}: {col_type}\n"
# Add source documentation
#tool_description += "\n\nSource IDs:\n"
#for source_id, descrip in metadata.SOURCE_IDS.items():
# tool_description += f"- {source_id}: {descrip}\n"
@smolagents.tool
def sql_tool(query: str) -> str:
"""Description to be set beloiw...
Args:
query: The query to perform. This should be correct SQL.
"""
output = ""
with db.get_engine().connect() as con:
rows = con.execute(sqlalchemy.text(query))
for row in rows:
output += "\n" + str(row)
return output
sql_tool.description = tool_description
#
# language models
#
default_model = smolagents.HfApiModel()
mistral_api_key = os.environ["MISTRAL_API_KEY"]
mistral_model_id = "mistral/codestral-latest"
mistral_model = smolagents.LiteLLMModel(
model_id=mistral_model_id, api_key=mistral_api_key)
#
# Define the agent
#
agent = smolagents.CodeAgent(
tools=[sql_tool],
model=mistral_model
)
def generate_response(query: str) -> str:
"""Generate a response given query.
Args:
Returns:
- the response from the agent having access to a database over the ICIJ
data and a large language model.
"""
agent_output = agent.run(query)
return agent_output
#
# User interface
#
with gr.Blocks() as demo:
gr.Markdown("""
# SQL agent
Database: ICIJ data on offshore financial data leaks.
""")
# Inputs: question
question = gr.Textbox(
label="Question to answer",
placeholder=""
)
# Response
response = gr.Textbox(
label="Response",
placeholder=""
)
# Button
with gr.Row():
response_button = gr.Button("Submit", variant='primary')
clear_button = gr.Button("Clear", variant='secondary')
# Example questions given default provided PDF file
with gr.Accordion("Sample questions", open=False):
gr.Examples(
[
["",],
["",],
],
inputs=[question,],
outputs=[response,],
fn=generate_response,
cache_examples=False,
label="Sample questions"
)
# Documentation
with gr.Accordion("Documentation", open=False):
gr.Markdown("""
- Agentic framework: smolagents
- Data: icij.org
- Database: SQLite, SQLAlchemy
- Generation: Mistral
- Examples: Generated using Claude.ai
""")
# Click actions
response_button.click(
fn=generate_response,
inputs=[question,],
outputs=[response,]
)
clear_button.click(
fn=lambda: ('', ''),
inputs=[],
outputs=[question, response]
)
demo.launch(show_api=False)