Spaces:
Sleeping
Sleeping
File size: 4,043 Bytes
e7e4046 c8709ef e7e4046 c8709ef e7e4046 d988b69 e7e4046 c8709ef e7e4046 d988b69 e7e4046 d988b69 e7e4046 c8709ef d988b69 e7e4046 d988b69 e7e4046 c8709ef e7e4046 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
from smolagents import tool
import pandas as pd
@tool
def read_data(file_path: str) -> pd.DataFrame:
    """A tool that reads an Excel or CSV file from a given path and returns a pandas DataFrame.

    The reader is chosen from the file extension, compared case-insensitively.

    Args:
        file_path: The path to the Excel (.xls or .xlsx) or CSV (.csv) file.

    Returns:
        A pandas DataFrame containing the data from the file.

    Raises:
        Exception: If the extension is unsupported or the file cannot be read. The message is prefixed with "Error reading the file: ".
    """
    try:
        lowered_path = file_path.lower()
        if lowered_path.endswith(".csv"):
            return pd.read_csv(file_path)
        # ".xlsx" does not end with ".xls", so both suffixes must be listed.
        if lowered_path.endswith((".xls", ".xlsx")):
            return pd.read_excel(file_path)
        # Raising a plain string is itself a TypeError and would hide this
        # message; raise a real exception so the reason reaches the caller.
        raise ValueError(f"Unsupported file extension: {file_path}")
    except Exception as e:
        # Keep the original contract: every failure surfaces as a generic
        # Exception with a fixed prefix, now chained to the underlying cause.
        raise Exception(f"Error reading the file: {str(e)}") from e
@tool
def get_data_summary(df: pd.DataFrame) -> dict:
    """A tool that reports the size of a DataFrame together with a short preview.

    Args:
        df: A pandas DataFrame.

    Returns:
        A dictionary with "num_rows", "num_columns", and "preview" (the result of df.head() converted with to_dict()).

    Raises:
        Exception: If any step fails. The message is prefixed with "Error in analyzing the dataset: ".
    """
    try:
        summary = {}
        summary["num_rows"] = df.shape[0]
        summary["num_columns"] = df.shape[1]
        first_rows = df.head()
        summary["preview"] = first_rows.to_dict()
    except Exception as exc:
        raise Exception(f"Error in analyzing the dataset: {str(exc)}")
    return summary
import pandas as pd
@tool
def get_dataframe_statistics(data: dict) -> dict:
    """A tool that calculates statistical summaries of tabular data.

    Args:
        data: A dictionary where keys are column names and values are lists of column values.

    Returns:
        A dictionary mapping each described column to its DataFrame.describe() statistics, with NaN entries replaced by None.

    Raises:
        Exception: If building or describing the DataFrame fails. The message is prefixed with "error: ".
    """
    try:
        described = pd.DataFrame(data).describe().to_dict()
        cleaned = {}
        for column, metrics in described.items():
            column_stats = {}
            for metric_name, metric_value in metrics.items():
                # NaN is swapped for None so the result stays JSON compatible.
                if pd.isna(metric_value):
                    column_stats[metric_name] = None
                else:
                    column_stats[metric_name] = metric_value
            cleaned[column] = column_stats
        return cleaned
    except Exception as exc:
        raise Exception(f"error: {str(exc)}")
@tool
def get_missing_values(data: dict) -> dict:
    """A tool that counts missing values per column and expresses them as a percentage of rows.

    Args:
        data: A dictionary where keys are column names and values are lists of column values.

    Returns:
        A dictionary keyed by column name, each value holding "missing_count" and "missing_percentage". On failure, a dictionary with a single "error" key holding the exception message.
    """
    try:
        frame = pd.DataFrame(data)
        total_rows = len(frame)
        null_counts = frame.isnull().sum()
        null_share = (null_counts / total_rows) * 100
        report = {}
        for column in frame.columns:
            report[column] = {
                "missing_count": int(null_counts[column]),
                "missing_percentage": null_share[column],
            }
        return report
    except Exception as exc:
        return {"error": str(exc)}
@tool
def get_duplicate_rows(data: dict) -> dict:
    """A tool that finds duplicate rows in tabular data.

    Args:
        data: A dictionary where keys are column names and values are lists of column values.

    Returns:
        A dictionary with "duplicate_count" (rows that repeat an earlier row) and "duplicate_rows" (every row that has at least one identical twin, as a list of records). On failure, a dictionary with a single "error" key holding the exception message.
    """
    try:
        frame = pd.DataFrame(data)
        # keep=False flags every member of a duplicate group, including the first.
        repeated_mask = frame.duplicated(keep=False)
        repeated_rows = frame[repeated_mask]
        # The default keep="first" counts only the extra copies.
        extra_copies = frame.duplicated().sum()
        result = {"duplicate_count": int(extra_copies)}
        result["duplicate_rows"] = repeated_rows.to_dict(orient="records")
        return result
    except Exception as exc:
        return {"error": str(exc)}
@tool
def get_correlation_matrix(data: dict) -> dict:
    """A tool that calculates the correlation matrix for numerical columns of tabular data.

    Non-numeric columns (for example strings) are left out of the matrix instead of causing the whole calculation to fail.

    Args:
        data: A dictionary where keys are column names and values are lists of column values.

    Returns:
        A dictionary representing the correlation matrix, keyed by column name and then by column name again. On failure, a dictionary with a single "error" key holding the exception message.
    """
    try:
        df = pd.DataFrame(data)
        # Recent pandas versions no longer drop non-numeric columns in corr()
        # and raise on them, so select the numeric (and boolean) columns
        # explicitly before correlating.
        numeric_df = df.select_dtypes(include=["number", "bool"])
        return numeric_df.corr().to_dict()
    except Exception as e:
        return {"error": str(e)}
|