Spaces:
Sleeping
Sleeping
File size: 1,580 Bytes
5caedb4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import os
import pandas as pd
from datasets import load_dataset
def download_default_datasets_to_local_folder() -> None:
    """
    Download the default demo datasets and store them as parquet files.

    The target folder is read from the ENV var H2O_LLM_STUDIO_DEMO_DATASETS
    and is created if it does not exist yet. Four files are written into it:

    - causal_language_modeling.pq: train + validation splits of
      "OpenAssistant/oasst2", concatenated with a fresh index.
    - dpo_modeling.pq: train split of "Intel/orca_dpo_pairs".
    - classification_modeling.pq: train split of "stanfordnlp/imdb".
    - regression_modeling.pq: train split of "nvidia/HelpSteer2".

    Raises:
        ValueError: If H2O_LLM_STUDIO_DEMO_DATASETS is unset or empty.
    """
    path = os.environ.get("H2O_LLM_STUDIO_DEMO_DATASETS")
    # An empty value is as unusable as a missing one; reject it here instead
    # of letting os.makedirs("") fail with an unrelated FileNotFoundError.
    if not path:
        raise ValueError("H2O_LLM_STUDIO_DEMO_DATASETS is not set.")

    # exist_ok=True already tolerates an existing directory, so no separate
    # os.path.exists() check is needed. Creating the folder up front also
    # fails fast (before any download) when the path cannot be a directory.
    os.makedirs(path, exist_ok=True)

    # Prepare Causal Language Modeling Dataset (the only one that merges
    # two splits into a single frame).
    ds = load_dataset("OpenAssistant/oasst2")
    train = ds["train"].to_pandas()
    val = ds["validation"].to_pandas()
    df = pd.concat([train, val], axis=0).reset_index(drop=True)
    df.to_parquet(os.path.join(path, "causal_language_modeling.pq"), index=False)

    # Prepare DPO, Classification and Regression Modeling Datasets: each one
    # is just the "train" split written out unchanged.
    train_only_datasets = (
        ("Intel/orca_dpo_pairs", "dpo_modeling.pq"),
        ("stanfordnlp/imdb", "classification_modeling.pq"),
        ("nvidia/HelpSteer2", "regression_modeling.pq"),
    )
    for dataset_name, file_name in train_only_datasets:
        df = load_dataset(dataset_name)["train"].to_pandas()
        df.to_parquet(os.path.join(path, file_name), index=False)
# Script entry point: fetch the demo datasets only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_default_datasets_to_local_folder()
|