import os
import time
from multiprocessing import Pool

from huggingface_hub import Repository, create_repo
from tqdm import tqdm


def save_shard(shard_tuple):
    """Save a single shard to disk."""
    filename, shard = shard_tuple
    # use shard.to_json(filename) instead to save as a JSON file
    shard.to_parquet(filename)


def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj"):
    """Save a dataset as parquet shards inside a local clone of a Hub dataset repo.

    Args:
        ds (Dataset): dataset to be saved
        user (str): Hub user name
        remote_dataset_repo (str): name of the remote dataset repository
    """
    # this creates a folder out_path that is a clone of remote_dataset_repo;
    # save the shards inside it, then git add/commit/push to push the data to the Hub
    out_path = remote_dataset_repo
    # if out_path doesn't already exist, create the remote repo and clone it
    if not os.path.exists(out_path):
        repo_id = f"{user}/{remote_dataset_repo}"
        # Repository has no create_repo method, so create the repo on the Hub
        # first (a no-op if it already exists), then clone it locally
        create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)
        repo = Repository(
            local_dir=out_path,
            clone_from=repo_id,
            repo_type="dataset",
            git_user=user,
        )
    # the files will be numerous, so we save them in a data/ folder inside out_path
    os.mkdir(out_path + "/data")

    # target shard size: 1000 MiB (~1 GB)
    SHARD_SIZE = 1000 << 20
    if ds._indices is not None:
        # the dataset is a view on its Arrow table, so scale the byte size
        # by the fraction of rows actually selected
        dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data)
    else:
        dataset_nbytes = ds.data.nbytes
    num_shards = int(dataset_nbytes / SHARD_SIZE) + 1
    print(f"Number of shards: {num_shards}")

    print("sharding the dataset")
    t_start = time.time()
    shards = (
        ds.shard(num_shards=num_shards, index=i, contiguous=True)
        for i in range(num_shards)
    )
    # use f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files
    filenames = (
        f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet"
        for index in range(num_shards)
    )

    with Pool(16) as p:
        list(
            tqdm(
                p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4),
                total=num_shards,
            )
        )
    print(f"Time to save dataset: {time.time()-t_start:.2f}s")
    # to push the dataset to the Hub, run git add/commit/push inside out_path
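

# Usage sketch (an assumption, not part of the original script): load a dataset
# previously saved with `datasets`' save_to_disk and shard it into the repo clone.
# "path/to/local_dataset" is a placeholder path.
if __name__ == "__main__":
    from datasets import load_from_disk

    ds = load_from_disk("path/to/local_dataset")
    save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj")
    # then, inside bigcode-pii-pjj: git add data && git commit -m "add shards" && git push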