COP-GEN-Beta / src /utils.py
mikonvergence
spaces fix
acfd194
import os
import pandas as pd
import spaces
# GLOBAL VARIABLES
# Each metadata table is read from a local copy under data/ when that file
# exists; otherwise it is fetched from the matching Major TOM dataset URL.
# The path that is tested is always the same path that is then used.
if os.path.isfile('data/s2l2a_metadata.parquet'):
    l2a_meta_path = 'data/s2l2a_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L2A'
    l2a_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/s2l1c_metadata.parquet'):
    l1c_meta_path = 'data/s2l1c_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L1C'
    l1c_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/s1rtc_metadata.parquet'):
    rtc_meta_path = 'data/s1rtc_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S1RTC'
    rtc_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/dem_metadata.parquet'):
    dem_meta_path = 'data/dem_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-DEM'
    dem_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

print('Loading Major TOM meta...')
l2a_df = pd.read_parquet(l2a_meta_path)
l1c_df = pd.read_parquet(l1c_meta_path)
rtc_df = pd.read_parquet(rtc_meta_path)
dem_df = pd.read_parquet(dem_meta_path)

# skip files with missing parts (only rows whose 'nodata' value is 0 are kept)
l2a_df = l2a_df[l2a_df.nodata == 0]
l1c_df = l1c_df[l1c_df.nodata == 0]
rtc_df = rtc_df[rtc_df.nodata == 0]
dem_df = dem_df[dem_df.nodata == 0]

# collect grid_cells, drop duplicates, and extract grid cell column only
# Start from the L2A rows whose grid cell also appears in the other three tables.
grid_cell_df = l2a_df[
    l2a_df.grid_cell.isin(l1c_df.grid_cell)
    & l2a_df.grid_cell.isin(rtc_df.grid_cell)
    & l2a_df.grid_cell.isin(dem_df.grid_cell)
]
# Keep one row per grid cell so that each cell appears exactly once in the result.
grid_cell_df = grid_cell_df.drop_duplicates(subset=['grid_cell'])
grid_cell_df = grid_cell_df.grid_cell
print('[DONE]')
import pyarrow.parquet as pq
import fsspec
from fsspec.parquet import open_parquet_file
from io import BytesIO
from PIL import Image
import random
def row2image(row, fullrow_read=True):
    """
    Extracts the thumbnail image referenced by a metadata row.

    Args:
        row: An object with the attributes 'parquet_url' (the URL or path of the
            Parquet file) and 'parquet_row' (passed to read_row_group as the
            index of the row group to read from that file).
        fullrow_read (bool, optional): Selects how the Parquet file is opened.
            Defaults to True.
            - If True, the file is opened with fsspec.open.
            - If False, it is opened with fsspec.parquet.open_parquet_file,
              restricted to the 'thumbnail' column.

    Returns:
        PIL.Image.Image: An Image object opened from the 'thumbnail' bytes of the
        selected row group.
    """
    parquet_row = row.parquet_row
    parquet_url = row.parquet_url

    if fullrow_read:
        # Option 1: generic handle on the Parquet file
        opener = fsspec.open(parquet_url)
    else:
        # Option 2: handle that targets only the 'thumbnail' column
        opener = open_parquet_file(parquet_url, columns=["thumbnail"])

    # Both openers work as context managers, so the handle is closed when the
    # block exits; pq.ParquetFile is not relied upon to close a file object
    # that was handed to it.
    with opener as source, pq.ParquetFile(source) as pf:
        row_group = pf.read_row_group(parquet_row, columns=['thumbnail'])
        # Copy the bytes into memory while the file is still open.
        thumbnail_bytes = row_group['thumbnail'][0].as_py()

    return Image.open(BytesIO(thumbnail_bytes))
# Example usage (assuming 'dem_df' is a Pandas DataFrame with the required structure):
# row2image(dem_df.iloc[1000])
def get_rows(grid_cell):
    """
    Retrieves the first row matching 'grid_cell' from each metadata DataFrame.

    Args:
        grid_cell: The value to look for in the 'grid_cell' column.

    Returns:
        tuple: The first matching row (a Pandas Series) from each of the
        module-level DataFrames, in this order: l2a_df, l1c_df, rtc_df, dem_df.

    Raises:
        IndexError: If one of the DataFrames has no row for 'grid_cell'.
    """
    # The tuple order is part of the contract: L2A first, then L1C, RTC and DEM.
    tables = (l2a_df, l1c_df, rtc_df, dem_df)
    return tuple(df[df.grid_cell == grid_cell].iloc[0] for df in tables)
def get_images(grid_cell):
    """
    Loads one image per row that get_rows returns for 'grid_cell'.

    Args:
        grid_cell: The grid cell identifier passed on to get_rows.

    Returns:
        list: The results of calling row2image on each row, in the same order
        in which get_rows returns the rows.
    """
    return [row2image(source_row) for source_row in get_rows(grid_cell)]
def resize_and_crop(images, image_size=(1068, 1068), crop_size=(256, 256)):
    """
    Resizes every image to 'image_size' and cuts the same random window out of each.

    Args:
        images (list): Objects offering resize(size) and crop(box), e.g. PIL.Image.Image.
        image_size (tuple, optional): The (width, height) each image is resized to.
            Defaults to (1068, 1068).
        crop_size (tuple, optional): The (width, height) of the crop window.
            Defaults to (256, 256).

    Returns:
        list: One resized-and-cropped result per input image, in input order.
    """
    # The window is drawn once, so every image is cropped at the same location.
    x0 = random.randint(0, image_size[0] - crop_size[0])
    y0 = random.randint(0, image_size[1] - crop_size[1])
    box = (x0, y0, x0 + crop_size[0], y0 + crop_size[1])

    cropped = []
    for image in images:
        resized = image.resize(image_size)
        cropped.append(resized.crop(box))
    return cropped
def sample_shuffle():
    """
    Draws one random entry from the module-level grid_cell_df and returns its images.

    The function takes no arguments. The sampled value is passed to get_images,
    and the resulting images go through resize_and_crop with its default sizes.

    Returns:
        list: The output of resize_and_crop for the images of the sampled grid cell.
    """
    sampled = grid_cell_df.sample()
    chosen_cell = sampled.iloc[0]
    cell_images = get_images(chosen_cell)
    return resize_and_crop(cell_images)