# Hugging Face Space: mikonvergence/COP-GEN-Beta (Running on Zero)
# File size: 6,475 Bytes

import os
import pandas as pd
import spaces

# GLOBAL VARIABLES
# Each Major TOM metadata table is read from a local copy under data/ when
# that file exists; otherwise it is read from the dataset repository URL.
# The path that is tested is always the path that is then loaded.
if os.path.isfile('data/s2l2a_metadata.parquet'):
    l2a_meta_path = 'data/s2l2a_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L2A'
    l2a_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/s2l1c_metadata.parquet'):
    l1c_meta_path = 'data/s2l1c_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L1C'
    l1c_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/s1rtc_metadata.parquet'):
    rtc_meta_path = 'data/s1rtc_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S1RTC'
    rtc_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/dem_metadata.parquet'):
    dem_meta_path = 'data/dem_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-DEM'
    dem_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

print('Loading Major TOM meta...')
l2a_df = pd.read_parquet(l2a_meta_path)
l1c_df = pd.read_parquet(l1c_meta_path)
rtc_df = pd.read_parquet(rtc_meta_path)
dem_df = pd.read_parquet(dem_meta_path)

# skip files with missing parts
l2a_df = l2a_df[l2a_df.nodata == 0]
l1c_df = l1c_df[l1c_df.nodata == 0]
rtc_df = rtc_df[rtc_df.nodata == 0]
dem_df = dem_df[dem_df.nodata == 0]

# collect grid_cells, drop duplicates, and extract grid cell column only
# Keep only the L2A rows whose grid cell is also present in the other three
# tables, so every sampled cell can be looked up in all four of them.
grid_cell_df = l2a_df[
    l2a_df.grid_cell.isin(l1c_df.grid_cell)
    & l2a_df.grid_cell.isin(rtc_df.grid_cell)
    & l2a_df.grid_cell.isin(dem_df.grid_cell)
]
# The result must be assigned back to grid_cell_df; otherwise a cell with
# several L2A rows would be over-represented when sampling.
grid_cell_df = grid_cell_df.drop_duplicates(subset=['grid_cell'])
grid_cell_df = grid_cell_df.grid_cell
print('[DONE]')

import pyarrow.parquet as pq
import fsspec
from fsspec.parquet import open_parquet_file
from io import BytesIO
from PIL import Image
import random

def row2image(row, fullrow_read=True):
    """
    Load the thumbnail image referenced by one metadata row.

    Args:
        row: An object with the attributes 'parquet_url' (URL or path of the
            Parquet file) and 'parquet_row' (passed to
            ParquetFile.read_row_group as the row-group index).
        fullrow_read (bool, optional): Selects how the Parquet file is opened.
            Defaults to True.
            - If True, the file is opened with fsspec.open.
            - If False, it is opened with fsspec.parquet.open_parquet_file,
              restricted to the 'thumbnail' column.

    Returns:
        PIL.Image.Image: The image opened from the first 'thumbnail' value of
        the selected row group. The bytes are held in memory, so the image
        does not depend on the Parquet file staying open.
    """
    parquet_row = row.parquet_row
    parquet_url = row.parquet_url

    if fullrow_read:
        # Option 1: open the whole Parquet file
        source = fsspec.open(parquet_url)
    else:
        # Option 2: open only the 'thumbnail' column
        source = open_parquet_file(parquet_url, columns=["thumbnail"])

    # Both kinds of source are context managers. Entering them here makes
    # sure the underlying handle is closed once the row group is read, which
    # ParquetFile does not do for a file object it was handed.
    with source as file_obj, pq.ParquetFile(file_obj) as pf:
        row_group = pf.read_row_group(parquet_row, columns=['thumbnail'])

    # NOTE(review): only the first value is used, which presumes one sample
    # per row group - confirm against the dataset layout.
    stream = BytesIO(row_group['thumbnail'][0].as_py())
    return Image.open(stream)

# Example usage (assuming 'dem_df' is a Pandas DataFrame with the required structure):
# row2image(dem_df.iloc[1000])

def get_rows(grid_cell):
    """
    Retrieves the first row from multiple DataFrames based on a given 'grid_cell' value.

    Args:
        grid_cell: The value to filter the DataFrames by in the 'grid_cell' column.

    Returns:
        tuple: A tuple containing the first matching row from each of the following
               module-level DataFrames, in this order: l2a_df, l1c_df, rtc_df, dem_df.
               Each element of the tuple is a Pandas Series representing a row.

    Raises:
        IndexError: If any of the DataFrames has no row for 'grid_cell'.
    """
    # The order here is the documented contract (L2A, L1C, RTC, DEM) and
    # matches the order in which the tables are loaded and filtered.
    return tuple(
        table[table.grid_cell == grid_cell].iloc[0]
        for table in (l2a_df, l1c_df, rtc_df, dem_df)
    )

def get_images(grid_cell):
    """
    Loads one image per metadata row found for a 'grid_cell'.

    Args:
        grid_cell: The grid cell identifier passed on to get_rows.

    Returns:
        list: The results of calling row2image on each row returned by
              get_rows(grid_cell), in the same order as those rows.
    """
    return [row2image(entry) for entry in get_rows(grid_cell)]

def resize_and_crop(images, image_size=(1068, 1068), crop_size=(256, 256)):
    """
    Resizes every image to a common size and cuts the same random window out of each.

    Args:
        images (list): Objects exposing resize(size) and crop(box), such as
            PIL.Image.Image instances.
        image_size (tuple, optional): (width, height) every image is resized to.
            Defaults to (1068, 1068).
        crop_size (tuple, optional): (width, height) of the crop window.
            Defaults to (256, 256).

    Returns:
        list: One resized-then-cropped image per input image, in input order.
    """
    # The window position is drawn once, before the loop, so that all images
    # are cropped at the same location.
    x0 = random.randint(0, image_size[0] - crop_size[0])
    y0 = random.randint(0, image_size[1] - crop_size[1])
    box = (x0, y0, x0 + crop_size[0], y0 + crop_size[1])

    cropped = []
    for picture in images:
        resized = picture.resize(image_size)
        cropped.append(resized.crop(box))
    return cropped

def sample_shuffle():
    """
    Draws one random entry from 'grid_cell_df' and returns its processed images.

    Takes no arguments.

    Returns:
        list: The output of resize_and_crop (called with its default sizes)
              applied to the images that get_images returns for the sampled
              grid cell.
    """
    # sample() yields a one-element selection; iloc[0] unwraps its value.
    picked = grid_cell_df.sample()
    grid_cell = picked.iloc[0]

    images = get_images(grid_cell)
    return resize_and_crop(images)