Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,475 Bytes
5318c78 b42dee3 5318c78 886e812 5318c78 e8d0048 5318c78 acfd194 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import os
import pandas as pd
import spaces
# GLOBAL VARIABLES
# Each metadata table is read from a local copy under data/ when that file
# exists; otherwise it is fetched from the matching Major TOM dataset repository.
# The path that is tested with os.path.isfile is always the path that is used.
# NOTE(review): the remote host is written with a trailing dot
# ('huggingface.co.') -- confirm that this spelling is intentional.
if os.path.isfile('data/s2l2a_metadata.parquet'):
    l2a_meta_path = 'data/s2l2a_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L2A'
    l2a_meta_path = 'https://huggingface.co./datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/s2l1c_metadata.parquet'):
    l1c_meta_path = 'data/s2l1c_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L1C'
    l1c_meta_path = 'https://huggingface.co./datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/s1rtc_metadata.parquet'):
    rtc_meta_path = 'data/s1rtc_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S1RTC'
    rtc_meta_path = 'https://huggingface.co./datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/dem_metadata.parquet'):
    dem_meta_path = 'data/dem_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-DEM'
    dem_meta_path = 'https://huggingface.co./datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

print('Loading Major TOM meta...')
l2a_df = pd.read_parquet(l2a_meta_path)
l1c_df = pd.read_parquet(l1c_meta_path)
rtc_df = pd.read_parquet(rtc_meta_path)
dem_df = pd.read_parquet(dem_meta_path)

# skip files with missing parts (only rows whose 'nodata' value is 0 are kept)
l2a_df = l2a_df[l2a_df.nodata == 0]
l1c_df = l1c_df[l1c_df.nodata == 0]
rtc_df = rtc_df[rtc_df.nodata == 0]
dem_df = dem_df[dem_df.nodata == 0]

# collect grid_cells present in all four tables, drop duplicates,
# and extract the grid cell column only
grid_cell_df = l2a_df[
    l2a_df.grid_cell.isin(l1c_df.grid_cell)
    & l2a_df.grid_cell.isin(rtc_df.grid_cell)
    & l2a_df.grid_cell.isin(dem_df.grid_cell)
]
# The de-duplicated frame must be bound back to grid_cell_df; otherwise grid
# cells that occur in several rows are over-represented when sampling.
grid_cell_df = grid_cell_df.drop_duplicates(subset=['grid_cell'])
grid_cell_df = grid_cell_df.grid_cell
print('[DONE]')
import pyarrow.parquet as pq
import fsspec
from fsspec.parquet import open_parquet_file
from io import BytesIO
from PIL import Image
import random
def row2image(row, fullrow_read=True):
    """
    Load the thumbnail image referenced by a metadata row.

    Args:
        row: An object exposing the attributes 'parquet_url' (URL or path of the
            Parquet file) and 'parquet_row' (the index handed to
            ``ParquetFile.read_row_group`` to select the data inside that file).
        fullrow_read (bool, optional): Selects how the Parquet file is opened.
            Defaults to True.
            - If True, the file is opened with ``fsspec.open``.
            - If False, ``fsspec.parquet.open_parquet_file`` is used, restricted
              to the 'thumbnail' column.

    Returns:
        PIL.Image.Image: The image decoded from the first 'thumbnail' value of the
        selected row group.
    """
    parquet_row = row.parquet_row
    parquet_url = row.parquet_url

    if fullrow_read:
        # Option 1: generic fsspec handle onto the whole Parquet file
        source = fsspec.open(parquet_url)
    else:
        # Option 2: handle prepared for reading the 'thumbnail' column only
        source = open_parquet_file(parquet_url, columns=["thumbnail"])

    # Both handles are context managers; entering them here guarantees that the
    # underlying file/connection is closed even if reading fails.
    with source as handle, pq.ParquetFile(handle) as pf:
        row_group = pf.read_row_group(parquet_row, columns=['thumbnail'])
        # as_py() copies the bytes out, so they stay valid after the file closes
        thumbnail_bytes = row_group['thumbnail'][0].as_py()

    return Image.open(BytesIO(thumbnail_bytes))
# Example usage (assuming 'dem_df' is a Pandas DataFrame with the required structure):
# row2image(dem_df.iloc[1000])
def get_rows(grid_cell):
    """
    Look up the first metadata row for a grid cell in every source table.

    Args:
        grid_cell: The value compared against the 'grid_cell' column of each table.

    Returns:
        tuple: Four Pandas Series, one per module-level DataFrame, in the order
        l1c_df, l2a_df, rtc_df, dem_df. Each Series is the first row of its
        table whose 'grid_cell' equals the requested value.

    Raises:
        IndexError: If one of the tables has no row for the given grid cell
            (raised by ``.iloc[0]`` on an empty selection).
    """
    tables = (l1c_df, l2a_df, rtc_df, dem_df)
    return tuple(
        table[table.grid_cell == grid_cell].iloc[0]
        for table in tables
    )
def get_images(grid_cell):
    """
    Fetch one image per source table for the given grid cell.

    Args:
        grid_cell: The grid cell identifier passed on to get_rows.

    Returns:
        list: The results of calling row2image (with its default arguments) on
        each row returned by get_rows, in the same order as those rows.
    """
    return [row2image(source_row) for source_row in get_rows(grid_cell)]
def resize_and_crop(images, image_size=(1068, 1068), crop_size=(256, 256)):
    """
    Resize every image to a common size and cut the same random window out of each.

    The crop window is drawn once per call, so all returned images cover the
    same region of their resized inputs.

    Args:
        images (list): Image objects offering ``resize(size)`` and ``crop(box)``
            (e.g. PIL.Image.Image).
        image_size (tuple, optional): Target (width, height) for the resize step.
            Defaults to (1068, 1068).
        crop_size (tuple, optional): (width, height) of the window to cut out.
            Defaults to (256, 256).

    Returns:
        list: One resized-then-cropped image per input, in input order.
    """
    # Draw the horizontal offset first, then the vertical one.
    x0 = random.randint(0, image_size[0] - crop_size[0])
    y0 = random.randint(0, image_size[1] - crop_size[1])
    window = (x0, y0, x0 + crop_size[0], y0 + crop_size[1])

    processed = []
    for picture in images:
        resized = picture.resize(image_size)
        processed.append(resized.crop(window))
    return processed
def sample_shuffle():
    """
    Draw a random grid cell and return its images, resized and cropped.

    Takes no arguments. One entry is sampled from the module-level
    'grid_cell_df', the matching images are fetched with get_images, and the
    result is passed through resize_and_crop with its default sizes.

    Returns:
        list: The images returned by resize_and_crop for the sampled grid cell.
    """
    sampled = grid_cell_df.sample()
    chosen_cell = sampled.iloc[0]
    cell_images = get_images(chosen_cell)
    return resize_and_crop(cell_images)