File size: 6,475 Bytes
5318c78
 
b42dee3
5318c78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
886e812
 
5318c78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8d0048
5318c78
 
 
 
 
 
 
 
 
 
 
 
 
 
acfd194
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import pandas as pd
import spaces

# GLOBAL VARIABLES
# For every Major TOM source, prefer a local copy of the metadata under
# ``data/`` and fall back to the parquet file hosted on the Hugging Face Hub.
# The path that is tested must be the path that is assigned, otherwise the
# local copy is either never used or a non-existent file is handed to pandas.
# NOTE(review): the host is spelled with a trailing dot ('huggingface.co.');
# presumably deliberate - confirm before normalising it.
if os.path.isfile('data/s2l2a_metadata.parquet'):
    l2a_meta_path = 'data/s2l2a_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L2A'
    l2a_meta_path = 'https://huggingface.co./datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/s2l1c_metadata.parquet'):
    l1c_meta_path = 'data/s2l1c_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L1C'
    l1c_meta_path = 'https://huggingface.co./datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

# Fixed: the check used '/s1rtc_metadata.parquet' (filesystem root), so the
# local file under data/ was never picked up.
if os.path.isfile('data/s1rtc_metadata.parquet'):
    rtc_meta_path = 'data/s1rtc_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S1RTC'
    rtc_meta_path = 'https://huggingface.co./datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

# Fixed: the check used 'helpers/dem_metadata.parquet' while the assigned path
# points into data/; both now refer to the same file.
if os.path.isfile('data/dem_metadata.parquet'):
    dem_meta_path = 'data/dem_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-DEM'
    dem_meta_path = 'https://huggingface.co./datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

print('Loading Major TOM meta...')
l2a_df = pd.read_parquet(l2a_meta_path)
l1c_df = pd.read_parquet(l1c_meta_path)
rtc_df = pd.read_parquet(rtc_meta_path)
dem_df = pd.read_parquet(dem_meta_path)

# skip files with missing parts
l2a_df = l2a_df[l2a_df.nodata == 0]
l1c_df = l1c_df[l1c_df.nodata == 0]
rtc_df = rtc_df[rtc_df.nodata == 0]
dem_df = dem_df[dem_df.nodata == 0]

# collect grid_cells, drop duplicates, and extract grid cell column only
# Keep only the L2A rows whose grid cell is present in all three other sources.
grid_cell_df = l2a_df[
    l2a_df.grid_cell.isin(l1c_df.grid_cell)
    & l2a_df.grid_cell.isin(rtc_df.grid_cell)
    & l2a_df.grid_cell.isin(dem_df.grid_cell)
]
# Fixed: the result used to be bound to the misspelled name 'gird_cell_df',
# so the de-duplication was silently thrown away.
grid_cell_df = grid_cell_df.drop_duplicates(subset=['grid_cell'])
grid_cell_df = grid_cell_df.grid_cell
print('[DONE]')

import pyarrow.parquet as pq
import fsspec
from fsspec.parquet import open_parquet_file
from io import BytesIO
from PIL import Image
import random

def row2image(row, fullrow_read=True):
    """
    Extracts the thumbnail image stored for one metadata row.

    Args:
        row: A row object exposing the attributes 'parquet_url' (URL or path of the
            Parquet file) and 'parquet_row' (index handed to ``read_row_group``).
            NOTE(review): 'parquet_row' is used as a row-group index, which presumably
            relies on each row being written as its own row group - confirm.
        fullrow_read (bool, optional): Selects how the Parquet file is opened.
            Defaults to True.
            - If True, the file is opened as a plain file object via ``fsspec.open``.
            - If False, ``fsspec.parquet.open_parquet_file`` is used, restricted to the
              'thumbnail' column.
            In both cases only the 'thumbnail' column of the requested row group is read.

    Returns:
        PIL.Image.Image: An Image object opened from the 'thumbnail' bytes of the row.
    """
    parquet_row = row.parquet_row
    parquet_url = row.parquet_url

    if fullrow_read:
        # Option 1: plain file object over the whole Parquet file
        source = fsspec.open(parquet_url).open()
    else:
        # Option 2: file object limited to the 'thumbnail' column
        source = open_parquet_file(parquet_url, columns=["thumbnail"])

    try:
        with pq.ParquetFile(source) as pf:
            row_group = pf.read_row_group(parquet_row, columns=['thumbnail'])
    finally:
        # ParquetFile does not close a file object it did not open itself, so the
        # handle is released explicitly to avoid leaking it on every call.
        source.close()

    # The thumbnail bytes are already in memory, so the image can be opened after
    # the underlying file has been closed.
    stream = BytesIO(row_group['thumbnail'][0].as_py())
    return Image.open(stream)

# Example usage (assuming 'dem_df' is a Pandas DataFrame with the required structure):
# row2image(dem_df.iloc[1000])

def get_rows(grid_cell):
    """
    Looks up the first metadata row for a grid cell in each of the four source tables.

    Args:
        grid_cell: Value compared against the 'grid_cell' column of every DataFrame.

    Returns:
        tuple: Four Pandas Series, one per module-level DataFrame, in the order
               l1c_df, l2a_df, rtc_df, dem_df. Each Series is the first row of that
               DataFrame whose 'grid_cell' equals the given value.
    """
    # Order matters: callers receive the rows positionally.
    sources = (l1c_df, l2a_df, rtc_df, dem_df)
    return tuple(
        frame[frame.grid_cell == grid_cell].iloc[0]
        for frame in sources
    )

def get_images(grid_cell):
    """
    Loads the images that belong to one grid cell.

    Args:
        grid_cell: The grid cell identifier passed on to get_rows.

    Returns:
        list: One image per row returned by get_rows, each produced by row2image,
              in the same order as those rows.
    """
    return [row2image(entry) for entry in get_rows(grid_cell)]

def resize_and_crop(images, image_size=(1068, 1068), crop_size=(256, 256)):
    """
    Resizes every image to a common size and cuts the same random window out of each.

    The crop window is drawn once per call, so all returned images cover the same
    region of the resized frame.

    Args:
        images (list): Image objects offering ``resize`` and ``crop`` (PIL-style).
        image_size (tuple, optional): Size (width, height) every image is resized to.
            Defaults to (1068, 1068).
        crop_size (tuple, optional): Size (width, height) of the crop window.
            Defaults to (256, 256).

    Returns:
        list: The resized-then-cropped images, in input order.
    """
    crop_w = crop_size[0]
    crop_h = crop_size[1]

    # Pick the upper-left corner so that the window stays inside the resized image.
    x0 = random.randint(0, image_size[0] - crop_w)
    y0 = random.randint(0, image_size[1] - crop_h)
    window = (x0, y0, x0 + crop_w, y0 + crop_h)

    cropped = []
    for picture in images:
        resized = picture.resize(image_size)
        cropped.append(resized.crop(window))
    return cropped

def sample_shuffle():
    """
    Draws one random grid cell and returns its images, resized and cropped.

    Takes no arguments. One entry is sampled from the module-level 'grid_cell_df',
    its images are fetched with get_images, and the result is passed through
    resize_and_crop with that function's default sizes.

    Returns:
        list: The resized and cropped images for the sampled grid cell.
    """
    picked = grid_cell_df.sample()
    cell_id = picked.iloc[0]

    cell_images = get_images(cell_id)
    return resize_and_crop(cell_images)