Spaces:

lilkm
/

Text2Cryptopunks

Build error

Text2Cryptopunks / text2punks /loader.py

Khalil

First commit, add text2punks scripts, app file, and requirements file

b41a54a over 3 years ago

3.32 kB

	import numpy as np
	from PIL import Image, UnidentifiedImageError

	from pathlib import Path
	from random import randint, choice

	import torch
	from torch.utils.data import Dataset


	class TextImageDataset(Dataset):
	    """Dataset pairing caption text files with image files found under a folder.

	    A caption file and an image file belong together when their paths share the
	    same "stem" (file name without extension). Each item is a
	    ``(tokenized_text, tokenized_image)`` tuple.
	    """

	    def __init__(self,
	                 folder,
	                 text_len=40,
	                 truncate_captions=False,
	                 text_tokenizer=None,
	                 image_tokenizer=None,
	                 shuffle=False
	                 ):
	        """
	        @param folder: Folder containing images and text files matched by their paths' respective "stem"
	        @param text_len: Length passed to the text tokenizer for every caption.
	        @param truncate_captions: Rather than throw an exception, captions which are too long will be truncated.
	        @param text_tokenizer: Object exposing ``tokenize(text, text_len, truncate_text=...)``
	            whose result supports ``.squeeze(0)``.
	        @param image_tokenizer: Mapping indexed by ``str(pixel)``, where ``pixel`` is one
	            RGB row of the flattened image array.
	        @param shuffle: When True, an unreadable sample is replaced by a random one;
	            otherwise by the next index (wrapping around).
	        """
	        super().__init__()
	        self.shuffle = shuffle
	        path = Path(folder)

	        # Search the folder recursively. When two files share a stem, the one
	        # visited last wins (later image patterns override earlier ones).
	        text_files = {
	            text_file.stem: text_file for text_file in path.glob('**/*.txt')
	        }
	        image_files = {
	            image_file.stem: image_file
	            for pattern in ('**/*.png', '**/*.jpg', '**/*.jpeg', '**/*.bmp')
	            for image_file in path.glob(pattern)
	        }

	        # Only stems that have both a caption and an image are usable.
	        keys = image_files.keys() & text_files.keys()

	        # Sorted so that an index maps to the same sample on every run.
	        self.keys = sorted(keys)
	        self.text_files = {k: v for k, v in text_files.items() if k in keys}
	        self.image_files = {k: v for k, v in image_files.items() if k in keys}
	        self.text_len = text_len
	        self.truncate_captions = truncate_captions
	        self.text_tokenizer = text_tokenizer
	        self.image_tokenizer = image_tokenizer

	    def __len__(self):
	        """Return the number of caption/image pairs."""
	        return len(self.keys)

	    def random_sample(self):
	        """Return the item at a uniformly random index."""
	        return self.__getitem__(randint(0, self.__len__() - 1))

	    def sequential_sample(self, ind):
	        """Return the item following ``ind``, wrapping to index 0 at the end."""
	        if ind >= self.__len__() - 1:
	            return self.__getitem__(0)
	        return self.__getitem__(ind + 1)

	    def skip_sample(self, ind):
	        """Return a replacement item for the unusable sample at ``ind``.

	        NOTE: replacements are fetched through ``__getitem__``, which calls back
	        into this method on failure; if every sample is unusable the recursion
	        does not terminate on its own.
	        """
	        if self.shuffle:
	            return self.random_sample()
	        return self.sequential_sample(ind=ind)

	    def __getitem__(self, ind):
	        """Return ``(tokenized_text, tokenized_image)`` for index ``ind``.

	        One non-empty line of the caption file is chosen at random as the
	        description. If the caption file has no non-empty line, or the image
	        cannot be opened/decoded, a message is printed and a replacement
	        sample from ``skip_sample`` is returned instead.
	        """
	        key = self.keys[ind]

	        text_file = self.text_files[key]
	        image_file = self.image_files[key]

	        descriptions = [
	            line for line in text_file.read_text().split('\n') if line
	        ]
	        if not descriptions:
	            print(f"An exception occurred trying to load file {text_file}.")
	            print(f"Skipping index {ind}")
	            return self.skip_sample(ind)
	        description = choice(descriptions)

	        tokenized_text = self.text_tokenizer.tokenize(
	            description,
	            self.text_len,
	            truncate_text=self.truncate_captions
	        ).squeeze(0)
	        try:
	            # The context manager closes the underlying file once the RGB
	            # copy has been made.
	            with Image.open(image_file) as raw_image:
	                image = raw_image.convert('RGB')
	            # One row per pixel: (height * width, 3).
	            pixels = np.array(image).reshape(-1, 3)

	            # Each pixel is looked up by the string form of its RGB row.
	            tokenized_image = [self.image_tokenizer[str(idx)] for idx in pixels]
	            tokenized_image = torch.tensor(tokenized_image)
	        except (UnidentifiedImageError, OSError):
	            print(f"An exception occurred trying to load file {image_file}.")
	            print(f"Skipping index {ind}")
	            return self.skip_sample(ind)

	        # Success
	        return tokenized_text, tokenized_image