import ast
import base64
import math
from io import BytesIO

import torch
from PIL import Image

def select_best_resolution(original_size, possible_resolutions):
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    The best fit maximizes the effective resolution (the area of the image after
    downscaling it to fit the candidate) and, on ties, minimizes the wasted
    (padded) area.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    original_width, original_height = original_size
    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float('inf')

    for width, height in possible_resolutions:
        # Scale factor needed to fit the image inside this candidate resolution.
        scale = min(width / original_width, height / original_height)
        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
        # The image gains nothing from upscaling, so cap at the original area.
        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
        wasted_resolution = (width * height) - effective_resolution

        if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (width, height)

    return best_fit
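
# A minimal usage sketch (candidate resolutions are illustrative, not part of
# this module): for a 1000x800 image, (672, 672) wins because it yields the
# largest downscaled area (672x537) among the candidates.
#
#   select_best_resolution((1000, 800), [(672, 672), (336, 672), (672, 336)])
#   # -> (672, 672)
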
def select_best_resolution_v2(original_size, possible_resolutions):
    """
    Selects the best resolution from a list of possible resolutions based on the original size and aspect ratio.

    The best fit minimizes the aspect-ratio mismatch with the original image
    and, on ties, minimizes the area mismatch.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height).
    """
    original_width, original_height = original_size
    original_aspect_ratio = original_height / original_width
    original_area = original_width * original_height
    best_fit = None
    min_aspect_ratio_diff = float('inf')
    min_area_ratio = float('inf')

    for width, height in possible_resolutions:
        aspect_ratio = height / width
        area = width * height
        # Both ratios are taken as max/min, so 1.0 means a perfect match.
        aspect_ratio_diff = max(aspect_ratio, original_aspect_ratio) / min(aspect_ratio, original_aspect_ratio)
        area_ratio = max(area, original_area) / min(area, original_area)

        if aspect_ratio_diff < min_aspect_ratio_diff or (aspect_ratio_diff == min_aspect_ratio_diff and area_ratio < min_area_ratio):
            min_aspect_ratio_diff = aspect_ratio_diff
            min_area_ratio = area_ratio
            best_fit = (width, height)

    return best_fit
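
# A minimal sketch of the v2 criterion (same illustrative candidates): for a
# wide 1000x200 image, (672, 336) wins because its 0.5 aspect ratio is the
# closest to the image's 0.2, regardless of how much area would be wasted.
#
#   select_best_resolution_v2((1000, 200), [(672, 672), (336, 672), (672, 336)])
#   # -> (672, 336)
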
def resize_and_pad_image(image, target_resolution, keep_ratio=False):
    """
    Resize and pad an image to a target resolution.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The target resolution (width, height) of the image.
        keep_ratio (bool): If True, preserve the aspect ratio and pad the
            remainder with black; if False, stretch to the target resolution.

    Returns:
        PIL.Image.Image: The resized and padded image.
    """
    original_width, original_height = image.size
    target_width, target_height = target_resolution

    if keep_ratio:
        # Maintain the aspect ratio: scale by the tighter dimension.
        scale_w = target_width / original_width
        scale_h = target_height / original_height
        if scale_w < scale_h:
            new_width = target_width
            new_height = min(math.ceil(original_height * scale_w), target_height)
        else:
            new_height = target_height
            new_width = min(math.ceil(original_width * scale_h), target_width)

        # Resize the image and paste it centered on a black canvas.
        resized_image = image.resize((new_width, new_height))
        new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
        paste_x = (target_width - new_width) // 2
        paste_y = (target_height - new_height) // 2
        new_image.paste(resized_image, (paste_x, paste_y))
    else:
        # Stretch to the target resolution without preserving the aspect ratio.
        new_image = image.resize((target_width, target_height))

    return new_image
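
# A hedged example ('cat.jpg' is a hypothetical input path): with
# keep_ratio=True a 1000x800 image is resized to 672x538 and centered on a
# 672x672 black canvas; with keep_ratio=False it is simply stretched.
#
#   img = Image.open('cat.jpg')
#   padded = resize_and_pad_image(img, (672, 672), keep_ratio=True)
#   padded.size  # -> (672, 672)
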
def divide_to_patches(image, patch_size):
    """
    Divides an image into patches of a specified size.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The size of each patch.

    Returns:
        list: A list of PIL.Image.Image objects representing the patches.
    """
    patches = []
    width, height = image.size
    for i in range(0, height, patch_size):
        for j in range(0, width, patch_size):
            box = (j, i, j + patch_size, i + patch_size)
            patch = image.crop(box)
            patches.append(patch)
    return patches
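
# A minimal sketch: a 672x672 image with patch_size=336 yields four 336x336
# patches, ordered row-major (left-to-right, then top-to-bottom).
#
#   patches = divide_to_patches(Image.new('RGB', (672, 672)), 336)
#   len(patches)  # -> 4
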
def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (str): A string representation of a list of possible resolutions.
        patch_size (int): The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if isinstance(grid_pinpoints, list):
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    # Run both selection strategies and keep the candidate with the smaller area.
    width1, height1 = select_best_resolution(image_size, possible_resolutions)
    width2, height2 = select_best_resolution_v2(image_size, possible_resolutions)
    if width1 * height1 > width2 * height2:
        width, height = width2, height2
    else:
        width, height = width1, height1
    return width // patch_size, height // patch_size
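
# A hedged sketch (grid pinpoints are illustrative): for a 1000x200 image both
# selectors agree on (672, 336), so with patch_size=336 the grid is 2 patches
# wide by 1 patch tall.
#
#   get_anyres_image_grid_shape((1000, 200), "[(672, 672), (336, 672), (672, 336)]", 336)
#   # -> (2, 1)
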
def process_anyres_image(image, image_transform, grid_pinpoints, base_image_size):
    """
    Process an image with variable resolutions.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        image_transform: The image processor object.
        grid_pinpoints (str): A string representation of a list of possible resolutions.
        base_image_size (int): The side length of each square patch.

    Returns:
        tuple: A tensor of shape (num_patches + 1, ...) containing the processed
            image patches (the last entry is the full image resized to the base
            size), and a (num_patches + 1, 2) tensor of normalized patch-center
            positions.
    """
    if isinstance(grid_pinpoints, list):
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    # Run both selection strategies and keep the candidate with the smaller area.
    width1, height1 = select_best_resolution(image.size, possible_resolutions)
    width2, height2 = select_best_resolution_v2(image.size, possible_resolutions)
    if width1 * height1 > width2 * height2:
        width, height = width2, height2
    else:
        width, height = width1, height1
    best_resolution = (width, height)

    image_padded = resize_and_pad_image(image, best_resolution)
    patches = divide_to_patches(image_padded, base_image_size)

    image_original_resize = image.resize((base_image_size, base_image_size))
    image_patches = patches + [image_original_resize]  # add the original image as the last patch
    image_patches = [image_transform(image_patch) for image_patch in image_patches]

    # Normalized (x, y) center coordinates of each grid patch, in [0, 1].
    patch_grid = (best_resolution[0] // base_image_size, best_resolution[1] // base_image_size)
    x_index = (torch.arange(patch_grid[0]).repeat(patch_grid[1], 1) + 0.5) / patch_grid[0]
    y_index = (torch.arange(patch_grid[1]).unsqueeze(1).repeat(1, patch_grid[0]) + 0.5) / patch_grid[1]
    patch_pos = torch.stack([x_index, y_index], dim=-1).flatten(0, 1)  # h*w, 2
    # The resized full image is treated as a patch centered at (0.5, 0.5).
    origin_pos = torch.tensor([[0.5, 0.5]])
    patch_pos = torch.cat([patch_pos, origin_pos], dim=0)  # h*w+1, 2

    return torch.stack(image_patches, dim=0), patch_pos
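
# A minimal end-to-end sketch, assuming a torchvision-style transform that maps
# a PIL image to a CxHxW tensor (the transform below is an assumption, not part
# of this module):
#
#   from torchvision import transforms
#   to_tensor = transforms.Compose([transforms.Resize((336, 336)),
#                                   transforms.ToTensor()])
#   pixels, pos = process_anyres_image(img, to_tensor,
#                                      "[(672, 672), (336, 672), (672, 336)]", 336)
#   # pixels: (num_patches + 1, 3, 336, 336); pos: (num_patches + 1, 2)
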
def load_image_from_base64(image):
    return Image.open(BytesIO(base64.b64decode(image)))
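
# Round-trip sketch: encode a PIL image to base64 and load it back.
#
#   buf = BytesIO()
#   Image.new('RGB', (8, 8)).save(buf, format='PNG')
#   img = load_image_from_base64(base64.b64encode(buf.getvalue()))
#   img.size  # -> (8, 8)
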
def anyres_data_collate(batch, tokenizer, dataset_name=None):
    results = {}
    keys = batch[0].keys()

    for key in keys:
        # Drop None entries so partially missing fields do not break collation.
        cur = [batch[i][key] for i in range(len(batch)) if batch[i][key] is not None]
        if len(cur) == 0:
            results[key] = None
        elif isinstance(cur[0], torch.Tensor):
            if key in ['embeds_gen_mask', 'embeds_cmp_mask', 'images', 'images_patch_length', 'patch_position', 'image_size']:
                # Per-sample patch counts vary, so concatenate along the first dim.
                results[key] = torch.cat(cur, dim=0)
            else:
                # Variable-length sequences are right-padded to the batch maximum.
                if key in ['input_ids']:
                    results[key] = torch.nn.utils.rnn.pad_sequence(cur, batch_first=True, padding_value=tokenizer.pad_token_id)
                elif key in ['attention_mask']:
                    results[key] = torch.nn.utils.rnn.pad_sequence(cur, batch_first=True, padding_value=0)
                elif key in ['labels']:
                    # -100 is ignored by PyTorch's cross-entropy loss.
                    results[key] = torch.nn.utils.rnn.pad_sequence(cur, batch_first=True, padding_value=-100)
                elif key in ['ids_gen_mask', 'ids_cmp_mask']:
                    results[key] = torch.nn.utils.rnn.pad_sequence(cur, batch_first=True, padding_value=False)
                else:
                    results[key] = torch.stack(cur, dim=0)
        else:
            results[key] = cur

    results['dataset_name'] = dataset_name

    return results
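
# A hedged usage sketch with a PyTorch DataLoader ('dataset' and 'tokenizer'
# are assumptions; any tokenizer exposing pad_token_id should work):
#
#   from functools import partial
#   from torch.utils.data import DataLoader
#   loader = DataLoader(dataset, batch_size=4,
#                       collate_fn=partial(anyres_data_collate,
#                                          tokenizer=tokenizer,
#                                          dataset_name='caption'))
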
def anyres_data_collate_old(batch, dataset_name=None):
    results = {}
    keys = batch[0].keys()

    for key in keys:
        cur = [batch[i][key] for i in range(len(batch)) if batch[i][key] is not None]
        if len(cur) == 0:
            results[key] = None
        elif isinstance(cur[0], torch.Tensor):
            if key in ['embeds_gen_mask', 'embeds_cmp_mask', 'images', 'images_patch_length', 'patch_position', 'image_size']:
                results[key] = torch.cat(cur, dim=0)
            else:
                results[key] = torch.stack(cur, dim=0)
        else:
            results[key] = cur

    results['dataset_name'] = dataset_name

    return results