
Commit a18e08e
Added scene image sampler 🌉
1 parent: ceabef4

File tree: 6 files changed, +272 -15 lines

configs/coco_scene_images_transformer.yaml (8 additions, 5 deletions)

@@ -8,9 +8,12 @@ model:
       params:
         vocab_size: 8192
         block_size: 348 # = 256 + 92 = dim(vqgan_latent_space,16x16) + dim(conditional_builder.embedding_dim)
-        n_layer: 32
+        n_layer: 40
         n_head: 16
-        n_embd: 912
+        n_embd: 1408
+        embd_pdrop: 0.1
+        resid_pdrop: 0.1
+        attn_pdrop: 0.1
     first_stage_config:
       target: taming.models.vqgan.VQModel
       params:
@@ -59,7 +62,7 @@ data:
         crop_method: random-1d
         random_flip: true
         use_group_parameter: true
-        encode_crop: true
+        encode_crop: false
     validation:
       target: taming.data.annotated_objects_coco.AnnotatedObjectsCoco
       params:
@@ -71,7 +74,7 @@ data:
         min_object_area: 0.00001
         min_objects_per_image: 2
         max_objects_per_image: 30
-        crop_method: random-1d
-        random_flip: true
+        crop_method: center
+        random_flip: false
         use_group_parameter: true
         encode_crop: true
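The transformer in this config grows from 32 layers with 912-dim embeddings to 40 layers with 1408-dim embeddings, and explicit dropout rates are added; block_size stays at 348, so the extra capacity comes entirely from depth and width. A rough back-of-the-envelope sketch of what that means for model size (not a measurement; it uses the standard 12 * n_layer * n_embd^2 estimate for a GPT-style decoder and ignores embedding and output-head parameters):

def approx_gpt_params(n_layer: int, n_embd: int) -> int:
    # ~4*d^2 attention weights + ~8*d^2 MLP weights per block, biases ignored
    return 12 * n_layer * n_embd ** 2

print(f"old: {approx_gpt_params(32, 912) / 1e6:.0f}M")   # ~319M
print(f"new: {approx_gpt_params(40, 1408) / 1e6:.0f}M")  # ~952M

So the scene-image transformer is roughly three times larger than the previous configuration.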

main.py (6 additions, 3 deletions)

@@ -11,6 +11,9 @@
 from pytorch_lightning.callbacks import ModelCheckpoint, Callback, LearningRateMonitor
 from pytorch_lightning.utilities.distributed import rank_zero_only
 
+from taming.data.utils import custom_collate
+
+
 def get_obj_from_str(string, reload=False):
     module, cls = string.rsplit(".", 1)
     if reload:
@@ -160,16 +163,16 @@ def setup(self, stage=None):
 
     def _train_dataloader(self):
         return DataLoader(self.datasets["train"], batch_size=self.batch_size,
-                          num_workers=self.num_workers, shuffle=True)
+                          num_workers=self.num_workers, shuffle=True, collate_fn=custom_collate)
 
     def _val_dataloader(self):
         return DataLoader(self.datasets["validation"],
                           batch_size=self.batch_size,
-                          num_workers=self.num_workers)
+                          num_workers=self.num_workers, collate_fn=custom_collate)
 
     def _test_dataloader(self):
         return DataLoader(self.datasets["test"], batch_size=self.batch_size,
-                          num_workers=self.num_workers)
+                          num_workers=self.num_workers, collate_fn=custom_collate)
 
 
 class SetupCallback(Callback):
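Why the dataloaders need a custom collate function: PyTorch's default_collate tries to stack every field of a batch into tensors, which breaks for per-image lists of Annotation tuples whose lengths differ between images. The scene datasets need those lists passed through untouched (make_scene_samples.py below iterates batch['annotations'] per image). A minimal sketch of the pattern, where MyAnnotation and ToyScenes are hypothetical stand-ins for taming.data.helper_types.Annotation and the COCO dataset, and collate_keep_annotation_lists only illustrates the idea (the actual custom_collate added in taming/data/utils.py below is a modified copy of PyTorch's default collate):

import collections.abc
from typing import NamedTuple

import torch
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataloader import default_collate


class MyAnnotation(NamedTuple):  # hypothetical stand-in for helper_types.Annotation
    category_no: int
    bbox: tuple


def collate_keep_annotation_lists(batch):
    """Like default_collate, but leaves per-sample lists of MyAnnotation as plain lists."""
    elem = batch[0]
    if isinstance(elem, collections.abc.Mapping):
        return {k: collate_keep_annotation_lists([d[k] for d in batch]) for k in elem}
    if isinstance(elem, collections.abc.Sequence) and elem and isinstance(elem[0], MyAnnotation):
        return list(batch)  # one annotation list per sample; lengths may differ
    return default_collate(batch)


class ToyScenes(Dataset):
    def __len__(self):
        return 4

    def __getitem__(self, i):
        anns = [MyAnnotation(category_no=j, bbox=(0.1 * j, 0.1, 0.2, 0.2)) for j in range(i + 1)]
        return {"image": torch.zeros(3, 8, 8), "annotations": anns}


loader = DataLoader(ToyScenes(), batch_size=4, collate_fn=collate_keep_annotation_lists)
batch = next(iter(loader))
print(batch["image"].shape)                     # torch.Size([4, 3, 8, 8])
print([len(a) for a in batch["annotations"]])   # [1, 2, 3, 4]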

scripts/make_scene_samples.py (new file, 198 additions)

import glob
import os
import sys
from itertools import product
from pathlib import Path
from typing import Literal, List, Optional, Tuple

import numpy as np
import torch
from omegaconf import OmegaConf
from pytorch_lightning import seed_everything
from torch import Tensor
from torchvision.utils import save_image
from tqdm import tqdm

from scripts.make_samples import get_parser, load_model_and_dset
from taming.data.conditional_builder.object_center_points_builder import CoordinatesCenterPointsConditionalBuilder
from taming.data.helper_types import BoundingBox, Annotation
from taming.data.annotated_objects_dataset import AnnotatedObjectsDataset
from taming.models.cond_transformer import Net2NetTransformer

seed_everything(42424242)
device: Literal['cuda', 'cpu'] = 'cuda'
first_stage_factor = 16
trained_on_res = 256


def _helper(coord: int, coord_max: int, coord_window: int) -> (int, int):
    assert 0 <= coord < coord_max
    coord_desired_center = (coord_window - 1) // 2
    return np.clip(coord - coord_desired_center, 0, coord_max - coord_window)


def get_crop_coordinates(x: int, y: int) -> BoundingBox:
    WIDTH, HEIGHT = desired_z_shape[1], desired_z_shape[0]
    x0 = _helper(x, WIDTH, first_stage_factor) / WIDTH
    y0 = _helper(y, HEIGHT, first_stage_factor) / HEIGHT
    w = first_stage_factor / WIDTH
    h = first_stage_factor / HEIGHT
    return x0, y0, w, h


def get_z_indices_crop_out(z_indices: Tensor, predict_x: int, predict_y: int) -> Tensor:
    WIDTH, HEIGHT = desired_z_shape[1], desired_z_shape[0]
    x0 = _helper(predict_x, WIDTH, first_stage_factor)
    y0 = _helper(predict_y, HEIGHT, first_stage_factor)
    no_images = z_indices.shape[0]
    cut_out_1 = z_indices[:, y0:predict_y, x0:x0+first_stage_factor].reshape((no_images, -1))
    cut_out_2 = z_indices[:, predict_y, x0:predict_x]
    return torch.cat((cut_out_1, cut_out_2), dim=1)


@torch.no_grad()
def sample(model: Net2NetTransformer, annotations: List[Annotation], dataset: AnnotatedObjectsDataset,
           conditional_builder: CoordinatesCenterPointsConditionalBuilder, no_samples: int,
           temperature: float, top_k: int) -> Tensor:
    x_max, y_max = desired_z_shape[1], desired_z_shape[0]

    annotations = [a._replace(category_no=dataset.get_category_number(a.category_id)) for a in annotations]

    recompute_conditional = any((desired_resolution[0] > trained_on_res, desired_resolution[1] > trained_on_res))
    if not recompute_conditional:
        crop_coordinates = get_crop_coordinates(0, 0)
        conditional_indices = conditional_builder.build(annotations, crop_coordinates)
        c_indices = conditional_indices.to(device).repeat(no_samples, 1)
        z_indices = torch.zeros((no_samples, 0), device=device).long()
        output_indices = model.sample(z_indices, c_indices, steps=x_max*y_max, temperature=temperature,
                                      sample=True, top_k=top_k)
    else:
        output_indices = torch.zeros((no_samples, y_max, x_max), device=device).long()
        for predict_y, predict_x in tqdm(product(range(y_max), range(x_max)), desc='sampling_image', total=x_max*y_max):
            crop_coordinates = get_crop_coordinates(predict_x, predict_y)
            z_indices = get_z_indices_crop_out(output_indices, predict_x, predict_y)
            conditional_indices = conditional_builder.build(annotations, crop_coordinates)
            c_indices = conditional_indices.to(device).repeat(no_samples, 1)
            new_index = model.sample(z_indices, c_indices, steps=1, temperature=temperature, sample=True, top_k=top_k)
            output_indices[:, predict_y, predict_x] = new_index[:, -1]
    z_shape = (
        no_samples,
        model.first_stage_model.quantize.e_dim,  # codebook embed_dim
        desired_z_shape[0],  # z_height
        desired_z_shape[1]   # z_width
    )
    x_sample = model.decode_to_img(output_indices, z_shape) * 0.5 + 0.5
    x_sample = x_sample.to('cpu')

    plotter = conditional_builder.plot
    figure_size = (x_sample.shape[2], x_sample.shape[3])
    scene_graph = conditional_builder.build(annotations, (0., 0., 1., 1.))
    plot = plotter(scene_graph, dataset.get_textual_label_for_category_no, figure_size)
    return torch.cat((x_sample, plot.unsqueeze(0)))


def get_resolution(resolution_str: str) -> (Tuple[int, int], Tuple[int, int]):
    if not resolution_str.count(',') == 1:
        raise ValueError("Give resolution as in 'height,width'")
    res_h, res_w = resolution_str.split(',')
    res_h = max(int(res_h), trained_on_res)
    res_w = max(int(res_w), trained_on_res)
    z_h = int(round(res_h/first_stage_factor))
    z_w = int(round(res_w/first_stage_factor))
    return (z_h, z_w), (z_h*first_stage_factor, z_w*first_stage_factor)


def add_arg_to_parser(parser):
    parser.add_argument(
        "-R",
        "--resolution",
        type=str,
        default='256,256',
        help=f"give resolution in multiples of {first_stage_factor}, default is '256,256'",
    )
    parser.add_argument(
        "-C",
        "--conditional",
        type=str,
        default='objects_bbox',
        help="objects_bbox or objects_center_points",
    )
    parser.add_argument(
        "-N",
        "--n_samples_per_layout",
        type=int,
        default=4,
        help="how many samples to generate per layout",
    )
    return parser


if __name__ == "__main__":
    sys.path.append(os.getcwd())

    parser = get_parser()
    parser = add_arg_to_parser(parser)

    opt, unknown = parser.parse_known_args()

    ckpt = None
    if opt.resume:
        if not os.path.exists(opt.resume):
            raise ValueError("Cannot find {}".format(opt.resume))
        if os.path.isfile(opt.resume):
            paths = opt.resume.split("/")
            try:
                idx = len(paths)-paths[::-1].index("logs")+1
            except ValueError:
                idx = -2  # take a guess: path/to/logdir/checkpoints/model.ckpt
            logdir = "/".join(paths[:idx])
            ckpt = opt.resume
        else:
            assert os.path.isdir(opt.resume), opt.resume
            logdir = opt.resume.rstrip("/")
            ckpt = os.path.join(logdir, "checkpoints", "last.ckpt")
        print(f"logdir: {logdir}")
        base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*-project.yaml")))
        opt.base = base_configs+opt.base

    if opt.config:
        if type(opt.config) == str:
            opt.base = [opt.config]
        else:
            opt.base = [opt.base[-1]]

    configs = [OmegaConf.load(cfg) for cfg in opt.base]
    cli = OmegaConf.from_dotlist(unknown)
    if opt.ignore_base_data:
        for config in configs:
            if hasattr(config, "data"):
                del config["data"]
    config = OmegaConf.merge(*configs, cli)
    desired_z_shape, desired_resolution = get_resolution(opt.resolution)
    conditional = opt.conditional

    print(ckpt)
    gpu = True
    eval_mode = True
    show_config = False
    if show_config:
        print(OmegaConf.to_container(config))

    dsets, model, global_step = load_model_and_dset(config, ckpt, gpu, eval_mode)
    print(f"Global step: {global_step}")

    data_loader = dsets.val_dataloader()
    print(dsets.datasets["validation"].conditional_builders)
    conditional_builder = dsets.datasets["validation"].conditional_builders[conditional]

    outdir = Path(opt.outdir).joinpath(f"{global_step:06}_{opt.top_k}_{opt.temperature}")
    outdir.mkdir(exist_ok=True, parents=True)
    print("Writing samples to ", outdir)

    p_bar_1 = tqdm(enumerate(iter(data_loader)), desc='batch', total=len(data_loader))
    for batch_no, batch in p_bar_1:
        save_img: Optional[Tensor] = None
        for i, annotations in tqdm(enumerate(batch['annotations']), desc='within_batch', total=data_loader.batch_size):
            imgs = sample(model, annotations, dsets.datasets["validation"], conditional_builder,
                          opt.n_samples_per_layout, opt.temperature, opt.top_k)
            save_image(imgs, outdir.joinpath(f'{batch_no:04}_{i:02}.png'), nrow=opt.n_samples_per_layout+1)
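How the beyond-training-resolution path works: when the requested output is larger than the 256 px the model was trained on, the script samples one latent index at a time; get_z_indices_crop_out cuts a first_stage_factor-wide window of already-generated indices around the target position, and get_crop_coordinates rebuilds the conditioning for the matching relative crop, so the transformer never sees more context than its 16x16 training grid (and block_size of 348) allows. A standalone sketch of the window arithmetic, with desired_z_shape hard-coded to an assumed example value for a 256x512 output:

import numpy as np

first_stage_factor = 16
desired_z_shape = (16, 32)  # (z_height, z_width) for an assumed 256x512 target

def _helper(coord, coord_max, coord_window):
    # top/left corner of a window of size coord_window that keeps `coord`
    # near its centre while staying inside [0, coord_max); int() only for clean printing
    coord_desired_center = (coord_window - 1) // 2
    return int(np.clip(coord - coord_desired_center, 0, coord_max - coord_window))

def get_crop_coordinates(x, y):
    width, height = desired_z_shape[1], desired_z_shape[0]
    x0 = _helper(x, width, first_stage_factor) / width
    y0 = _helper(y, height, first_stage_factor) / height
    return x0, y0, first_stage_factor / width, first_stage_factor / height

print(get_crop_coordinates(0, 0))   # (0.0, 0.0, 0.5, 1.0): window over the left half of the layout
print(get_crop_coordinates(20, 8))  # (0.40625, 0.0, 0.5, 1.0): window shifted right, centred near x=20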

taming/data/annotated_objects_coco.py (0 additions, 2 deletions)

@@ -12,14 +12,12 @@
 COCO_PATH_STRUCTURE = {
     'train': {
         'top_level': '',
-        'person_annotations': 'annotations/person_keypoints_train2017.json',
         'instances_annotations': 'annotations/instances_train2017.json',
         'stuff_annotations': 'annotations/stuff_train2017.json',
         'files': 'train2017'
     },
     'validation': {
         'top_level': '',
-        'person_annotations': 'annotations/person_keypoints_val2017.json',
         'instances_annotations': 'annotations/instances_val2017.json',
         'stuff_annotations': 'annotations/stuff_val2017.json',
         'files': 'val2017'

taming/data/annotated_objects_dataset.py (3 additions, 3 deletions)

@@ -99,7 +99,7 @@ def no_classes(self) -> int:
         return len(self.categories)
 
     @property
-    def conditional_builders(self):
+    def conditional_builders(self) -> ObjectsCenterPointsConditionalBuilder:
        # cannot set this up in init because no_classes is only known after loading data in init of superclass
         if self._conditional_builders is None:
             self._conditional_builders = {
@@ -109,15 +109,15 @@ def conditional_builders(self):
                     self.no_tokens,
                     self.encode_crop,
                     self.use_group_parameter,
-                    getattr(self, 'self.use_additional_parameters', False)
+                    getattr(self, 'use_additional_parameters', False)
                 ),
                 'objects_bbox': ObjectsBoundingBoxConditionalBuilder(
                     self.no_classes,
                     self.max_objects_per_image,
                     self.no_tokens,
                     self.encode_crop,
                     self.use_group_parameter,
-                    getattr(self, 'self.use_additional_parameters', False)
+                    getattr(self, 'use_additional_parameters', False)
                 )
             }
         return self._conditional_builders
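The getattr change is more than cosmetic: the old attribute name literally contained the 'self.' prefix, so it could never match a real attribute and the builders were always created with the default False. A minimal illustration:

class Example:
    use_additional_parameters = True

e = Example()
print(getattr(e, 'self.use_additional_parameters', False))  # False: no attribute has that literal name
print(getattr(e, 'use_additional_parameters', False))       # True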

taming/data/utils.py (57 additions, 2 deletions)

@@ -1,8 +1,15 @@
+import collections
 import os
-import numpy as np
+import tarfile
 import urllib
-import tarfile, zipfile
+import zipfile
 from pathlib import Path
+
+import numpy as np
+import torch
+from taming.data.helper_types import Annotation
+from torch._six import string_classes
+from torch.utils.data._utils.collate import np_str_obj_array_pattern, default_collate_err_msg_format
 from tqdm import tqdm
 
 
@@ -112,3 +119,51 @@ def quadratic_crop(x, bbox, alpha=1.0):
     xmin = int(center[0] - l / 2)
     ymin = int(center[1] - l / 2)
     return np.array(x[ymin : ymin + l, xmin : xmin + l, ...])
+
+
+def custom_collate(batch):
+    r"""source: pytorch 1.9.0, only one modification to original code """
+
+    elem = batch[0]
+    elem_type = type(elem)
+    if isinstance(elem, torch.Tensor):
+        out = None
+        if torch.utils.data.get_worker_info() is not None:
+            # If we're in a background process, concatenate directly into a
+            # shared memory tensor to avoid an extra copy
+            numel = sum([x.numel() for x in batch])
+            storage = elem.storage()._new_shared(numel)
+            out = elem.new(storage)
+        return torch.stack(batch, 0, out=out)
+    elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
+            and elem_type.__name__ != 'string_':
+        if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
+            # array of string classes and object
+            if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                raise TypeError(default_collate_err_msg_format.format(elem.dtype))
+
+            return custom_collate([torch.as_tensor(b) for b in batch])
+        elif elem.shape == ():  # scalars
+            return torch.as_tensor(batch)
+    elif isinstance(elem, float):
+        return torch.tensor(batch, dtype=torch.float64)
+    elif isinstance(elem, int):
+        return torch.tensor(batch)
+    elif isinstance(elem, string_classes):
+        return batch
+    elif isinstance(elem, collections.abc.Mapping):
+        return {key: custom_collate([d[key] for d in batch]) for key in elem}
+    elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
+        return elem_type(*(custom_collate(samples) for samples in zip(*batch)))
+    if isinstance(elem, collections.abc.Sequence) and isinstance(elem[0], Annotation):  # added
+        return batch  # added
+    elif isinstance(elem, collections.abc.Sequence):
+        # check to make sure that the elements in batch have consistent size
+        it = iter(batch)
+        elem_size = len(next(it))
+        if not all(len(elem) == elem_size for elem in it):
+            raise RuntimeError('each element in list of batch should be of equal size')
+        transposed = zip(*batch)
+        return [custom_collate(samples) for samples in transposed]
+
+    raise TypeError(default_collate_err_msg_format.format(elem_type))
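One portability note: custom_collate imports string_classes from torch._six, which matches the PyTorch 1.9 code it was copied from. As a purely defensive sketch (an assumption about later PyTorch releases, where torch._six was removed, not something this commit requires), the import could be guarded like this:

try:
    from torch._six import string_classes
except ImportError:
    # newer PyTorch dropped torch._six; (str, bytes) is what its default collate checks for
    string_classes = (str, bytes)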
