Эх сурвалжийг харах

refactored the way different annotation types are handled

Dimitri Korsch 5 жил өмнө
parent
commit
177127f1ce

+ 7 - 10
cvdatasets/__init__.py

@@ -1,13 +1,10 @@
-from .dataset import Dataset, ImageWrapperDataset
+from cvdatasets.dataset import Dataset
+from cvdatasets.dataset import ImageWrapperDataset
 
-from .annotations import BaseAnnotations
-from .annotations import CUB_Annotations
-from .annotations import NAB_Annotations
-from .annotations import CARS_Annotations
-from .annotations import INAT19_Annotations
-from .annotations import FLOWERS_Annotations
-from .annotations import HED_Annotations
-from .annotations import AnnotationType
+from cvdatasets.annotation import BaseAnnotations
+from cvdatasets.annotation.types import FileListAnnotations
+from cvdatasets.annotation.types import FolderAnnotations
+from cvdatasets.annotation.types import JSONAnnotations
 
-from .utils import _MetaInfo
+from cvdatasets.utils import _MetaInfo
 

+ 29 - 28
cvdatasets/annotation/__init__.py

@@ -1,37 +1,38 @@
-from cvdatasets.annotations.annotation_types import AnnotationType
-from cvdatasets.annotations.impl.birdsnap import BSNAP_Annotations
-from cvdatasets.annotations.impl.cars import CARS_Annotations
-from cvdatasets.annotations.impl.cub import CUB_Annotations
-from cvdatasets.annotations.impl.dogs import DOGS_Annotations
-from cvdatasets.annotations.impl.flowers import FLOWERS_Annotations
-from cvdatasets.annotations.impl.hed import HED_Annotations
-from cvdatasets.annotations.impl.imagenet import INET_Annotations
-from cvdatasets.annotations.impl.inat import INAT18_Annotations
-from cvdatasets.annotations.impl.inat import INAT19_Annotations
-from cvdatasets.annotations.impl.inat import INAT20_Annotations
-from cvdatasets.annotations.impl.nab import NAB_Annotations
-from cvdatasets.annotations.impl.tigers import TIGERS_Annotations
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.mixins.bbox_mixin import BBoxMixin
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
+from cvdatasets.annotation.types import AnnotationType
+
+# from cvdatasets.annotation.types.birdsnap import BSNAP_Annotations
+# from cvdatasets.annotation.types.cars import CARS_Annotations
+# from cvdatasets.annotation.types.cub import CUB_Annotations
+# from cvdatasets.annotation.types.dogs import DOGS_Annotations
+# from cvdatasets.annotation.types.flowers import FLOWERS_Annotations
+# from cvdatasets.annotation.types.hed import HED_Annotations
+# from cvdatasets.annotation.types.imagenet import INET_Annotations
+# from cvdatasets.annotation.types.inat import INAT18_Annotations
+# from cvdatasets.annotation.types.inat import INAT19_Annotations
+# from cvdatasets.annotation.types.inat import INAT20_Annotations
+# from cvdatasets.annotation.types.nab import NAB_Annotations
+# from cvdatasets.annotation.types.tigers import TIGERS_Annotations
 
 
-from cvdatasets.annotations.base import BaseAnnotations
-from cvdatasets.annotations.base.bbox_mixin import BBoxMixin
-from cvdatasets.annotations.base.parts_mixin import PartsMixin
 
 __all__ = [
 	"AnnotationType",
 	"BaseAnnotations",
 	"BBoxMixin",
-	"BSNAP_Annotations",
-	"CARS_Annotations",
-	"CUB_Annotations",
-	"DOGS_Annotations",
-	"FLOWERS_Annotations",
-	"HED_Annotations",
-	"INAT18_Annotations",
-	"INAT19_Annotations",
-	"INAT20_Annotations",
-	"INET_Annotations",
-	"NAB_Annotations",
+	# "BSNAP_Annotations",
+	# "CARS_Annotations",
+	# "CUB_Annotations",
+	# "DOGS_Annotations",
+	# "FLOWERS_Annotations",
+	# "HED_Annotations",
+	# "INAT18_Annotations",
+	# "INAT19_Annotations",
+	# "INAT20_Annotations",
+	# "INET_Annotations",
+	# "NAB_Annotations",
 	"PartsMixin",
-	"TIGERS_Annotations",
+	# "TIGERS_Annotations",
 ]

+ 151 - 5
cvdatasets/annotation/base.py

@@ -4,18 +4,165 @@ import numpy as np
 
 from collections import OrderedDict
 from collections import defaultdict
-from os.path import isdir
-from os.path import isfile
-from os.path import join
+from pathlib import Path
+from typing import Tuple
 
 from cvdatasets.dataset import Dataset
 from cvdatasets.utils import feature_file_name
-from cvdatasets.utils import read_info_file
 from cvdatasets.utils import pretty_print_dict
+from cvdatasets.utils import read_info_file
 from cvdatasets.utils.decorators import only_with_info
 
+
 class BaseAnnotations(abc.ABC):
 
+	def __init__(self, *, root_or_infofile, dataset_key=None, images_folder="images", **kwargs):
+		"""Locate the annotation root, read the annotation files and parse them.
+
+		Arguments (all keyword-only):
+			root_or_infofile: path to either the annotation directory itself
+				or to an info file from which that directory is derived.
+			dataset_key: key into ``info.DATASETS``; only consulted by
+				``dataset_info`` if the class has no ``name`` attribute.
+			images_folder: sub-folder of the root that ``image_path``
+				puts in front of the image names.
+			**kwargs: accepted, but not used inside this method.
+
+		Raises:
+			ValueError: if ``root_or_infofile`` is neither an existing
+				directory nor an existing file.
+		"""
+
+		self.dataset_key = dataset_key
+		self.images_folder = images_folder
+
+		root_or_infofile = Path(root_or_infofile)
+		if root_or_infofile.is_dir():
+			# a plain directory was given: there is no info file to consult
+			self.info = None
+			self.root = root_or_infofile
+
+		elif root_or_infofile.is_file():
+			# an info file was given: the root is composed from the data root,
+			# the dataset folder and the annotations entry of the dataset
+			self.info = read_info_file(root_or_infofile)
+			ds_info = self.dataset_info
+			self.root = self.data_root / ds_info.folder / ds_info.annotations
+
+		else:
+			msg = f"Root folder or info file does not exist: \"{root_or_infofile}\""
+			raise ValueError(msg)
+
+		# NOTE(review): asserts are stripped when python runs with -O, so this
+		# check is not guaranteed to run
+		assert self.root.is_dir(), \
+			f"Annotation directory does not exist: \"{self.root}\"!"
+
+		# reading and parsing is delegated to the abstract hooks of the subclass
+		self.files = self.read_annotation_files()
+		self.parse_annotations()
+
+	@property
+	@only_with_info
+	def data_root(self):
+		return Path(self.info.BASE_DIR) / self.info.DATA_DIR
+
+	@property
+	@only_with_info
+	def dataset_info(self):
+		key = getattr(self.__class__, "name", None)
+
+		if key is None:
+			key = self.dataset_key
+
+		if key not in self.info.DATASETS:
+			raise ValueError(f"Cannot find dataset with key \"{key}\"")
+
+		return self.info.DATASETS[key]
+
+	def parse_annotations(self):
+		logging.debug("Parsing read annotations (uuids, labels and train-test splits)")
+		self._parse_uuids()
+		self._parse_labels()
+		self._parse_split()
+
+	def __getitem__(self, uuid) -> Tuple[str, int]:
+		return self.image(uuid), self.label(uuid)
+
+	def image_path(self, image) -> str:
+		return str(self.root / self.images_folder / image)
+
+	def image(self, uuid) -> str:
+		fname = self.image_names[self.uuid_to_idx[uuid]]
+		return self.image_path(fname)
+
+	def label(self, uuid) -> int:
+		return self.labels[self.uuid_to_idx[uuid]].copy()
+
+	def parts(self, uuid) -> object:
+		return None
+
+	def bounding_box(self, uuid) -> object:
+		return None
+
+	def _uuids(self, split) -> np.ndarray:
+		return self.uuids[split]
+
+	@property
+	def train_uuids(self):
+		return self._uuids(self.train_split)
+
+	@property
+	def test_uuids(self):
+		return self._uuids(self.test_split)
+
+	def new_train_test_datasets(self, dataset_cls=Dataset, **kwargs):
+		"""Create the train and the test dataset in one call.
+
+		Arguments:
+			dataset_cls: class that is instantiated for each subset.
+			**kwargs: forwarded unchanged to ``new_dataset`` for both subsets
+				(they were accepted but silently dropped before).
+
+		Returns:
+			A generator yielding the "train" dataset first and the "test"
+			dataset second, so it can be unpacked as ``train, test = ...``.
+		"""
+		return (
+			self.new_dataset(subset, dataset_cls, **kwargs)
+				for subset in ["train", "test"]
+		)
+
+	def new_dataset(self, subset=None, dataset_cls=Dataset, **kwargs):
+		if subset is not None:
+			uuids = getattr(self, "{}_uuids".format(subset))
+		else:
+			uuids = self.uuids
+
+		kwargs = self.check_dataset_kwargs(subset, **kwargs)
+		return dataset_cls(uuids=uuids, annotations=self, **kwargs)
+
+	def check_dataset_kwargs(self, subset, **kwargs):
+		"""Derive keyword arguments for a dataset from the dataset info.
+
+		The values taken from the dataset info ("scales", "is_uniform" and,
+		if this object has a ``feature_model`` attribute, the path of an
+		existing features file) act as defaults; explicitly passed ``kwargs``
+		override them.
+
+		Arguments:
+			subset: name of the subset (e.g. "train" or "test") or None.
+			**kwargs: keyword arguments given by the caller.
+
+		Returns:
+			The merged keyword arguments, or ``kwargs`` unchanged if
+			``self.dataset_info`` is None.
+
+		Raises:
+			ValueError: if a feature model is set, but none of the candidate
+				feature files exists.
+		"""
+		dataset_info = self.dataset_info
+		if dataset_info is None:
+			return kwargs
+
+		logging.debug("Dataset info: {}".format(pretty_print_dict(dataset_info)))
+
+		# TODO: pass all scales
+		new_kwargs = {}
+
+		if "scales" in dataset_info:
+			new_kwargs["ratio"] = dataset_info.scales[0]
+
+		if "is_uniform" in dataset_info:
+			new_kwargs["uniform_parts"] = dataset_info.is_uniform
+
+		feature_model = getattr(self, "feature_model", None)
+		if None not in [subset, feature_model]:
+			# This class does not define FEATURE_PHONY itself, so fall back to
+			# the default subset aliases unless a subclass provides its own.
+			feature_phony = getattr(self, "FEATURE_PHONY",
+				dict(train=["train"], test=["test", "val"]))
+
+			# os.path.join / isfile are not imported in this module anymore,
+			# hence the paths are handled with pathlib.
+			features_dir = Path(self.root) / "features"
+
+			tried = []
+			model_info = self.info.MODELS[feature_model]
+			for subset_phony in feature_phony[subset]:
+				features = feature_file_name(subset_phony, dataset_info, model_info)
+				candidate = features_dir / features
+				# the path is handed over as a string, as it was before
+				feature_path = str(candidate)
+				if candidate.is_file():
+					break
+				tried.append(feature_path)
+			else:
+				# the loop ended without a break: no candidate exists
+				raise ValueError(
+					"Could not find any features in \"{}\" for {} subset. Tried features: {}".format(
+					features_dir, subset, tried))
+
+			logging.info("Using features file from \"{}\"".format(feature_path))
+			new_kwargs["features"] = feature_path
+
+		# explicitly given arguments win over the derived defaults
+		new_kwargs.update(kwargs)
+
+		logging.debug("Final kwargs: {}".format(pretty_print_dict(new_kwargs)))
+		return new_kwargs
+
+	@abc.abstractmethod
+	def read_annotation_files(self):
+		raise NotImplementedError
+
+	@abc.abstractmethod
+	def _parse_uuids(self):
+		raise NotImplementedError
+
+	@abc.abstractmethod
+	def _parse_labels(self):
+		raise NotImplementedError
+
+	@abc.abstractmethod
+	def _parse_split(self):
+		raise NotImplementedError
+
+
+class _BaseAnnotations(abc.ABC):
+
 	FEATURE_PHONY = dict(train=["train"], test=["test", "val"])
 
 	@classmethod
@@ -54,7 +201,6 @@ class BaseAnnotations(abc.ABC):
 
 		self.load()
 
-
 	@property
 	@only_with_info
 	def data_root(self):

+ 19 - 6
cvdatasets/annotation/files.py

@@ -1,4 +1,5 @@
-import abc
+# import abc
+import os
 import logging
 import simplejson as json
 import warnings
@@ -9,7 +10,7 @@ from typing import Callable
 from typing import Dict
 from typing import List
 
-class BaseAnnotationFiles(abc.ABC):
+class AnnotationFiles(object):
 
 	@staticmethod
 	def _parse_opts(fpath_and_opts):
@@ -22,9 +23,10 @@ class BaseAnnotationFiles(abc.ABC):
 
 
 	def __init__(self, *files, root=".", load_strict=False, **named_files):
-		super(BaseAnnotationFiles, self).__init__()
+		super(AnnotationFiles, self).__init__()
 		self.load_strict = load_strict
 		self.root = Path(root)
+		self._files = []
 
 		for fpath in files:
 			fpath, opts = self._parse_opts(fpath)
@@ -61,8 +63,18 @@ class BaseAnnotationFiles(abc.ABC):
 
 			return reader(f)
 
-	def read_directory(self, fpath):
-		raise NotImplementedError("IMPLEMENT ME!")
+	def read_directory(self, folder_path):
+		"""Recursively collect all files located below ``folder_path``.
+
+		Returns a list of ``pathlib.Path`` objects, one for every file that
+		``os.walk`` reports; each entry is the walked directory joined with
+		the file name. Directories themselves are not part of the result.
+		"""
+		logging.info(f"Loading files from folder \"{folder_path}\" ...")
+
+		# flatten the (directory, sub-folders, files) triples of os.walk into
+		# a single list of file paths
+		_content = [
+			Path(path) / file
+				for path, folders, files in os.walk(folder_path)
+					for file in files
+		]
+
+		logging.info(f"Found {len(_content):,d} files in \"{folder_path}\"")
+		return _content
+		# setattr(self, attr, _content)
 
 	def add_file_content(self, fpath, optional=False, *args, attr=None, **kwargs):
 		fpath = self._path(fpath)
@@ -84,10 +96,11 @@ class BaseAnnotationFiles(abc.ABC):
 		else:
 			logging.debug(f"\"{fpath}\" was not found and was ignored, since it was marked as optional")
 
+		self._files.append(attr)
 		setattr(self, attr, content)
 
 if __name__ == '__main__':
-	files = BaseAnnotationFiles(
+	files = AnnotationFiles(
 		"foo.txt",
 		tad="bar.txt",
 		bar=("fobar.txt", True),

+ 9 - 2
cvdatasets/annotation/mixins/__init__.py

@@ -1,3 +1,10 @@
+from cvdatasets.annotation.mixins.bbox_mixin import BBoxMixin
+from cvdatasets.annotation.mixins.features_mixin import FeaturesMixin
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
+
+__all__ = [
+	"BBoxMixin",
+	"FeaturesMixin",
+	"PartsMixin",
+]
 
-from cvdatasets.annotations.mixins.bbox_mixin import BBoxMixin
-from cvdatasets.annotations.mixins.parts_mixin import PartsMixin

+ 3 - 0
cvdatasets/annotation/mixins/features_mixin.py

@@ -0,0 +1,3 @@
+
+class FeaturesMixin(object):
+	pass

+ 0 - 68
cvdatasets/annotation/types.py

@@ -1,68 +0,0 @@
-from cvdatasets.annotations.impl.birdsnap import BSNAP_Annotations
-from cvdatasets.annotations.impl.cars import CARS_Annotations
-from cvdatasets.annotations.impl.cub import CUB_Annotations
-from cvdatasets.annotations.impl.dogs import DOGS_Annotations
-from cvdatasets.annotations.impl.flowers import FLOWERS_Annotations
-from cvdatasets.annotations.impl.hed import HED_Annotations
-from cvdatasets.annotations.impl.imagenet import INET_Annotations
-from cvdatasets.annotations.impl.inat import INAT18_Annotations
-from cvdatasets.annotations.impl.inat import INAT19_Annotations
-from cvdatasets.annotations.impl.inat import INAT20_Annotations
-from cvdatasets.annotations.impl.nab import NAB_Annotations
-from cvdatasets.annotations.impl.tigers import TIGERS_Annotations
-
-from cvargparse.utils import BaseChoiceType
-from functools import partial
-
-class AnnotationType(BaseChoiceType):
-	IMAGENET = INET_Annotations
-
-	CUB200 = CUB_Annotations
-	BIRDSNAP = BSNAP_Annotations
-	NAB = NAB_Annotations
-
-	CARS = CARS_Annotations
-	DOGS = DOGS_Annotations
-
-	FLOWERS = FLOWERS_Annotations
-
-	HED = HED_Annotations
-	TIGERS = TIGERS_Annotations
-
-	INAT18 = INAT18_Annotations
-	INAT19 = INAT19_Annotations
-	INAT20 = INAT20_Annotations
-
-	Default = CUB200
-
-	@classmethod
-	def phony(cls, key):
-		""" returns for a key a list of datasets,
-			that use the same annotation class """
-
-		return {
-			cls.CUB200 : [ "CUB200_2FOLD", "CUB200_GOOGLE", "CUB200_GOOGLE_SEM" ],
-			cls.TIGERS : [ "TIGERS_TEST" ],
-			cls.INAT19 : [ "INAT19_TEST", "INAT19_MINI" ],
-			cls.INAT20 : [ "INAT20_TEST",
-				"INAT20_IN_CLASS",
-				"INAT20_OUT_CLASS",
-				"INAT20_NOISY_IN_CLASS",
-				"INAT20_NOISY_OUT_CLASS",
-				"INAT20_U_IN_CLASS",
-				"INAT20_U_OUT_CLASS",
-			],
-			cls.IMAGENET : [ "IMAGENET_TOP_INAT20" ],
-		}.get(key, [])
-
-	@classmethod
-	def as_choices(cls, add_phony=True):
-		choices = super(AnnotationType, cls).as_choices()
-		if not add_phony:
-			return choices
-
-		for key in cls:
-			for phony in cls.phony(key):
-				choices[phony.lower()] = choices[key.name.lower()]
-
-		return choices

+ 66 - 0
cvdatasets/annotation/types/__init__.py

@@ -0,0 +1,66 @@
+from cvdatasets.annotation.types.file_list import FileListAnnotations
+from cvdatasets.annotation.types.folder_annotations import FolderAnnotations
+from cvdatasets.annotation.types.json_annotations import JSONAnnotations
+
+from cvargparse.utils import BaseChoiceType
+from cvargparse.utils.enumerations import MetaBaseType
+
+class AnnotationMetaType(MetaBaseType):
+	"""Metaclass whose item lookup also stores the requested key as ``name``
+	on the value of the looked-up member."""
+	def __getitem__(cls, key):
+		res = super(AnnotationMetaType, cls).__getitem__(key)
+		# NOTE(review): this writes to the value object itself, so the name is
+		# shared state and is replaced whenever the same member is looked up
+		# with a different key -- confirm that this is intended.
+		res.value.name = key
+		return res
+
+class AnnotationType(BaseChoiceType, metaclass=AnnotationMetaType):
+	FOLDER = FolderAnnotations
+	FILE_LIST = FileListAnnotations
+	JSON = JSONAnnotations
+
+	Default = FILE_LIST
+
+	@classmethod
+	def phony(cls, key):
+		""" returns for a key a list of datasets,
+			that use the same annotation class """
+
+		# Every row ends with a comma: adjacent string literals without a
+		# separating comma are concatenated into a single string, which would
+		# merge two dataset names into one unknown name.
+		return {
+			cls.FOLDER : [
+				"IMAGENET", "IMAGENET_TOP_INAT20",
+			],
+
+			cls.FILE_LIST : [
+				"CUB200", "CUB200_2FOLD", "CUB200_GOOGLE", "CUB200_GOOGLE_SEM",
+				"NAB", "BIRDSNAP",
+				"CARS", "DOGS", "FLOWERS",
+				"HED", "TIGERS", "TIGERS_TEST",
+			],
+
+			cls.JSON : [
+				"INAT18",
+				"INAT19", "INAT19_TEST", "INAT19_MINI",
+				"INAT20", "INAT20_TEST",
+				"INAT20_IN_CLASS",
+				"INAT20_OUT_CLASS",
+				"INAT20_NOISY_IN_CLASS",
+				"INAT20_NOISY_OUT_CLASS",
+				"INAT20_U_IN_CLASS",
+				"INAT20_U_OUT_CLASS",
+			],
+
+		}.get(key, [])
+
+	@classmethod
+	def as_choices(cls, add_phony=True):
+		choices = super(AnnotationType, cls).as_choices()
+		if not add_phony:
+			return choices
+
+		for key in cls:
+			for phony in cls.phony(key):
+				choices[phony.lower()] = choices[key.name.lower()]
+
+		return choices
+
+if __name__ == '__main__':
+	# as_choices is a classmethod: call it to print the choices mapping
+	# instead of the representation of the bound method
+	print(AnnotationType.as_choices())

+ 3 - 3
cvdatasets/annotations/birdsnap.py → cvdatasets/annotation/types/birdsnap.py

@@ -2,9 +2,9 @@ import numpy as np
 
 from os.path import join
 
-from cvdatasets.annotations.base import BaseAnnotations
-from cvdatasets.annotations.base.bbox_mixin import BBoxMixin
-from cvdatasets.annotations.base.parts_mixin import PartsMixin
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.mixins.bbox_mixin import BBoxMixin
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
 from cvdatasets.utils import _MetaInfo
 
 

+ 3 - 3
cvdatasets/annotations/cars.py → cvdatasets/annotation/types/cars.py

@@ -2,9 +2,9 @@ import numpy as np
 
 from os.path import join
 
-from cvdatasets.annotations.base import BaseAnnotations
-from cvdatasets.annotations.base.bbox_mixin import BBoxMixin
-from cvdatasets.annotations.base.parts_mixin import PartsMixin
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.mixins.bbox_mixin import BBoxMixin
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
 from cvdatasets.utils import _MetaInfo
 
 

+ 3 - 3
cvdatasets/annotations/cub.py → cvdatasets/annotation/types/cub.py

@@ -2,9 +2,9 @@ import numpy as np
 
 from os.path import join
 
-from cvdatasets.annotations.base import BaseAnnotations
-from cvdatasets.annotations.base.bbox_mixin import BBoxMixin
-from cvdatasets.annotations.base.parts_mixin import PartsMixin
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.mixins.bbox_mixin import BBoxMixin
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
 from cvdatasets.utils import _MetaInfo
 
 

+ 3 - 3
cvdatasets/annotations/dogs.py → cvdatasets/annotation/types/dogs.py

@@ -2,9 +2,9 @@ import numpy as np
 
 from os.path import join
 
-from cvdatasets.annotations.base import BaseAnnotations
-from cvdatasets.annotations.base.bbox_mixin import BBoxMixin
-from cvdatasets.annotations.base.parts_mixin import PartsMixin
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.mixins.bbox_mixin import BBoxMixin
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
 from cvdatasets.utils import _MetaInfo
 
 

+ 49 - 0
cvdatasets/annotation/types/file_list.py

@@ -0,0 +1,49 @@
+import numpy as np
+
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.files import AnnotationFiles
+
+class FileListAnnotations(BaseAnnotations):
+
+	def read_annotation_files(self) -> AnnotationFiles:
+		return AnnotationFiles(
+			"images.txt", "labels.txt", "tr_ID.txt",
+			root=self.root,
+			load_strict=True,
+		)
+
+	def _parse_uuids(self) -> None:
+		assert self.files.images is not None, \
+			"Images were not loaded!"
+		uuid_fnames = [i.split() for i in self.files.images]
+		self.uuids, self.image_names = map(np.array, zip(*uuid_fnames))
+		self.uuid_to_idx = {uuid: i for i, uuid in enumerate(self.uuids)}
+
+	def _parse_labels(self) -> None:
+		assert self.files.labels is not None, \
+			"Labels were not loaded!"
+		labs = list(map(int, self.files.labels))
+		self.labels = np.array(labs, dtype=np.int32)
+
+	def _parse_split(self) -> None:
+		assert self.files.tr_ID is not None, \
+			"Train-test split was not loaded!"
+		assert hasattr(self, "uuids"), \
+			"UUIDs were not parsed yet! Please call _parse_uuids before this method!"
+		uuid_to_split = {uuid: int(split) for uuid, split in zip(self.uuids, self.files.tr_ID)}
+		self.train_split = np.array([uuid_to_split[uuid] for uuid in self.uuids], dtype=bool)
+		self.test_split = np.logical_not(self.train_split)
+
+if __name__ == '__main__':
+	annot = FileListAnnotations(
+		root_or_infofile="/home/korsch_data/datasets/birds/cub200/ORIGINAL")
+
+	for i, uuid in enumerate(annot.uuids):
+		print(uuid, annot[uuid])
+
+		if i >= 10:
+			break
+
+	train, test = annot.new_train_test_datasets()
+
+	print(len(train), len(test))

+ 3 - 3
cvdatasets/annotations/flowers.py → cvdatasets/annotation/types/flowers.py

@@ -2,9 +2,9 @@ import numpy as np
 
 from os.path import join
 
-from cvdatasets.annotations.base import BaseAnnotations
-from cvdatasets.annotations.base.bbox_mixin import BBoxMixin
-from cvdatasets.annotations.base.parts_mixin import PartsMixin
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.mixins.bbox_mixin import BBoxMixin
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
 from cvdatasets.utils import _MetaInfo
 
 

+ 70 - 0
cvdatasets/annotation/types/folder_annotations.py

@@ -0,0 +1,70 @@
+import numpy as np
+
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.files import AnnotationFiles
+
+class FolderAnnotations(BaseAnnotations):
+
+	def read_annotation_files(self) -> AnnotationFiles:
+		return AnnotationFiles(
+			train_images="ILSVRC2012_img_train",
+			val_images="ILSVRC2012_img_val",
+			test_images=("ILSVRC2012_img_test", True),
+			root=self.root,
+			load_strict=True,
+		)
+
+	@property
+	def _has_test_set(self) -> bool:
+		return self.files.test_images is not None
+
+
+	def _parse_uuids(self) -> None:
+		"""Derive UUIDs and image names from the train and val image files.
+
+		The file name serves as UUID and the path relative to the root as
+		image name. The train entries come first, followed by the val
+		entries; ``_parse_split`` relies on this order. Test images are not
+		part of the UUIDs.
+		"""
+		# the image names are already relative to the root, hence no images
+		# folder may be put in front of them by image_path
+		self.images_folder = ""
+
+		fpaths = list(self.files.train_images) + list(self.files.val_images)
+		uuid_fnames = [(fpath.name, str(fpath.relative_to(self.root)))
+			for fpath in fpaths]
+
+		self.uuids, self.image_names = map(np.array, zip(*uuid_fnames))
+		self.uuid_to_idx = {uuid: i for i, uuid in enumerate(self.uuids)}
+
+
+	def _parse_labels(self) -> None:
+		train_labs = [fpath.parent.name for fpath in self.files.train_images]
+		val_labs = [fpath.parent.name for fpath in self.files.val_images]
+		labs = train_labs + val_labs
+
+		if self._has_test_set:
+			self.test_labels = [fpath.parent.name for fpath in self.files.test_images]
+
+		self._classes, self.labels = np.unique(labs, return_inverse=True)
+
+
+	def _parse_split(self) -> None:
+		self.train_split = np.ones(len(self.uuids), dtype=bool)
+		self.train_split[len(self.files.train_images):] = False
+
+		self.test_split = np.logical_not(self.train_split)
+
+
+if __name__ == '__main__':
+	annot = FolderAnnotations(
+		root_or_infofile="/home/korsch_data/datasets/ImageNet/TOP_INAT20")
+
+	for i, uuid in enumerate(annot.uuids):
+		print(uuid, annot[uuid])
+
+		if i >= 10:
+			break
+
+	train, test = annot.new_train_test_datasets()
+
+	print(len(train), len(test))

+ 3 - 3
cvdatasets/annotations/hed.py → cvdatasets/annotation/types/hed.py

@@ -3,9 +3,9 @@ import simplejson as json
 
 from os.path import join
 
-from cvdatasets.annotations.base import BaseAnnotations
-from cvdatasets.annotations.base.bbox_mixin import BBoxMixin
-from cvdatasets.annotations.base.parts_mixin import PartsMixin
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.mixins.bbox_mixin import BBoxMixin
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
 from cvdatasets.utils import _MetaInfo
 
 

+ 2 - 2
cvdatasets/annotations/imagenet.py → cvdatasets/annotation/types/imagenet.py

@@ -4,8 +4,8 @@ import logging
 
 from pathlib import Path
 
-from cvdatasets.annotations.base import BaseAnnotations
-from cvdatasets.annotations.base.parts_mixin import PartsMixin
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
 from cvdatasets.utils import _MetaInfo
 
 class INET_Annotations(PartsMixin, BaseAnnotations):

+ 3 - 4
cvdatasets/annotations/inat.py → cvdatasets/annotation/types/inat.py

@@ -4,12 +4,11 @@ import logging
 import numpy as np
 import simplejson as json
 
-from os.path import isfile
 from os.path import join
 
-from cvdatasets.annotations.base import BaseAnnotations
-from cvdatasets.annotations.base.bbox_mixin import BBoxMixin
-from cvdatasets.annotations.base.parts_mixin import PartsMixin
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.mixins.bbox_mixin import BBoxMixin
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
 from cvdatasets.utils import _MetaInfo
 
 

+ 98 - 0
cvdatasets/annotation/types/json_annotations.py

@@ -0,0 +1,98 @@
+import copy
+import hashlib
+import logging
+import numpy as np
+
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.files import AnnotationFiles
+
+def _uuid_check(uuids):
+	return len(np.unique(uuids)) == len(uuids)
+
+def _uuid_entry(im_info):
+	return hashlib.md5(im_info["file_name"].encode()).hexdigest()
+
+class JSONAnnotations(BaseAnnotations):
+
+	def read_annotation_files(self) -> AnnotationFiles:
+		return AnnotationFiles(
+			"trainval.json", "val.json",
+			("unlabeled_train.json", True),
+			root=self.root,
+			load_strict=True,
+		)
+
+	@property
+	def has_unlabeled_data(self) -> bool:
+		return self.files.unlabeled_train is not None
+
+	def _parse_uuids(self) -> None:
+
+		uuid_fnames = [(str(im["id"]), im["file_name"]) for im in self.files.trainval["images"]]
+		self.uuids, self.image_names = map(np.array, zip(*uuid_fnames))
+
+		assert _uuid_check(self.uuids) , \
+			"UUIDs are not unique!"
+
+		self.uuid_to_idx = {uuid: i for i, uuid in enumerate(self.uuids)}
+
+		if self.has_unlabeled_data:
+			logging.info("Loading unlabeled data...")
+			self._parse_unlabeled()
+		else:
+			logging.info("No unlabeled data was provided!")
+
+	def _parse_unlabeled(self) -> None:
+
+		uuid_fnames = [(_uuid_entry(im), im["file_name"]) for im in self.files.unlabeled_train["images"]]
+
+		self.unlabeled = unlabeled = copy.copy(self)
+
+		unlabeled.uuids, unlabeled.image_names = map(np.array, zip(*uuid_fnames))
+		unlabeled.labels = np.full(unlabeled.image_names.shape, -1, dtype=np.int32)
+		unlabeled.train_split = np.full(unlabeled.image_names.shape, 1, dtype=bool)
+		unlabeled.test_split = np.full(unlabeled.image_names.shape, 0, dtype=bool)
+
+		assert len(np.unique(unlabeled.uuids)) == len(unlabeled.uuids), \
+			"Unlabeled UUIDs are not unique!"
+
+		overlap = set(self.uuids) & set(unlabeled.uuids)
+		assert len(overlap) == 0, \
+			f"Unlabeled and labeled UUIDs overlap: {overlap}"
+
+		unlabeled.uuid_to_idx = {uuid: i for i, uuid in enumerate(unlabeled.uuids)}
+
+
+	def _parse_labels(self) -> None:
+		self.labels = np.zeros(len(self.uuids), dtype=np.int32)
+		labs = {str(annot["image_id"]): annot["category_id"]
+			for annot in self.files.trainval["annotations"]}
+
+		for uuid in self.uuids:
+			self.labels[self.uuid_to_idx[uuid]] = labs[uuid]
+
+
+	def _parse_split(self) -> None:
+		self.train_split = np.ones(len(self.uuids), dtype=bool)
+		val_uuids = [str(im["id"]) for im in self.files.val["images"]]
+		for v_uuid in val_uuids:
+			self.train_split[self.uuid_to_idx[v_uuid]] = False
+
+		self.test_split = np.logical_not(self.train_split)
+
+
+
+if __name__ == '__main__':
+	annot = JSONAnnotations(
+		root_or_infofile="/home/korsch_data/datasets/inat/2020/IN_CLASS")
+
+	for i, uuid in enumerate(annot.uuids):
+		print(uuid, annot[uuid])
+
+		if i >= 4:
+			break
+
+	train, test = annot.new_train_test_datasets()
+
+	print(len(train), len(test))
+

+ 3 - 3
cvdatasets/annotations/nab.py → cvdatasets/annotation/types/nab.py

@@ -2,9 +2,9 @@ import numpy as np
 
 from os.path import join
 
-from cvdatasets.annotations.base import BaseAnnotations
-from cvdatasets.annotations.base.bbox_mixin import BBoxMixin
-from cvdatasets.annotations.base.parts_mixin import PartsMixin
+from cvdatasets.annotation.base import BaseAnnotations
+from cvdatasets.annotation.mixins.bbox_mixin import BBoxMixin
+from cvdatasets.annotation.mixins.parts_mixin import PartsMixin
 from cvdatasets.utils import _MetaInfo
 
 

+ 1 - 1
cvdatasets/annotations/tigers.py → cvdatasets/annotation/types/tigers.py

@@ -5,7 +5,7 @@ from os.path import isfile
 from os.path import join
 from sklearn.model_selection import StratifiedShuffleSplit
 
-from cvdatasets.annotations.base import BaseAnnotations
+from cvdatasets.annotation.base import BaseAnnotations
 from cvdatasets.utils import _MetaInfo
 
 class TIGERS_Annotations(BaseAnnotations):

+ 0 - 0
cvdatasets/annotations/__init__.py


+ 1 - 2
scripts/display.py

@@ -9,7 +9,7 @@ import matplotlib.pyplot as plt
 
 from argparse import ArgumentParser
 
-from cvdatasets.annotations import AnnotationType
+from cvdatasets.annotation import AnnotationType
 from utils import parser, plot_crops
 
 def main(args):
@@ -17,7 +17,6 @@ def main(args):
 		f"AnnotationType is not known: \"{args.dataset}\""
 
 	annotation_cls = AnnotationType[args.dataset].value
-
 	logging.info(f"Loading \"{args.dataset}\" annnotations from \"{args.data}\"")
 	annot = annotation_cls(root_or_infofile=args.data, parts=args.parts, load_strict=False)
 

+ 9 - 0
scripts/info_files/info.yml

@@ -55,6 +55,11 @@ DATASETS:
     annotations: "BJOERN"
     n_classes: 1000
 
+  IMAGENET_TOP_INAT20: &inet_top_inat20
+    <<: *inet
+    annotations: "TOP_INAT20"
+    n_classes: 44
+
   CUB200:         &cub200
     folder: birds/cub200
     annotations: "ORIGINAL"
@@ -230,6 +235,10 @@ PARTS:
     <<: *inet
     <<: *parts_global
 
+  IMAGENET_TOP_INAT20_GLOBAL:
+    <<: *inet_top_inat20
+    <<: *parts_global
+
   CUB200_2FOLD_GLOBAL:
     <<: *cub200_2fold
     <<: *parts_global

+ 1 - 1
scripts/utils/parser.py

@@ -1,7 +1,7 @@
 import os
 from cvargparse import BaseParser, Arg
 
-from cvdatasets.annotations import AnnotationType
+from cvdatasets.annotation import AnnotationType
 
 from cvdatasets.utils import read_info_file
 

+ 2 - 2
tests/test_annotations.py

@@ -9,10 +9,10 @@ from os.path import *
 from abc import ABC, abstractproperty
 
 
-from cvdatasets import BaseAnnotations, _MetaInfo
+from cvdatasets import FileListAnnotations
 from cvdatasets.utils import read_info_file
 
-class MockAnnotation(BaseAnnotations):
+class MockAnnotation(FileListAnnotations):
 	name = "MOCK"
 	index_offset = 0