{ "cells": [ { "cell_type": "raw", "id": "c6cc5e1c", "metadata": {}, "source": [ "! cd .. && flask result export all && cd -" ] }, { "cell_type": "code", "execution_count": 1, "id": "1a32c09b", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import simplejson as json\n", "import tabulate\n", "import pyaml\n", "import yaml\n", "import re\n", "import shutil\n", "import abc\n", "import pandas as pd\n", "import typing as T\n", "import contextlib\n", "\n", "from functools import partial\n", "from collections import Counter\n", "from collections import defaultdict\n", "from uuid import uuid4\n", "\n", "from tqdm.auto import tqdm\n", "from PIL import Image\n", "from IPython.display import display\n", "from pathlib import Path\n", "from cvargparse import cvdataclass\n", "from munch import munchify\n", "from munch import Munch\n", "from matplotlib import pyplot as plt\n", "from matplotlib import scale\n", "from matplotlib import ticker\n", "from dataclasses import dataclass" ] }, { "cell_type": "code", "execution_count": 2, "id": "b5b6eac4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[\r\n", " {\r\n", " \"files\": [\r\n", " {\r\n", " \"collection_id\": null,\r\n", " \"created\": \"24. Nov. 2021 14:52:25\",\r\n", " \"extension\": \".JPG\",\r\n", " \"fps\": null,\r\n", " \"frames\": null,\r\n", " \"has_annotations\": true,\r\n", " \"id\": 3100,\r\n", " \"identifier\": 3100,\r\n", " \"name\": \"2020_08_08_Lichtfang_Hahnengrund_6729\",\r\n", " \"path\": \"/data/2020-08-08_Strommast-Hahnengrund/2020_08_08_Lichtfang_Hahnengrund_6729.JPG\",\r\n", " \"project_id\": 11,\r\n", " \"results\": [\r\n", " {\r\n", " \"data\": {\r\n", " \"h\": 0.09298528300130823,\r\n", " \"w\": 0.035294647278620625,\r\n", " \"x\": 0.10625534596857696,\r\n", " \"y\": 0.21368948247078465\r\n", " },\r\n", " \"file_id\": 3100,\r\n", " \"id\": 3630,\r\n", " \"identifier\": 3630,\r\n", " \"label\": {\r\n", " \"hierarchy_level\": null,\r\n", " \"id\": 5109,\r\n", " \"identifier\": 5109,\r\n", " \"name\": \"Eilema complana (10490)\",\r\n", " \"parent_id\": 5105,\r\n", " \"parent_reference\": \"genus_eilema\",\r\n", " \"project_id\": 11,\r\n", " \"reference\": \"10490\"\r\n", " },\r\n", " \"label_id\": 5109,\r\n", " \"origin\": \"user\",\r\n", " \"type\": \"bounding-box\"\r\n", " },\r\n" ] } ], "source": [ "!head -n40 ../output.json" ] }, { "cell_type": "code", "execution_count": 3, "id": "1a49765d", "metadata": {}, "outputs": [], "source": [ "@cvdataclass\n", "class Args:\n", " \n", " data_root: Path = Path(\"/home/AMMOD_data/moths/PyCS/external_data\")\n", " original_root: Path = Path(\"/data\")\n", " \n", " results_file: Path = Path(\"../output.json\")\n", " \n", " origin: T.Tuple = (\"user\",)\n", " \n", " def __post_init__(self):\n", " self.data_root = Path(self.data_root)\n", " self.original_root = Path(self.original_root)\n", " self.results_file = Path(self.results_file)\n", " " ] }, { "cell_type": "code", "execution_count": 4, "id": "c28d6372", "metadata": {}, "outputs": [], "source": [ "class BaseObject:\n", " serialize_attrs = ()\n", " \n", " def __init__(self, content):\n", " self._content = content\n", " \n", " @staticmethod\n", " def represent(dumper, obj):\n", " return dumper.represent_dict(obj.serialize())\n", " \n", " def serialize(self):\n", " return {\n", " type(self).__name__: {\n", " attr_name: getattr(self, attr_name) for attr_name in self.serialize_attrs\n", " }\n", " \n", " }\n", " \n", "pyaml.PrettyYAMLDumper.add_multi_representer(BaseObject, BaseObject.represent)\n", "pyaml.PrettyYAMLDumper.add_multi_representer(Munch, pyaml.PrettyYAMLDumper.represent_dict)" ] }, { "cell_type": "code", "execution_count": 5, "id": "fbb815a0", "metadata": {}, "outputs": [], "source": [ "class Label(BaseObject):\n", " \n", " objects = defaultdict(list)\n", " id2ref = {}\n", " \n", " serialize_attrs = BaseObject.serialize_attrs + \\\n", " (\"id\", \"parent\", \"ref\", \"name\", \"hierarchy\")\n", " \n", " def __init__(self, content: Munch):\n", " super().__init__(content)\n", " self.id = content.id\n", " self.parent_id = content.parent_id\n", " self.parent_reference = content.parent_reference\n", " self.ref = content.reference\n", " self.name = content.name\n", " self.hierarchy = content.hierarchy_level\n", " \n", " @property\n", " def parent(self):\n", " return Label.get(self.parent_id)\n", " \n", " @classmethod\n", " def clear(cls):\n", " cls.objects.clear()\n", " cls.id2ref.clear()\n", " \n", " @classmethod\n", " def get(cls, idx):\n", " if idx not in cls.id2ref:\n", " return None\n", " ref = cls.id2ref[idx]\n", " for obj in cls.objects[ref]:\n", " if obj.id == idx:\n", " return obj\n", " \n", " @staticmethod\n", " def __new__(cls, content: Munch):\n", " idx = content.id\n", " if idx in cls.id2ref:\n", " return cls.get(idx)\n", " \n", " obj = super().__new__(cls)\n", " cls.objects[content.reference].append(obj)\n", " cls.id2ref[idx] = content.reference\n", " return obj\n", " \n", " " ] }, { "cell_type": "code", "execution_count": 6, "id": "6c3a43fb", "metadata": {}, "outputs": [], "source": [ "class Result(BaseObject):\n", " serialize_attrs = BaseObject.serialize_attrs + \\\n", " (\"file\", \"label\", \"x\", \"y\", \"w\", \"h\")\n", " \n", " def __init__(self, content, file):\n", " super().__init__(content)\n", " assert content.type == \"bounding-box\", \\\n", " \"Currently, only bounding box results are supported!\"\n", " \n", " self.file = file\n", " self.x, self.y = content.data.x, content.data.y\n", " self.w, self.h = content.data.w, content.data.h\n", " self.label = content.label\n", " \n", " @property\n", " def label(self):\n", " return self._label\n", "\n", " @label.setter\n", " def label(self, value):\n", " if isinstance(value, dict):\n", " value = Label(value)\n", " \n", " self._label = value \n", " \n", " def show(self, size, *, ax: plt.Axes = None):\n", " im_h, im_w = size\n", " x = int(im_w * self.x)\n", " y = int(im_h * self.y)\n", " \n", " w = int(im_w * self.w)\n", " h = int(im_h * self.h)\n", " \n", " ax.add_patch(plt.Rectangle((x, y), w, h, fill=False, linewidth=2))\n", " \n", " @property\n", " def bbox(self):\n", " return self.x, self.y, self.x+self.w, self.y+self.h\n", "\n", " def crop(self, square: bool = False) -> Image:\n", " with self.file.image as im:\n", " x0, y0, x1, y1 = self.bbox\n", " w, h = im.size\n", " x0, x1 = int(x0*w), int(x1*w)\n", " y0, y1 = int(y0*h), int(y1*h) \n", " \n", " bbox = (x0, y0, x1, y1)\n", " if square:\n", " bbox = to_square(*bbox)\n", " \n", " return im.crop(bbox)\n", "\n", "def to_square(x0, y0, x1, y1):\n", " w, h = x1-x0, y1-y0\n", " max_wh = max(w, h)\n", " dw, dh = (max_wh - w) / 2, (max_wh - h) / 2\n", " x0, y0 = max(0, x0-dw), max(0, y0-dh)\n", " x1, y1 = x1+dw, y1+dh\n", " \n", " return int(x0), int(y0), int(x1), int(y1)\n", " " ] }, { "cell_type": "code", "execution_count": 7, "id": "0ce9bd9f", "metadata": {}, "outputs": [], "source": [ "class File(BaseObject):\n", " \n", " serialize_attrs = BaseObject.serialize_attrs + \\\n", " (\"id\", \"path\", \"project_id\")\n", " \n", " def __init__(self, args: Args, project, content):\n", " super().__init__(content)\n", " \n", " path = args.data_root / Path(content.path).relative_to(args.original_root)\n", " assert path.exists(), \\\n", " f\"Could not find {path}!\"\n", " \n", " self.id = content.id\n", " self.path = path\n", " self.project = project\n", " self.results = [Result(res, file=self) \n", " for res in content.results \n", " if res.type == \"bounding-box\" \\\n", " and res.origin in args.origin]\n", " \n", " @property\n", " def project_id(self):\n", " return self.project.id\n", " \n", " def get_results(self, with_labels=None) -> T.Iterator[Result]:\n", " \n", " for result in self.results:\n", " if with_labels is None or (with_labels == (result.label is not None)):\n", " yield result\n", " \n", " def count_results(self, with_labels=None):\n", " return len(list(self.get_results(with_labels=with_labels)))\n", " \n", " @property\n", " @contextlib.contextmanager\n", " def image(self):\n", " yield Image.open(self.path)\n", " \n", " def show(self, *, ax: plt.Axes = None):\n", " ax = ax or plt.gca()\n", "\n", " with self.image as _im:\n", " im = np.array(_im)\n", " \n", " ax.imshow(im)\n", " *size, c = im.shape\n", " for res in self.results:\n", " res.show(size, ax=ax)\n", " \n", " ax.axis(\"off\")\n", " " ] }, { "cell_type": "code", "execution_count": 8, "id": "97128fa1", "metadata": {}, "outputs": [], "source": [ "class Project(BaseObject):\n", " \n", " serialize_attrs = BaseObject.serialize_attrs + \\\n", " (\"id\",)\n", " \n", " def __init__(self, args: Args, content):\n", " super().__init__(content)\n", " \n", " self.id = content.project_id\n", " self.files = [File(args, self, file) for file in content.files]\n", " self.all_labels = {label.reference: Label(label) for label in content.labels}\n", " \n", " self.labels = {ref: lab for ref, lab in self.used_labels()}\n", " \n", " def count_results(self, with_labels=None):\n", " return sum([f.count_results(with_labels=with_labels) for f in self.files])\n", " \n", " def results(self, with_labels=None) -> T.Iterator[Result]:\n", " for file in self.files:\n", " yield from file.get_results(with_labels=with_labels)\n", " \n", " \n", " def used_labels(self) -> T.Iterator[Label]:\n", " for result in self.results(with_labels=True):\n", " ref = result.label.ref\n", " yield ref, self.all_labels[ref]\n", " \n", " " ] }, { "cell_type": "code", "execution_count": 9, "id": "9352addd", "metadata": {}, "outputs": [], "source": [ "def get_projects(args: Args, contents):\n", " print(f\"Found results for {len(contents)} projects\")\n", " return [Project(args, content) for content in contents]\n", "\n", "def show_counts(projects: T.List[Project]):\n", " \n", " rows = []\n", " labels = {}\n", " counts = [0, 0, 0, 0]\n", "\n", " for project in projects:\n", "\n", " n_results = project.count_results()\n", " n_results_w_labs = project.count_results(True)\n", " n_results_wo_labs = project.count_results(False)\n", " \n", " counts[0] += len(project.files)\n", " counts[1] += n_results\n", " counts[2] += n_results_w_labs\n", " counts[3] += n_results_wo_labs\n", " \n", " rows.append([\n", " project.id, \n", " len(project.files), \n", " n_results, \n", " n_results_w_labs, \n", " n_results_wo_labs,\n", " len(project.labels)\n", " ])\n", " \n", " labels.update(project.labels)\n", " \n", "\n", " rows.append([\"overall\"] + counts + [len(labels)])\n", " \n", " data_frame = pd.DataFrame(\n", " rows, columns=[\"Project ID\", \n", " \"# files\", \n", " \"# results\", \n", " \"# results \\\\w labels\", \n", " \"# results \\\\wo labels\",\n", " \"# used labels\"\n", " ],\n", " ) \n", " display(data_frame)\n", " \n", " print(f\"{'Family:':>10s} {len([key for key in labels.keys() if key.startswith('family')]): <5,d}\", )\n", " print(f\"{'Genus:':>10s} {len([key for key in labels.keys() if key.startswith('genus')]): <5,d}\", )\n", " print(f\"{'Species:':>10s} {len([key for key in labels.keys() if key.isalnum()]): <5,d}\", )\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "9f22d1ac", "metadata": {}, "outputs": [], "source": [ "def gather_labels(projects: T.List[Project]) -> T.Dict[str, Label]:\n", " labels = {}\n", " for project in projects:\n", " labels.update(project.labels)\n", " \n", " return labels\n", "\n", "def results_and_label_refs(projects: T.List[Project]) -> T.Tuple[T.List[Result], T.List[str]]:\n", " \n", " used_labels = set()\n", " results = []\n", " for project in projects:\n", " for result in project.results(with_labels=True):\n", " results.append(result)\n", " label = result.label\n", " used_labels.add(label.ref)\n", " \n", " while label.parent is not None:\n", " label = label.parent\n", " used_labels.add(label.ref)\n", " \n", " return results, sorted(used_labels)" ] }, { "cell_type": "code", "execution_count": 22, "id": "50ff2267", "metadata": {}, "outputs": [], "source": [ "def plot(counts, keys, lab2idx, lab2name, \n", " lab_filter = lambda lab: True, \n", " ax: plt.Axes = None,\n", " cmap = plt.cm.viridis\n", " ):\n", " \n", " ax = ax or plt.gca()\n", " \n", " n_projects, n_classes = counts.shape\n", " labeled = set()\n", " \n", " i = 0\n", " names = []\n", " for key, _ in keys:\n", " if not lab_filter(key): \n", " continue\n", " idx = lab2idx[key]\n", " names.append(lab2name[key])\n", " offset = 0\n", " for p, count in enumerate(counts[:, idx]):\n", " ax.bar(i, count, 0.8, \n", " bottom=offset, \n", " color=cmap(p / n_projects),\n", " label=f\"Project #{p+11}\" if p not in labeled else None,\n", " )\n", " labeled.add(p)\n", " offset += count\n", " \n", " i += 1\n", " \n", " \n", " scaler = scale.FuncScale(ax, (np.sqrt, np.square))\n", " scaler = scale.FuncScale(ax, (np.log1p, np.expm1))\n", " ax.set_yscale(scaler)\n", " \n", " if len(names) < 30:\n", " ax.set_xticks(np.arange(len(names)))\n", " ax.set_xticklabels(names, rotation=\"vertical\")\n", "\n", "def show_class_distribution(projects: T.List[Project]):\n", " \n", " all_labels = gather_labels(projects)\n", " lab2idx = {key: i for i, key in enumerate(all_labels.keys())}\n", " lab2name = {key: label.name for key, label in all_labels.items()}\n", " \n", " counts = np.zeros((len(projects), len(all_labels)), dtype=np.int32)\n", " total_counter = Counter()\n", " \n", " for i, project in enumerate(projects):\n", " counter = Counter([key for key, label in project.used_labels()])\n", " \n", " for key, count in counter.items():\n", " idx = lab2idx[key]\n", " counts[i, idx] += count\n", " \n", " total_counter += counter\n", " \n", " \n", " \n", " fig, axs = plt.subplots(2, 2, figsize=(12,12))\n", " \n", " for i, (name, func) in enumerate([\n", " (\"All\", lambda lab: True),\n", " (\"Family\", lambda lab: lab.startswith(\"family\")),\n", " (\"Genus\", lambda lab: lab.startswith(\"genus\")),\n", " (\"Species\", lambda lab: lab.isalnum()),\n", " ]):\n", " ax = axs[np.unravel_index(i, axs.shape)]\n", " ax.set_title(name)\n", " ax.set_xlabel(\"Classes\")\n", " ax.set_ylabel(\"# Annotations\")\n", " plot(counts, total_counter.most_common(), lab2idx, lab2name, \n", " lab_filter=func, ax=ax)\n", " ax.legend()\n", " \n", " plt.tight_layout()\n", " plt.show()\n", " plt.close()\n", " \n", " #print(total_counter)\n", " " ] }, { "cell_type": "code", "execution_count": 23, "id": "804454d2", "metadata": {}, "outputs": [], "source": [ "def show_examples(project: Project, n: int = 9):\n", " \n", " results = list(project.results(with_labels=True))\n", " idxs = np.random.choice(len(results), size=n, replace=False)\n", "\n", " nrows = int(np.ceil(np.sqrt(n)))\n", " ncols = int(np.ceil(n / nrows))\n", " fig, axs = plt.subplots(nrows, ncols, figsize=(16,16), squeeze=False)\n", " fig.suptitle(f\"Project #{project.id}\")\n", " \n", " [ax.axis(\"off\") for ax in axs.ravel()]\n", "\n", " for a, i in enumerate(idxs):\n", " res = results[i]\n", " ax = axs[np.unravel_index(a, axs.shape)]\n", "\n", " ax.imshow(res.crop(square=True))\n", " ax.set_title(res.label.name)\n", "\n", " plt.show()\n", " plt.close()" ] }, { "cell_type": "code", "execution_count": 24, "id": "25604ffe", "metadata": {}, "outputs": [], "source": [ "def show_size_distribution(projects: T.List[Project], bins=100):\n", " \n", " sizes = defaultdict(list)\n", " rel_sizes = [[], []]\n", " ratios = []\n", " areas = []\n", " \n", " for project in projects:\n", " for result in project.results():\n", " with result.file.image as im:\n", " w, h = im.size\n", " if 0 in (result.w, result.h):\n", " continue\n", " size = max(result.w * w, result.h * h)\n", " sizes[(w,h)].append(size)\n", " \n", " rel_sizes[0].append(result.w * 100)\n", " rel_sizes[1].append(result.h * 100)\n", " \n", " ratios.append(result.w / result.h)\n", " areas.append(result.w * result.h * 100)\n", " \n", " fig, axs = plt.subplots(2,2, figsize=(16,9), squeeze=False)\n", " \n", " ax0 = axs[0,0]\n", " ax1 = axs[0,1]\n", " ax2 = axs[1,0]\n", " ax3 = axs[1,1]\n", " \n", " \n", " ax0.set_ylabel(\"Density\")\n", " ax0.set_xlabel(\"Size in px\")\n", " \n", " ax1.set_ylabel(\"Density\")\n", " ax1.set_xlabel(\"% of the size\")\n", " \n", " ax2.set_ylabel(\"Density\")\n", " ax2.set_xlabel(\"Aspect ratios\")\n", " \n", " ax3.set_ylabel(\"Count\")\n", " ax3.set_xlabel(\"% of the area\")\n", " \n", " nbins=bins\n", " \n", " for (w, h), values in sizes.items():\n", " label = f\"${w}\\\\times{h}$ px ({len(values)} images)\"\n", " counts, bins, *_ = ax0.hist(values, \n", " label=label,\n", " alpha=0.4,\n", " density=True,\n", " bins=bins)\n", " ax0.legend()\n", " \n", " ax1.hist(rel_sizes[0], label=\"% of width\", alpha=0.4, density=True, bins=nbins)\n", " ax1.hist(rel_sizes[1], label=\"% of height\", alpha=0.4, density=True, bins=nbins)\n", " ax1.legend()\n", " \n", " bins = nbins\n", " counts, bins, *_ = ax2.hist(ratios, label=\"$\\\\dfrac{w}{h}$ ratio\", alpha=0.4, density=True, bins=bins)\n", " ax2.hist([1/r for r in ratios], label=\"$\\\\dfrac{h}{w}$ ratio\", alpha=0.4, density=True, bins=bins)\n", " ax2.legend()\n", " \n", " ax3.hist(areas, label=\"% of the area\", alpha=0.4, bins=30)\n", " ax3.set_yscale(\"log\")\n", " ax3.legend()\n", " \n", " plt.show()\n", " plt.close()" ] }, { "cell_type": "code", "execution_count": 25, "id": "a0f10bba", "metadata": {}, "outputs": [], "source": [ "def main(args: Args):\n", " Label.clear()\n", " \n", " with open(args.results_file) as f:\n", " contents = munchify(json.load(f))\n", " \n", " projects = get_projects(args, contents)\n", " show_counts(projects)\n", " \n", " show_class_distribution(projects)\n", " show_size_distribution(projects)\n", " \n", " for project in projects:\n", " show_examples(project, n=16)\n", "\n", " return projects" ] }, { "cell_type": "code", "execution_count": 26, "id": "aa12af98", "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Args:\n", " data_root: /home/AMMOD_data/moths/PyCS/external_data\n", " original_root: /data\n", " results_file: ../output.json\n", " origin:\n", " - user\n", "\n", "Found results for 10 projects\n" ] }, { "data": { "text/html": [ "
\n", " | Project ID | \n", "# files | \n", "# results | \n", "# results \\w labels | \n", "# results \\wo labels | \n", "# used labels | \n", "
---|---|---|---|---|---|---|
0 | \n", "11 | \n", "83 | \n", "2562 | \n", "206 | \n", "2356 | \n", "20 | \n", "
1 | \n", "12 | \n", "177 | \n", "1506 | \n", "402 | \n", "1104 | \n", "20 | \n", "
2 | \n", "13 | \n", "98 | \n", "1135 | \n", "803 | \n", "332 | \n", "28 | \n", "
3 | \n", "14 | \n", "133 | \n", "1814 | \n", "976 | \n", "838 | \n", "24 | \n", "
4 | \n", "15 | \n", "45 | \n", "552 | \n", "387 | \n", "165 | \n", "25 | \n", "
5 | \n", "16 | \n", "22 | \n", "525 | \n", "379 | \n", "146 | \n", "24 | \n", "
6 | \n", "17 | \n", "19 | \n", "155 | \n", "132 | \n", "23 | \n", "18 | \n", "
7 | \n", "18 | \n", "64 | \n", "167 | \n", "95 | \n", "72 | \n", "6 | \n", "
8 | \n", "19 | \n", "110 | \n", "446 | \n", "143 | \n", "303 | \n", "5 | \n", "
9 | \n", "20 | \n", "83 | \n", "292 | \n", "76 | \n", "216 | \n", "3 | \n", "
10 | \n", "overall | \n", "834 | \n", "9154 | \n", "3599 | \n", "5555 | \n", "110 | \n", "