123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- import re
- import numpy as np
- import pandas as pd
- import typing as T
- from pathlib import Path
- from munch import munchify
- from pycs import app
- from pycs.interfaces.LabelProvider import LabelProvider
class Provider(LabelProvider):
    """Label provider backed by a tab-separated species list.

    The file named by ``configuration["filename"]`` (relative to the root
    folder) is loaded with pandas. Rows are optionally filtered by rarity
    and by country flags, and then turned into a flat list of labels: one
    label per distinct hierarchy entry plus one label per remaining row.
    """

    # Column names assigned to the label file, in column order. They replace
    # the file's own header row (see ``header=0`` in ``get_labels``).
    names = [
        'is_local',
        'rarity',
        'super_family',
        'family',
        'sub_family',
        'tribe',
        'german',
        'swiss',
        'austrian',
        'kr_nr',
        'genus',
        'species',
        'authors',
        'comment',
        'remove_me',
        'changed',
        'version1_comment',
        'misc',  # 'D-CH-A / non-KR / Kaukasus',
        'german_name',
    ]

    # Per-column dtypes passed to ``pd.read_csv``.
    dtype = {
        'is_local': pd.CategoricalDtype(['nur lokal', 'tagaktiv']),
        'rarity': np.float32,
        'super_family': "category",
        'family': "category",
        'sub_family': "category",
        'tribe': "category",
        'german': pd.CategoricalDtype(['D', 'e', '?']),
        'swiss': pd.CategoricalDtype(['C', 'e', '?']),
        'austrian': pd.CategoricalDtype(['A', 'e', '?']),
        'kr_nr': "object",
        'genus': "category",
        'species': "category",
        'authors': "object",
        'comment': "object",
        'remove_me': "category",
        'changed': "object",
        'version1_comment': "object",
        'misc': "object",
        'german_name': str,
    }

    # A ``kr_nr`` value is used as label reference only if it starts with at
    # least one digit, dash or ASCII letter.
    KR_REGEX = re.compile(r"^[\d\-a-zA-Z]+")

    def __init__(self, root_folder: str, configuration: T.Dict):
        """Store the label file location and the filter settings.

        :param root_folder: folder the configured file name is relative to
        :param configuration: mapping providing the keys ``filename``,
            ``minimumRarity``, ``hierarchyLevels`` and ``onlyGerman``
        """
        config = munchify(configuration)
        self.root = Path(root_folder)
        self.label_file = self.root / config.filename
        self.min_rarity = config.minimumRarity
        self.hierarchy_levels = config.hierarchyLevels
        self.only_german = config.onlyGerman

    def close(self):
        """Do nothing; no resources are held between calls."""
        pass

    def get_labels(self) -> T.List[dict]:
        """Read the label file and build the list of labels.

        Rows are dropped when ``min_rarity`` is set and their rarity is below
        it, and, when ``only_german`` is set, unless one of the country
        columns carries its marker ('D', 'A' or 'C') and ``remove_me`` is
        empty.

        For every remaining row, one label per configured hierarchy level is
        emitted the first time its reference is seen, each pointing at the
        previous level as parent. The row's own label is then attached to
        the deepest available level. Empty hierarchy cells are skipped.

        :return: labels in creation order, each built by ``create_label``
            (not defined in this class; presumably inherited from
            ``LabelProvider``)
        """
        result = []

        # ``header=0`` together with ``names`` discards the header row of the
        # file and uses the column names defined on this class instead.
        lepi_list = pd.read_csv(
            self.label_file,
            names=self.names,
            dtype=self.dtype,
            sep="\t",
            header=0,
        )
        app.logger.info(f"Found {len(lepi_list)} labels in {self.label_file}")

        if self.min_rarity is not None:
            mask = lepi_list.rarity >= self.min_rarity
            lepi_list = lepi_list[mask]
            app.logger.info(f"Labels {len(lepi_list):,d} with {self.min_rarity=}")

        if self.only_german:
            in_region = (
                lepi_list.german.eq("D")
                | lepi_list.austrian.eq("A")
                | lepi_list.swiss.eq("C")
            )
            # Only rows without any ``remove_me`` marker are kept.
            mask = in_region & lepi_list["remove_me"].isna()
            lepi_list = lepi_list[mask]
            app.logger.info(f"Labels {len(lepi_list):,d} for german-speaking countries")

        parents = set()
        for _, entry in lepi_list.iterrows():
            parent_reference = None

            for level, level_name in self.hierarchy_levels:
                level_entry = entry[level]
                # pandas reads empty cells as NaN, never as None, so an
                # identity check against None would not skip them and the
                # ``.lower()`` call below would fail on a float.
                if pd.isna(level_entry):
                    continue

                reference, name = f'{level}_{level_entry.lower()}', level_entry

                # parents should be added once
                if reference not in parents:
                    result.append(
                        self.create_label(reference, name, parent_reference, level_name)
                    )
                    parents.add(reference)

                # The next (deeper) level and the row's own label hang below
                # this entry, whether or not it was newly created.
                parent_reference = reference

            # add label itself
            kr_nr = entry.kr_nr
            # An empty ``kr_nr`` cell is NaN (a float), which ``re`` rejects
            # with a TypeError; such rows take the name-based fallback.
            if isinstance(kr_nr, str) and self.KR_REGEX.match(kr_nr):
                name = f'{entry.genus} {entry.species} ({kr_nr})'
                reference = kr_nr
            else:
                name = f'{entry.genus} {entry.species}'
                reference = f'_{name.lower()}'

            result.append(self.create_label(reference, name, parent_reference))

        app.logger.info(f"Finally, provided {len(result):,d} labels")
        return result
|