Provider.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. import re
  2. import numpy as np
  3. import pandas as pd
  4. import typing as T
  5. from pathlib import Path
  6. from munch import munchify
  7. from pycs import app
  8. from pycs.interfaces.LabelProvider import LabelProvider
  class Provider(LabelProvider):
      """Label provider that turns a tab-separated table into hierarchical labels.

      The table is read with pandas using the column layout in ``names`` and
      the per-column types in ``dtype``. Rows can be filtered by a minimum
      ``rarity`` and by the ``german`` / ``austrian`` / ``swiss`` flag columns.
      For every remaining row, one label is created per configured hierarchy
      level (each distinct level value only once), followed by one label for
      the row itself.

      Labels are built with ``self.create_label`` which is not defined in this
      class -- presumably inherited from ``LabelProvider``; verify there.
      """

      # Column names assigned to the table, in file order. The file's own
      # header row is replaced by these names (see ``get_labels``).
      names = [
          'is_local',
          'rarity',
          'super_family',
          'family',
          'sub_family',
          'tribe',
          'german',
          'swiss',
          'austrian',
          'kr_nr',
          'genus',
          'species',
          'species_group',
          'authors',
          'comment',
          'remove_me',
          'changed',
          'version1_comment',
          'misc',  # 'D-CH-A / non-KR / Kaukasus',
          'german_name',
      ]

      # Column types handed to ``pandas.read_csv``; keys match ``names``.
      dtype = {
          'is_local': pd.CategoricalDtype(['nur lokal', 'tagaktiv']),
          'rarity': np.float32,
          'super_family': "category",
          'family': "category",
          'sub_family': "category",
          'tribe': "category",
          'german': pd.CategoricalDtype(['D', 'e', '?']),
          'swiss': pd.CategoricalDtype(['C', 'e', '?']),
          'austrian': pd.CategoricalDtype(['A', 'e', '?']),
          'kr_nr': "object",
          'genus': "category",
          'species': "category",
          'species_group': "category",
          'authors': "object",
          'comment': "object",
          'remove_me': "category",
          'changed': "object",
          'version1_comment': "object",
          'misc': "object",
          'german_name': str,
      }

      # A ``kr_nr`` value is used as label reference only if it starts with
      # at least one digit, ASCII letter or hyphen.
      KR_REGEX = re.compile(r"^[\d\-a-zA-Z]+")

      def __init__(self, root_folder: str, configuration: T.Dict):
          """Store the settings needed to read the label table.

          Args:
              root_folder: Directory that contains the label file.
              configuration: Mapping with the keys ``filename``,
                  ``minimumRarity``, ``hierarchyLevels`` and ``onlyGerman``.
                  It is wrapped with ``munchify`` so the keys can be read as
                  attributes.
          """
          config = munchify(configuration)

          self.root = Path(root_folder)
          self.label_file = self.root / config.filename
          self.min_rarity = config.minimumRarity
          self.hierarchy_levels = config.hierarchyLevels
          self.only_german = config.onlyGerman

      def close(self):
          """Do nothing; this provider keeps no open resources."""
          pass

      def get_labels(self) -> T.List[dict]:
          """Read the label file and build the list of labels.

          Steps:
              1. Read ``self.label_file`` as a tab-separated table.
              2. If ``self.min_rarity`` is not ``None``, keep only rows whose
                 ``rarity`` is at least that value.
              3. If ``self.only_german`` is truthy, keep only rows flagged
                 ``"D"``, ``"A"`` or ``"C"`` in the ``german``, ``austrian``
                 or ``swiss`` column whose ``remove_me`` cell is empty.
              4. For each row, create the labels of its hierarchy levels
                 (skipping empty levels and levels already created) and then
                 the label of the row itself, parented to the deepest
                 non-empty level of that row.

          Returns:
              The values returned by ``self.create_label``, in creation order.
          """
          result = []

          # ``header=0`` together with ``names`` discards the header row of
          # the file and uses the column names defined on this class instead.
          lepi_list = pd.read_csv(
              self.label_file,
              names=self.names,
              dtype=self.dtype,
              sep="\t",
              header=0,
          )
          app.logger.info(f"Found {len(lepi_list)} labels in {self.label_file}")

          if self.min_rarity is not None:
              mask = lepi_list.rarity >= self.min_rarity
              lepi_list = lepi_list[mask]
              app.logger.info(f"Labels {len(lepi_list):,d} with {self.min_rarity=}")

          if self.only_german:
              in_any_country = (
                  lepi_list.german.eq("D")
                  | lepi_list.austrian.eq("A")
                  | lepi_list.swiss.eq("C")
              )
              # Rows with any value in ``remove_me`` are dropped.
              mask = in_any_country & lepi_list["remove_me"].isna()
              lepi_list = lepi_list[mask]
              app.logger.info(f"Labels {len(lepi_list):,d} for german-speaking countries")

          # References of hierarchy labels that were already emitted.
          parents = set()

          for _, entry in lepi_list.iterrows():
              parent_reference = None

              # Each configured level is a pair: the column to read and the
              # name passed on to ``create_label`` as its fourth argument.
              for level, level_name in self.hierarchy_levels:
                  level_entry = entry[level]
                  # Empty cells are read as missing values; such a level is
                  # skipped and the previous level stays the parent.
                  if pd.isna(level_entry):
                      continue

                  reference, name = f'{level}_{level_entry.lower()}', level_entry

                  # parents should be added once
                  if reference not in parents:
                      result.append(
                          self.create_label(reference, name, parent_reference, level_name))
                      parents.add(reference)

                  parent_reference = reference

              # add label itself
              kr_nr = entry.kr_nr
              # An empty ``kr_nr`` cell is a float NaN, which ``re`` cannot
              # match against; treat it like a non-matching number and fall
              # back to the genus/species based reference.
              if isinstance(kr_nr, str) and self.KR_REGEX.match(kr_nr):
                  name = f'{entry.genus} {entry.species} ({kr_nr})'
                  reference = kr_nr
              else:
                  name = f'{entry.genus} {entry.species}'
                  reference = f'_{name.lower()}'

              result.append(self.create_label(reference, name, parent_reference))

          app.logger.info(f"Finally, provided {len(result):,d} labels")
          return result