Provider.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. import re
  2. import numpy as np
  3. import pandas as pd
  4. import typing as T
  5. from pathlib import Path
  6. from munch import munchify
  7. from pycs import app
  8. from pycs.interfaces.LabelProvider import LabelProvider
  9. class Provider(LabelProvider):
  10. names = [
  11. 'is_local',
  12. 'rarity',
  13. 'super_family',
  14. 'family',
  15. 'sub_family',
  16. 'tribe',
  17. 'german',
  18. 'swiss',
  19. 'austrian',
  20. 'kr_nr',
  21. 'genus',
  22. 'species',
  23. 'authors',
  24. 'comment',
  25. 'remove_me',
  26. 'changed',
  27. 'version1_comment',
  28. 'misc', # 'D-CH-A / non-KR / Kaukasus',
  29. 'german_name',
  30. ]
  31. dtype = {
  32. 'is_local': pd.CategoricalDtype(['nur lokal', 'tagaktiv']),
  33. 'rarity': np.float32,
  34. 'super_family': "category",
  35. 'family': "category",
  36. 'sub_family': "category",
  37. 'tribe': "category",
  38. 'german': pd.CategoricalDtype(['D', 'e', '?']),
  39. 'swiss': pd.CategoricalDtype(['C', 'e', '?']),
  40. 'austrian': pd.CategoricalDtype(['A', 'e', '?']),
  41. 'kr_nr': "object",
  42. 'genus': "category",
  43. 'species': "category",
  44. 'authors': "object",
  45. 'comment': "object",
  46. 'remove_me': "category",
  47. 'changed': "object",
  48. 'version1_comment': "object",
  49. 'misc': "object",
  50. 'german_name': str,
  51. }
  52. KR_REGEX = re.compile(r"^[\d\-a-zA-Z]+")
  53. def __init__(self, root_folder: str, configuration: T.Dict):
  54. config = munchify(configuration)
  55. self.root = Path(root_folder)
  56. self.label_file = self.root / config.filename
  57. self.min_rarity = config.minimumRarity
  58. self.hierarchy_levels = config.hierarchyLevels
  59. self.only_german = config.onlyGerman
  60. def close(self):
  61. pass
  62. def get_labels(self) -> T.List[dict]:
  63. result = []
  64. lepi_list = pd.read_csv(self.label_file,
  65. names=self.names,
  66. dtype=self.dtype,
  67. sep="\t", header=0
  68. )
  69. app.logger.info(f"Found {len(lepi_list)} labels in {self.label_file}")
  70. if self.min_rarity is not None:
  71. mask = lepi_list.rarity >= self.min_rarity
  72. lepi_list = lepi_list[mask]
  73. app.logger.info(f"Labels {len(lepi_list):,d} with {self.min_rarity=}")
  74. if self.only_german:
  75. mask = (
  76. lepi_list.german.eq("D") |
  77. lepi_list.austrian.eq("A") |
  78. lepi_list.swiss.eq("C")
  79. ) & \
  80. lepi_list["remove_me"].isin([np.nan])
  81. lepi_list = lepi_list[mask]
  82. app.logger.info(f"Labels {len(lepi_list):,d} for german-speaking countries")
  83. parents = set()
  84. for i, entry in lepi_list.iterrows():
  85. parent_reference = None
  86. for level, level_name in self.hierarchy_levels:
  87. level_entry = entry[level]
  88. if level_entry is None:
  89. continue
  90. reference, name = f'{level}_{level_entry.lower()}', level_entry
  91. # parents should be added once
  92. if reference not in parents:
  93. result.append(self.create_label(reference, name, parent_reference, level_name))
  94. parents.add(reference)
  95. parent_reference = reference
  96. # add label itself
  97. if self.KR_REGEX.match(entry.kr_nr):
  98. name = f'{entry.genus} {entry.species} ({entry.kr_nr})'
  99. reference = entry.kr_nr
  100. else:
  101. name = f'{entry.genus} {entry.species}'
  102. reference = f'_{name.lower()}'
  103. result.append(self.create_label(reference, name, parent_reference))
  104. app.logger.info(f"Finally, provided {len(result):,d} labels")
  105. return result