remove_duplicates.py

#!/usr/bin/env python
if __name__ != '__main__': raise Exception("Do not import me!")

import os
import numpy as np
import re
import hashlib

from cvargparse import BaseParser, Arg
from os.path import isfile, join
from tqdm import tqdm
from collections import defaultdict

from utils import parser
from utils import imaging


def remove_duplicates(fpaths):
    """Keeps the first path and deletes every remaining file in the list."""
    assert len(fpaths) >= 2, f"There should be at least two paths, but were {len(fpaths)}!"
    for fpath in fpaths[1:]:
        os.remove(fpath)


def main(args):
    # Captures the part of the file name between a leading numeric ID and the
    # file extension; this captured group serves as the grouping key below.
    fname_regex = re.compile(r"^\d+.(.+)\..*$")

    content = imaging.get_content(args.folder, args.extensions)

    for root, fnames in tqdm(content):
        names = [(name, fname_regex.match(name).group(1)) for name in fnames]

        # Count how often each grouping key occurs and remember the
        # file names that belong to it.
        counts = defaultdict(int)
        name_to_fname = defaultdict(list)

        for fname, name in names:
            counts[name] += 1
            name_to_fname[name].append(fname)

        for name, count in counts.items():
            # A key that occurs only once cannot have duplicates.
            if count == 1: continue

            # Group the candidate files by their MD5 digest.
            md5sums = defaultdict(list)
            md5counts = defaultdict(int)

            for fname in name_to_fname[name]:
                fpath = join(root, fname)
                assert isfile(fpath), f"Could not find {fpath}"

                with open(fpath, "rb") as f:
                    md5sum = hashlib.md5(f.read()).hexdigest()

                md5sums[md5sum].append(fpath)
                md5counts[md5sum] += 1

            # Files that share both the grouping key and the MD5 digest are
            # duplicates; keep one copy of each and delete the rest.
            for md5sum, count in md5counts.items():
                if count == 1: continue
                remove_duplicates(md5sums[md5sum])


main(parser.parse_args())
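
# Illustration of the grouping logic (hypothetical file names, not taken from
# the source): "0001_red_fox.jpg" and "0042_red_fox.jpg" both match the regex
# with the key "red_fox"; if their MD5 digests are also identical, the file
# listed second would be removed and the first one kept.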