#!/usr/bin/env python
if __name__ != '__main__': raise Exception("Do not import me!")

import hashlib
import os
import re

from collections import defaultdict
from os.path import isfile, join
from tqdm import tqdm

from utils import imaging
from utils import parser
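
# NOTE: `utils.parser` and `utils.imaging` are project-local modules that are
# not shown in this file. From their usage below, `parser.parse_args()` is
# assumed to return a namespace with `folder` and `extensions` attributes, and
# `imaging.get_content(folder, extensions)` is assumed to yield
# (root, filenames) pairs, roughly like:
#
#     def get_content(folder, extensions):
#         for root, _, fnames in os.walk(folder):
#             fnames = [f for f in fnames if f.endswith(tuple(extensions))]
#             if fnames:
#                 yield root, fnames
#
# This is a minimal sketch of the assumed contract, not the actual
# implementation.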

def remove_duplicates(fpaths):
    """Keep the first file and remove all remaining duplicates."""
    assert len(fpaths) >= 2, f"There should be at least two paths, but there were {len(fpaths)}!"
    for fpath in fpaths[1:]:
        os.remove(fpath)

def main(args):
    # File names are expected to look like "<index>.<name>.<extension>";
    # the captured group is the name without the leading index.
    # (The dot after \d+ was unescaped in the original and matched any
    # character; it is escaped here to match a literal dot.)
    fname_regex = re.compile(r"^\d+\.(.+)\..*$")

    content = imaging.get_content(args.folder, args.extensions)

    for root, fnames in tqdm(content):
        names = [(fname, fname_regex.match(fname).group(1)) for fname in fnames]

        # Count how often each name occurs and remember the matching file names.
        counts = defaultdict(int)
        name_to_fname = defaultdict(list)
        for fname, name in names:
            counts[name] += 1
            name_to_fname[name].append(fname)

        for name, count in counts.items():
            if count == 1: continue

            # Same name, possibly different content: group the files by their
            # MD5 checksum and count the files per checksum.
            md5sums = defaultdict(list)
            md5counts = defaultdict(int)
            for fname in name_to_fname[name]:
                fpath = join(root, fname)
                assert isfile(fpath), f"Could not find {fpath}"
                with open(fpath, "rb") as f:
                    md5sum = hashlib.md5(f.read()).hexdigest()
                md5sums[md5sum].append(fpath)
                md5counts[md5sum] += 1

            # Files with the same checksum have identical content and are
            # true duplicates; keep one copy and delete the rest.
            for md5sum, count in md5counts.items():
                if count == 1: continue
                remove_duplicates(md5sums[md5sum])


main(parser.parse_args())
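
# Example invocation (the flag names depend on `utils.parser` and are assumed
# here for illustration only):
#
#     python deduplicate.py /path/to/images --extensions .jpg .jpeg .png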