#!/usr/bin/env python
"""Delete byte-identical duplicate files that share the same base name.

Within every folder yielded by ``imaging.get_content`` the file names are
grouped by the part captured by the file-name pattern (see ``main``).  Files
of one group that have the same MD5 checksum are considered duplicates; the
first one is kept and the others are removed from disk.
"""
if __name__ != '__main__': raise Exception("Do not import me!")

import os
import numpy as np
import re
import hashlib

from cvargparse import BaseParser, Arg
from os.path import isfile, join
from tqdm import tqdm
from collections import defaultdict

from utils import parser
from utils import imaging


def remove_duplicates(fpaths):
    """Remove every file in ``fpaths`` except the first one.

    Args:
        fpaths: sequence of at least two file paths with identical content.

    Raises:
        AssertionError: if fewer than two paths are given.
        OSError: if ``os.remove`` fails for one of the paths.
    """
    assert len(fpaths) >= 2, f"There should be at least two paths, but were {len(fpaths)}!"

    # The first path is the copy that is kept.
    for fpath in fpaths[1:]:
        os.remove(fpath)


def _md5sum(fpath, chunk_size=1 << 20):
    """Return the hex MD5 digest of the file at ``fpath``.

    The file is read in chunks of ``chunk_size`` bytes so that large files
    do not have to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(fpath, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def main(args):
    """Find and delete duplicate files below ``args.folder``.

    Args:
        args: parsed command-line arguments; ``args.folder`` and
            ``args.extensions`` are forwarded to ``imaging.get_content``.
    """
    # Expected file name layout: "<digits><any single char><name>.<rest>".
    # Group 1 (<name>) is the key used to find duplicate candidates.
    # NOTE(review): the "." after "\d+" is unescaped and therefore matches
    # any character, not only a literal dot - confirm this is intended.
    fname_regex = re.compile(r"^\d+.(.+)\..*$")

    # presumably yields (directory, file names) pairs; verify against
    # utils.imaging.get_content
    content = imaging.get_content(args.folder, args.extensions)

    for root, fnames in tqdm(content):
        # Group the file names of this folder by their captured base name.
        name_to_fname = defaultdict(list)
        for fname in fnames:
            match = fname_regex.match(fname)
            if match is None:
                # Names that do not follow the pattern cannot be grouped;
                # skip them instead of crashing on ``None.group``.
                continue
            name_to_fname[match.group(1)].append(fname)

        for candidates in name_to_fname.values():
            if len(candidates) < 2:
                continue

            # Same base name: compare the actual content via checksums.
            md5sums = defaultdict(list)
            for fname in candidates:
                fpath = join(root, fname)
                assert isfile(fpath), f"Could not find {fpath}"
                md5sums[_md5sum(fpath)].append(fpath)

            for fpaths in md5sums.values():
                if len(fpaths) >= 2:
                    remove_duplicates(fpaths)


main(parser.parse_args())