I have tried to rewrite in Python a program I had previously written in C++. This is the result:
Code:
import filecmp
import os
import sys
from pathlib import Path
from timeit import default_timer as timer


def is_duplicate_file(file_a, file_b):
    # shallow=False forces a byte-for-byte content comparison.
    return filecmp.cmp(file_a, file_b, shallow=False)


def recursive_directory_iterator(path):
    # Yield all non-directory entries below path, without following symlinks.
    for entry in os.scandir(path):
        if entry.is_dir(follow_symlinks=False):
            yield from recursive_directory_iterator(entry.path)
        else:
            yield entry


def build_file_index(start_path):
    # Group regular files by size; only files of equal size can be duplicates.
    files_by_size = {}
    for entry in recursive_directory_iterator(start_path):
        if entry.is_file():
            files_by_size.setdefault(entry.stat().st_size, []).append(entry)
    return files_by_size


def find_duplicate_files(files):
    dupes = {}
    for size, paths in files.items():
        if len(paths) < 2:
            continue
        matched = [False] * len(paths)
        #print(f'Potential duplicates for size {size} bytes: {paths}')
        for idx_i, i in enumerate(paths):
            for idx_j, j in enumerate(paths[idx_i + 1:], start=idx_i + 1):
                #print(f'Checking [{idx_i}] = {i} and [{idx_j}] = {j}')
                if not matched[idx_i] and not matched[idx_j] and is_duplicate_file(i, j):
                    dupes.setdefault(i.path, []).append(j.path)
                    matched[idx_j] = True
    return dupes


def print_dupes(dupes):
    for a, b in dupes.items():
        print(f'\nReference: {a}')
        for dup in b:
            print(f'Duplicate: {dup}')
    print(f'\nFound a total of {len(dupes)} duplicate groups.\n')


def main():
    if len(sys.argv) != 2:
        print("Program expects one argument: <starting path>")
        return 1
    start_path = Path(sys.argv[1])
    if not start_path.exists():
        print("Provided argument is not an existing path.")
        return 2
    start = timer()
    files = build_file_index(start_path)
    dupes = find_duplicate_files(files)
    print_dupes(dupes)
    stop = timer()
    print(f'\nDuration: {1000 * (stop - start):.6f} ms')
    return 0


if __name__ == '__main__':
    sys.exit(main())
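The script is invoked with the starting directory as its only argument; assuming it was saved as find_dupes.py (the filename is just an example):

Code:

python3 find_dupes.py /path/to/directory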
I have deliberately left out more advanced techniques such as analysing the type of storage medium and multithreading.
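For reference, a minimal sketch of what the omitted multithreading could look like, using ThreadPoolExecutor from the standard library's concurrent.futures and reusing is_duplicate_file from above; the function names are illustrative, and this is only a sketch under the assumption that the size groups can be compared independently:

Code:

from concurrent.futures import ThreadPoolExecutor


def compare_group(paths):
    # Same pairwise logic as find_duplicate_files, restricted to one size group.
    group_dupes = {}
    matched = [False] * len(paths)
    for idx_i, i in enumerate(paths):
        for idx_j, j in enumerate(paths[idx_i + 1:], start=idx_i + 1):
            if not matched[idx_i] and not matched[idx_j] and is_duplicate_file(i, j):
                group_dupes.setdefault(i.path, []).append(j.path)
                matched[idx_j] = True
    return group_dupes


def find_duplicate_files_parallel(files_by_size):
    # Size groups are independent of each other, so they can be compared in
    # parallel. Threads fit here because the work is I/O-bound (reading files).
    dupes = {}
    groups = [paths for paths in files_by_size.values() if len(paths) > 1]
    with ThreadPoolExecutor() as pool:
        for group_dupes in pool.map(compare_group, groups):
            dupes.update(group_dupes)
    return dupes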