Fixed hang with gifs in duplicates form
This commit is contained in:
@@ -24,11 +24,14 @@ from constants import (
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Result structure for duplicate detection
|
||||
DuplicateResult = collections.namedtuple('DuplicateResult', ['path1', 'path2', 'hash_value', 'is_exception', 'similarity', 'timestamp'])
|
||||
DuplicateResult = collections.namedtuple(
|
||||
'DuplicateResult',
|
||||
['path1', 'path2', 'hash_value', 'is_exception', 'similarity', 'timestamp'])
|
||||
|
||||
|
||||
class BKTree:
|
||||
"""A Burkhard-Keller tree for efficient similarity searching using Hamming distance."""
|
||||
"""A Burkhard-Keller tree for efficient similarity searching using Hamming
|
||||
distance."""
|
||||
def __init__(self, distance_func):
|
||||
self.distance_func = distance_func
|
||||
self.tree = None
|
||||
@@ -210,7 +213,8 @@ class DuplicateCache(QObject):
|
||||
return None, 0, None
|
||||
|
||||
with QWriteLocker(self._hash_cache_lock):
|
||||
self._hash_cache[(dev_id, inode_key_bytes)] = (hash_str, mtime, path_str)
|
||||
self._hash_cache[(dev_id, inode_key_bytes)] = (
|
||||
hash_str, mtime, path_str)
|
||||
return hash_str, mtime, path_str
|
||||
return None, 0, None
|
||||
|
||||
@@ -225,7 +229,8 @@ class DuplicateCache(QObject):
|
||||
return hash_value
|
||||
return None
|
||||
|
||||
def add_hash_for_path(self, path, hash_value, mtime, dev_id=None, inode_key_bytes=None):
|
||||
def add_hash_for_path(self,
|
||||
path, hash_value, mtime, dev_id=None, inode_key_bytes=None):
|
||||
if dev_id is None or inode_key_bytes is None:
|
||||
dev_id, inode_key_bytes = self._get_inode_info(path)
|
||||
if not inode_key_bytes or not self._lmdb_env:
|
||||
@@ -264,8 +269,10 @@ class DuplicateCache(QObject):
|
||||
|
||||
# Also remove any exceptions involving this path
|
||||
if clear_relationships:
|
||||
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._exceptions_db)
|
||||
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._pending_db)
|
||||
self._remove_pair_entries_for_path(
|
||||
dev_id, inode_key_bytes, self._exceptions_db)
|
||||
self._remove_pair_entries_for_path(
|
||||
dev_id, inode_key_bytes, self._pending_db)
|
||||
return True
|
||||
|
||||
def _get_pair_lmdb_key_from_ids(self, dev1, inode1, dev2, inode2):
|
||||
@@ -280,7 +287,9 @@ class DuplicateCache(QObject):
|
||||
return None
|
||||
return self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2)
|
||||
|
||||
def mark_as_exception(self, path1, path2, is_exception=True, similarity=None, timestamp=None):
|
||||
def mark_as_exception(self,
|
||||
path1, path2, is_exception=True, similarity=None,
|
||||
timestamp=None):
|
||||
if not self._lmdb_env:
|
||||
return False
|
||||
|
||||
@@ -323,8 +332,10 @@ class DuplicateCache(QObject):
|
||||
with self._lmdb_env.begin(write=False) as txn:
|
||||
return txn.get(exception_key, db=self._exceptions_db) is not None
|
||||
|
||||
def _remove_pair_entries_for_path(self, target_dev, target_inode, db_handle, txn=None):
|
||||
"""Removes all entries involving a specific (dev, inode) pair from a pair-based DB."""
|
||||
def _remove_pair_entries_for_path(self,
|
||||
target_dev, target_inode, db_handle, txn=None):
|
||||
"""Removes all entries involving a specific (dev, inode) pair from a pair-based
|
||||
DB."""
|
||||
if not self._lmdb_env:
|
||||
return
|
||||
|
||||
@@ -336,8 +347,10 @@ class DuplicateCache(QObject):
|
||||
for key_bytes, _ in cursor:
|
||||
key_str = key_bytes.decode('utf-8')
|
||||
parts = key_str.split('-')
|
||||
if len(parts) < 4: continue
|
||||
dev1, inode1_hex, dev2, inode2_hex = int(parts[0]), parts[1], int(parts[2]), parts[3]
|
||||
if len(parts) < 4:
|
||||
continue
|
||||
dev1, inode1_hex, dev2, inode2_hex = int(
|
||||
parts[0]), parts[1], int(parts[2]), parts[3]
|
||||
if (dev1 == target_dev and inode1_hex == target_inode_hex) or \
|
||||
(dev2 == target_dev and inode2_hex == target_inode_hex):
|
||||
keys_to_delete.append(key_bytes)
|
||||
@@ -351,7 +364,8 @@ class DuplicateCache(QObject):
|
||||
with self._lmdb_env.begin(write=True) as t:
|
||||
do_remove(t)
|
||||
|
||||
def mark_as_pending(self, path1, path2, is_pending=True, similarity=None, timestamp=None):
|
||||
def mark_as_pending(self,
|
||||
path1, path2, is_pending=True, similarity=None, timestamp=None):
|
||||
"""Marks a pair as pending review."""
|
||||
if not self._lmdb_env or self._pending_db is None:
|
||||
return False
|
||||
@@ -392,7 +406,8 @@ class DuplicateCache(QObject):
|
||||
sim = int(parts[2]) if len(parts) > 2 and parts[2] else None
|
||||
ts = int(parts[3]) if len(parts) > 3 else 0
|
||||
if os.path.exists(p1) and os.path.exists(p2):
|
||||
results.append(DuplicateResult(p1, p2, None, False, sim, ts))
|
||||
results.append(
|
||||
DuplicateResult(p1, p2, None, False, sim, ts))
|
||||
else:
|
||||
keys_to_delete.append(key)
|
||||
except Exception:
|
||||
@@ -404,7 +419,8 @@ class DuplicateCache(QObject):
|
||||
with self._lmdb_env.begin(write=True) as txn:
|
||||
for k in keys_to_delete:
|
||||
txn.delete(k, db=self._pending_db)
|
||||
logger.info(f"Cleaned up {len(keys_to_delete)} invalid pending duplicates (files deleted externally)")
|
||||
logger.info(f"Cleaned up {len(keys_to_delete)} invalid "
|
||||
"pending duplicates (files deleted externally)")
|
||||
except Exception as e:
|
||||
logger.error(f"Error cleaning up pending duplicates from DB: {e}")
|
||||
|
||||
@@ -436,23 +452,28 @@ class DuplicateCache(QObject):
|
||||
if len(parts) > 3:
|
||||
ts = int(parts[3])
|
||||
else:
|
||||
ts = int(os.path.getmtime(p1)) if os.path.exists(p1) else 0
|
||||
ts = int(os.path.getmtime(p1)) \
|
||||
if os.path.exists(p1) else 0
|
||||
|
||||
if not p1 or not p2:
|
||||
# Legacy format fallback: lookup paths in hash db
|
||||
key_str = key_bytes.decode('utf-8')
|
||||
kp = key_str.split('-')
|
||||
if len(kp) == 4:
|
||||
k1, k2 = f"{kp[0]}-{kp[1]}".encode(), f"{kp[2]}-{kp[3]}".encode()
|
||||
v1, v2 = txn.get(k1, db=self._hash_db), txn.get(k2, db=self._hash_db)
|
||||
k1, k2 = f"{kp[0]}-{kp[1]}".encode(),
|
||||
f"{kp[2]}-{kp[3]}".encode()
|
||||
v1, v2 = txn.get(k1, db=self._hash_db), \
|
||||
txn.get(k2, db=self._hash_db)
|
||||
if v1 and v2:
|
||||
# Format is hash|mtime|path|dist... path is always index 2
|
||||
# Format is hash|mtime|path|dist... path is always
|
||||
# index 2
|
||||
p1 = v1.decode('utf-8').split('|')[2]
|
||||
p2 = v2.decode('utf-8').split('|')[2]
|
||||
|
||||
if p1 and p2:
|
||||
if os.path.exists(p1) and os.path.exists(p2):
|
||||
results.append(DuplicateResult(p1, p2, None, True, sim, ts))
|
||||
results.append(
|
||||
DuplicateResult(p1, p2, None, True, sim, ts))
|
||||
except Exception:
|
||||
continue
|
||||
return results
|
||||
@@ -484,11 +505,13 @@ class DuplicateCache(QObject):
|
||||
with self._lmdb_env.begin(write=True) as txn:
|
||||
for k in keys_to_delete:
|
||||
txn.delete(k, db=self._hash_db)
|
||||
logger.info(f"Cleaned up {len(keys_to_delete)} stale hash entries (files deleted externally)")
|
||||
logger.info(f"Cleaned up {len(keys_to_delete)} stale hash "
|
||||
"entries (files deleted externally)")
|
||||
return len(keys_to_delete)
|
||||
|
||||
def get_all_hashes_with_paths(self):
|
||||
"""Retrieves all hashes from the database along with their associated paths and inode info."""
|
||||
"""Retrieves all hashes from the database along with their associated paths and
|
||||
inode info."""
|
||||
# hash_value -> [(path, dev_id, inode_key_bytes)]
|
||||
all_hashes = collections.defaultdict(list)
|
||||
if not self._lmdb_env:
|
||||
@@ -527,7 +550,8 @@ class DuplicateCache(QObject):
|
||||
if not old_inode_key_bytes or not new_inode_key_bytes or not self._lmdb_env:
|
||||
return False
|
||||
|
||||
# If the (dev, inode) pair is the same, only the path in the value needs updating.
|
||||
# If the (dev, inode) pair is the same, only the path in the value needs
|
||||
# updating.
|
||||
# This happens if the file is renamed within the same filesystem.
|
||||
if (old_dev, old_inode_key_bytes) == (new_dev, new_inode_key_bytes):
|
||||
hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes)
|
||||
@@ -543,8 +567,10 @@ class DuplicateCache(QObject):
|
||||
# 3. Add a new entry with the new (dev, inode) and path, using the old hash.
|
||||
hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes)
|
||||
if hash_value:
|
||||
self.remove_hash_for_path(old_path) # This removes the old (dev, inode) entry
|
||||
self.add_hash_for_path(new_path, hash_value, mtime) # Adds new (dev, inode) entry
|
||||
# This removes the old (dev, inode) entry
|
||||
self.remove_hash_for_path(old_path)
|
||||
# Adds new (dev, inode) entry
|
||||
self.add_hash_for_path(new_path, hash_value, mtime)
|
||||
self._update_pair_paths(old_path, new_path, self._pending_db)
|
||||
return True
|
||||
return False
|
||||
@@ -573,7 +599,9 @@ class DuplicateDetector(QThread):
|
||||
duplicates_found = Signal(list) # List of DuplicateResult
|
||||
detection_finished = Signal()
|
||||
|
||||
def __init__(self, paths_to_scan, duplicate_cache, pool_manager, method="histogram_hashing", threshold=90, force_full=False):
|
||||
def __init__(self,
|
||||
paths_to_scan, duplicate_cache, pool_manager,
|
||||
method="histogram_hashing", threshold=90, force_full=False):
|
||||
super().__init__()
|
||||
self.paths_to_scan = paths_to_scan
|
||||
self.duplicate_cache = duplicate_cache
|
||||
@@ -585,17 +613,19 @@ class DuplicateDetector(QThread):
|
||||
|
||||
def stop(self):
|
||||
self._is_running = False
|
||||
self.wait() # Add this line
|
||||
self.wait() # Add this line
|
||||
|
||||
def run(self):
|
||||
total_files = len(self.paths_to_scan)
|
||||
found_duplicates = []
|
||||
unique_duplicate_pairs = set() # To store frozenset((path1, path2)) for uniqueness
|
||||
# To store frozenset((path1, path2)) for uniqueness
|
||||
unique_duplicate_pairs = set()
|
||||
last_update_time = 0
|
||||
|
||||
pool = self.pool_manager.get_pool()
|
||||
|
||||
# 1. Load existing pending duplicates from cache to avoid recalculation (unless force_full)
|
||||
# 1. Load existing pending duplicates from cache to avoid recalculation (unless
|
||||
# force_full)
|
||||
if not self.force_full:
|
||||
pending = self.duplicate_cache.get_all_pending_duplicates()
|
||||
for p in pending:
|
||||
@@ -606,7 +636,10 @@ class DuplicateDetector(QThread):
|
||||
|
||||
# Convert similarity threshold (percentage) to Hamming distance
|
||||
distance_threshold = int(MAX_DHASH_DISTANCE * (100 - self.threshold) / 100)
|
||||
logger.info(f"Duplicate detection: Method={self.method}, Similarity Threshold={self.threshold}%, Hamming Distance Threshold={distance_threshold}")
|
||||
logger.info(
|
||||
f"Duplicate detection: Method={self.method}, "
|
||||
f"Similarity Threshold={self.threshold}%, Hamming "
|
||||
f"Distance Threshold={distance_threshold}")
|
||||
|
||||
# 2. Phase 1: Hash Collection (Parallelized)
|
||||
path_to_hash = {}
|
||||
@@ -645,7 +678,8 @@ class DuplicateDetector(QThread):
|
||||
break
|
||||
current_batch = paths_to_hash_parallel[i : i + batch_size]
|
||||
for p_data in current_batch:
|
||||
pool.start(HashWorker(p_data[0], self, new_hashes, results_mutex, sem))
|
||||
pool.start(HashWorker(
|
||||
p_data[0], self, new_hashes, results_mutex, sem))
|
||||
|
||||
for _ in range(len(current_batch)):
|
||||
while not sem.tryAcquire(1, 100):
|
||||
@@ -655,7 +689,9 @@ class DuplicateDetector(QThread):
|
||||
break
|
||||
processed_hashing += 1
|
||||
if time.perf_counter() - last_update_time > 0.05:
|
||||
self.progress_update.emit(processed_hashing, total_files * 2, UITexts.DUPLICATE_MSG_HASHING.format(filename="..."))
|
||||
self.progress_update.emit(
|
||||
processed_hashing, total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_HASHING.format(filename="..."))
|
||||
last_update_time = time.perf_counter()
|
||||
|
||||
for p, mtime, dev, inode in paths_to_hash_parallel:
|
||||
@@ -670,7 +706,9 @@ class DuplicateDetector(QThread):
|
||||
return
|
||||
|
||||
# Signal phase transition to exactly 50%
|
||||
self.progress_update.emit(total_files, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
|
||||
self.progress_update.emit(
|
||||
total_files, total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
|
||||
|
||||
# 3. Phase 2: Comparison (Optimized with BK-Tree)
|
||||
hash_map = collections.defaultdict(list)
|
||||
@@ -684,9 +722,12 @@ class DuplicateDetector(QThread):
|
||||
if self.force_full or p in dirty_paths:
|
||||
dirty_hashes_objs.add(h_obj)
|
||||
|
||||
# Optimization: Only query the tree for hashes associated with new or modified files.
|
||||
# This finds pairs (Dirty, Clean) and (Dirty, Dirty). (Clean, Clean) were handled in previous runs.
|
||||
hashes_to_query = list(dirty_hashes_objs) if not self.force_full else list(hash_map.keys())
|
||||
# Optimization: Only query the tree for hashes associated with new or modified
|
||||
# files.
|
||||
# This finds pairs (Dirty, Clean) and (Dirty, Dirty). (Clean, Clean) were
|
||||
# handled in previous runs.
|
||||
hashes_to_query = list(dirty_hashes_objs) \
|
||||
if not self.force_full else list(hash_map.keys())
|
||||
total_queries = len(hashes_to_query)
|
||||
|
||||
for i, h1 in enumerate(hashes_to_query):
|
||||
@@ -697,8 +738,11 @@ class DuplicateDetector(QThread):
|
||||
|
||||
if time.perf_counter() - last_update_time > 0.1:
|
||||
# Scale Phase 2 progress to the 50%-100% range
|
||||
phase2_progress = int(((i + 1) / total_queries) * total_files) if total_queries > 0 else total_files
|
||||
self.progress_update.emit(total_files + phase2_progress, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
|
||||
phase2_progress = int(((i + 1) / total_queries) * total_files) \
|
||||
if total_queries > 0 else total_files
|
||||
self.progress_update.emit(
|
||||
total_files + phase2_progress, total_files * 2,
|
||||
UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
|
||||
last_update_time = time.perf_counter()
|
||||
|
||||
# Query tree for similar hashes
|
||||
@@ -713,7 +757,8 @@ class DuplicateDetector(QThread):
|
||||
continue
|
||||
|
||||
# Optimization: Skip pair if BOTH were already verified
|
||||
if not self.force_full and p1 not in dirty_paths and p2 not in dirty_paths:
|
||||
if not self.force_full \
|
||||
and p1 not in dirty_paths and p2 not in dirty_paths:
|
||||
continue
|
||||
|
||||
canonical = frozenset((p1, p2))
|
||||
@@ -726,7 +771,8 @@ class DuplicateDetector(QThread):
|
||||
res = DuplicateResult(p1, p2, str(h1), False, sim, ts)
|
||||
found_duplicates.append(res)
|
||||
unique_duplicate_pairs.add(canonical)
|
||||
self.duplicate_cache.mark_as_pending(p1, p2, True, similarity=sim, timestamp=ts)
|
||||
self.duplicate_cache.mark_as_pending(
|
||||
p1, p2, True, similarity=sim, timestamp=ts)
|
||||
|
||||
self.duplicates_found.emit(found_duplicates)
|
||||
self.detection_finished.emit()
|
||||
|
||||
Reference in New Issue
Block a user