Several fixes

This commit is contained in:
Ignacio Serantes
2026-04-06 20:44:49 +02:00
parent ca260d4219
commit a717acef87
8 changed files with 151 additions and 98 deletions

View File

@@ -24,7 +24,7 @@ from constants import (
logger = logging.getLogger(__name__)
# Result structure for duplicate detection
DuplicateResult = collections.namedtuple('DuplicateResult', ['path1', 'path2', 'hash_value', 'is_exception', 'similarity'])
DuplicateResult = collections.namedtuple('DuplicateResult', ['path1', 'path2', 'hash_value', 'is_exception', 'similarity', 'timestamp'])
class BKTree:
@@ -241,7 +241,15 @@ class DuplicateCache(QObject):
self._hash_cache[(dev_id, inode_key_bytes)] = (hash_value, mtime, path)
return True
def remove_hash_for_path(self, path):
def remove_hash_for_path(self, path, clear_relationships=True):
"""
Removes the hash entry for a path.
Args:
path: File path.
clear_relationships: If True, also wipes all entries in pending and
exceptions DBs involving this file.
"""
dev_id, inode_key_bytes = self._get_inode_info(path)
if not inode_key_bytes or not self._lmdb_env:
return False
@@ -255,8 +263,9 @@ class DuplicateCache(QObject):
self._hash_cache.pop((dev_id, inode_key_bytes), None)
# Also remove any exceptions involving this path
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._exceptions_db)
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._pending_db)
if clear_relationships:
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._exceptions_db)
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._pending_db)
return True
def _get_pair_lmdb_key_from_ids(self, dev1, inode1, dev2, inode2):
@@ -271,7 +280,7 @@ class DuplicateCache(QObject):
return None
return self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2)
def mark_as_exception(self, path1, path2, is_exception=True, similarity=None):
def mark_as_exception(self, path1, path2, is_exception=True, similarity=None, timestamp=None):
if not self._lmdb_env:
return False
@@ -285,9 +294,8 @@ class DuplicateCache(QObject):
return False
# Store paths in value to make exception recovery independent of hash DB
val_str = f"{path1}|{path2}"
if similarity is not None:
val_str += f"|{similarity}"
ts = timestamp if timestamp is not None else int(time.time())
val_str = f"{path1}|{path2}|{similarity if similarity is not None else ''}|{ts}"
value = val_str.encode('utf-8')
with QMutexLocker(self._db_lock):
@@ -315,34 +323,35 @@ class DuplicateCache(QObject):
with self._lmdb_env.begin(write=False) as txn:
return txn.get(exception_key, db=self._exceptions_db) is not None
def _remove_pair_entries_for_path(self, target_dev, target_inode, db_handle):
def _remove_pair_entries_for_path(self, target_dev, target_inode, db_handle, txn=None):
"""Removes all entries involving a specific (dev, inode) pair from a pair-based DB."""
if not self._lmdb_env:
return
target_inode_hex = target_inode.hex()
with QMutexLocker(self._db_lock):
with self._lmdb_env.begin(write=True) as txn:
cursor = txn.cursor(db=db_handle)
keys_to_delete = []
for key_bytes, _ in cursor:
key_str = key_bytes.decode('utf-8')
# Key format: "dev1-inode1_hex-dev2-inode2_hex"
parts = key_str.split('-')
dev1 = int(parts[0])
inode1_hex = parts[1]
dev2 = int(parts[2])
inode2_hex = parts[3]
def do_remove(t):
cursor = t.cursor(db=db_handle)
keys_to_delete = []
for key_bytes, _ in cursor:
key_str = key_bytes.decode('utf-8')
parts = key_str.split('-')
if len(parts) < 4: continue
dev1, inode1_hex, dev2, inode2_hex = int(parts[0]), parts[1], int(parts[2]), parts[3]
if (dev1 == target_dev and inode1_hex == target_inode_hex) or \
(dev2 == target_dev and inode2_hex == target_inode_hex):
keys_to_delete.append(key_bytes)
for key in keys_to_delete:
t.delete(key, db=db_handle)
if (dev1 == target_dev and inode1_hex == target_inode_hex) or \
(dev2 == target_dev and inode2_hex == target_inode_hex):
keys_to_delete.append(key_bytes)
if txn:
do_remove(txn)
else:
with QMutexLocker(self._db_lock):
with self._lmdb_env.begin(write=True) as t:
do_remove(t)
for key in keys_to_delete:
txn.delete(key, db=db_handle)
def mark_as_pending(self, path1, path2, is_pending=True, similarity=None):
def mark_as_pending(self, path1, path2, is_pending=True, similarity=None, timestamp=None):
"""Marks a pair as pending review."""
if not self._lmdb_env or self._pending_db is None:
return False
@@ -352,9 +361,8 @@ class DuplicateCache(QObject):
return False
# Store paths in value to allow reconstruction without scanning
val_str = f"{path1}|{path2}"
if similarity is not None:
val_str += f"|{similarity}"
ts = timestamp if timestamp is not None else int(time.time())
val_str = f"{path1}|{path2}|{similarity if similarity is not None else ''}|{ts}"
value = val_str.encode('utf-8')
with QMutexLocker(self._db_lock):
@@ -381,9 +389,10 @@ class DuplicateCache(QObject):
try:
parts = value_bytes.decode('utf-8').split('|')
p1, p2 = parts[0], parts[1]
sim = int(parts[2]) if len(parts) > 2 else None
sim = int(parts[2]) if len(parts) > 2 and parts[2] else None
ts = int(parts[3]) if len(parts) > 3 else 0
if os.path.exists(p1) and os.path.exists(p2):
results.append(DuplicateResult(p1, p2, None, False, sim))
results.append(DuplicateResult(p1, p2, None, False, sim, ts))
else:
keys_to_delete.append(key)
except Exception:
@@ -414,6 +423,7 @@ class DuplicateCache(QObject):
try:
p1, p2 = None, None
sim = None
ts = 0
val_str = value_bytes.decode('utf-8')
if '|' in val_str:
@@ -421,8 +431,12 @@ class DuplicateCache(QObject):
parts = val_str.split('|')
if len(parts) >= 2:
p1, p2 = parts[0], parts[1]
if len(parts) > 2:
if len(parts) > 2 and parts[2]:
sim = int(parts[2])
if len(parts) > 3:
ts = int(parts[3])
else:
ts = int(os.path.getmtime(p1)) if os.path.exists(p1) else 0
if not p1 or not p2:
# Legacy format fallback: lookup paths in hash db
@@ -438,7 +452,7 @@ class DuplicateCache(QObject):
if p1 and p2:
if os.path.exists(p1) and os.path.exists(p2):
results.append(DuplicateResult(p1, p2, None, True, sim))
results.append(DuplicateResult(p1, p2, None, True, sim, ts))
except Exception:
continue
return results
@@ -606,7 +620,7 @@ class DuplicateDetector(QThread):
mtime = stat_info.st_mtime
dev, inode = stat_info.st_dev, struct.pack('Q', stat_info.st_ino)
cached_h = None if self.force_full else \
cached_h = \
self.duplicate_cache.get_hash_for_path(path, mtime, dev, inode)
if cached_h:
@@ -658,13 +672,6 @@ class DuplicateDetector(QThread):
# Signal phase transition to exactly 50%
self.progress_update.emit(total_files, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
if not self.force_full and not dirty_paths:
# No files changed and no re-scan forced.
# We can skip Phase 2 as all results were loaded from the pending cache.
self.duplicates_found.emit(found_duplicates)
self.detection_finished.emit()
return
# 3. Phase 2: Comparison (Optimized with BK-Tree)
hash_map = collections.defaultdict(list)
bk_tree = BKTree(lambda a, b: a - b)
@@ -715,10 +722,11 @@ class DuplicateDetector(QThread):
if canonical not in unique_duplicate_pairs:
if not self.duplicate_cache.is_exception(p1, p2):
sim = int((1.0 - (distance / MAX_DHASH_DISTANCE)) * 100)
res = DuplicateResult(p1, p2, str(h1), False, sim)
ts = int(time.time())
res = DuplicateResult(p1, p2, str(h1), False, sim, ts)
found_duplicates.append(res)
unique_duplicate_pairs.add(canonical)
self.duplicate_cache.mark_as_pending(p1, p2, True, similarity=sim)
self.duplicate_cache.mark_as_pending(p1, p2, True, similarity=sim, timestamp=ts)
self.duplicates_found.emit(found_duplicates)
self.detection_finished.emit()