Several fixes
This commit is contained in:
@@ -24,7 +24,7 @@ from constants import (
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Result structure for duplicate detection
|
||||
DuplicateResult = collections.namedtuple('DuplicateResult', ['path1', 'path2', 'hash_value', 'is_exception', 'similarity'])
|
||||
DuplicateResult = collections.namedtuple('DuplicateResult', ['path1', 'path2', 'hash_value', 'is_exception', 'similarity', 'timestamp'])
|
||||
|
||||
|
||||
class BKTree:
|
||||
@@ -241,7 +241,15 @@ class DuplicateCache(QObject):
|
||||
self._hash_cache[(dev_id, inode_key_bytes)] = (hash_value, mtime, path)
|
||||
return True
|
||||
|
||||
def remove_hash_for_path(self, path):
|
||||
def remove_hash_for_path(self, path, clear_relationships=True):
|
||||
"""
|
||||
Removes the hash entry for a path.
|
||||
|
||||
Args:
|
||||
path: File path.
|
||||
clear_relationships: If True, also wipes all entries in pending and
|
||||
exceptions DBs involving this file.
|
||||
"""
|
||||
dev_id, inode_key_bytes = self._get_inode_info(path)
|
||||
if not inode_key_bytes or not self._lmdb_env:
|
||||
return False
|
||||
@@ -255,8 +263,9 @@ class DuplicateCache(QObject):
|
||||
self._hash_cache.pop((dev_id, inode_key_bytes), None)
|
||||
|
||||
# Also remove any exceptions involving this path
|
||||
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._exceptions_db)
|
||||
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._pending_db)
|
||||
if clear_relationships:
|
||||
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._exceptions_db)
|
||||
self._remove_pair_entries_for_path(dev_id, inode_key_bytes, self._pending_db)
|
||||
return True
|
||||
|
||||
def _get_pair_lmdb_key_from_ids(self, dev1, inode1, dev2, inode2):
|
||||
@@ -271,7 +280,7 @@ class DuplicateCache(QObject):
|
||||
return None
|
||||
return self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2)
|
||||
|
||||
def mark_as_exception(self, path1, path2, is_exception=True, similarity=None):
|
||||
def mark_as_exception(self, path1, path2, is_exception=True, similarity=None, timestamp=None):
|
||||
if not self._lmdb_env:
|
||||
return False
|
||||
|
||||
@@ -285,9 +294,8 @@ class DuplicateCache(QObject):
|
||||
return False
|
||||
|
||||
# Store paths in value to make exception recovery independent of hash DB
|
||||
val_str = f"{path1}|{path2}"
|
||||
if similarity is not None:
|
||||
val_str += f"|{similarity}"
|
||||
ts = timestamp if timestamp is not None else int(time.time())
|
||||
val_str = f"{path1}|{path2}|{similarity if similarity is not None else ''}|{ts}"
|
||||
value = val_str.encode('utf-8')
|
||||
|
||||
with QMutexLocker(self._db_lock):
|
||||
@@ -315,34 +323,35 @@ class DuplicateCache(QObject):
|
||||
with self._lmdb_env.begin(write=False) as txn:
|
||||
return txn.get(exception_key, db=self._exceptions_db) is not None
|
||||
|
||||
def _remove_pair_entries_for_path(self, target_dev, target_inode, db_handle):
|
||||
def _remove_pair_entries_for_path(self, target_dev, target_inode, db_handle, txn=None):
|
||||
"""Removes all entries involving a specific (dev, inode) pair from a pair-based DB."""
|
||||
if not self._lmdb_env:
|
||||
return
|
||||
|
||||
target_inode_hex = target_inode.hex()
|
||||
with QMutexLocker(self._db_lock):
|
||||
with self._lmdb_env.begin(write=True) as txn:
|
||||
cursor = txn.cursor(db=db_handle)
|
||||
keys_to_delete = []
|
||||
for key_bytes, _ in cursor:
|
||||
key_str = key_bytes.decode('utf-8')
|
||||
# Key format: "dev1-inode1_hex-dev2-inode2_hex"
|
||||
parts = key_str.split('-')
|
||||
|
||||
dev1 = int(parts[0])
|
||||
inode1_hex = parts[1]
|
||||
dev2 = int(parts[2])
|
||||
inode2_hex = parts[3]
|
||||
def do_remove(t):
|
||||
cursor = t.cursor(db=db_handle)
|
||||
keys_to_delete = []
|
||||
for key_bytes, _ in cursor:
|
||||
key_str = key_bytes.decode('utf-8')
|
||||
parts = key_str.split('-')
|
||||
if len(parts) < 4: continue
|
||||
dev1, inode1_hex, dev2, inode2_hex = int(parts[0]), parts[1], int(parts[2]), parts[3]
|
||||
if (dev1 == target_dev and inode1_hex == target_inode_hex) or \
|
||||
(dev2 == target_dev and inode2_hex == target_inode_hex):
|
||||
keys_to_delete.append(key_bytes)
|
||||
for key in keys_to_delete:
|
||||
t.delete(key, db=db_handle)
|
||||
|
||||
if (dev1 == target_dev and inode1_hex == target_inode_hex) or \
|
||||
(dev2 == target_dev and inode2_hex == target_inode_hex):
|
||||
keys_to_delete.append(key_bytes)
|
||||
if txn:
|
||||
do_remove(txn)
|
||||
else:
|
||||
with QMutexLocker(self._db_lock):
|
||||
with self._lmdb_env.begin(write=True) as t:
|
||||
do_remove(t)
|
||||
|
||||
for key in keys_to_delete:
|
||||
txn.delete(key, db=db_handle)
|
||||
|
||||
def mark_as_pending(self, path1, path2, is_pending=True, similarity=None):
|
||||
def mark_as_pending(self, path1, path2, is_pending=True, similarity=None, timestamp=None):
|
||||
"""Marks a pair as pending review."""
|
||||
if not self._lmdb_env or self._pending_db is None:
|
||||
return False
|
||||
@@ -352,9 +361,8 @@ class DuplicateCache(QObject):
|
||||
return False
|
||||
|
||||
# Store paths in value to allow reconstruction without scanning
|
||||
val_str = f"{path1}|{path2}"
|
||||
if similarity is not None:
|
||||
val_str += f"|{similarity}"
|
||||
ts = timestamp if timestamp is not None else int(time.time())
|
||||
val_str = f"{path1}|{path2}|{similarity if similarity is not None else ''}|{ts}"
|
||||
value = val_str.encode('utf-8')
|
||||
|
||||
with QMutexLocker(self._db_lock):
|
||||
@@ -381,9 +389,10 @@ class DuplicateCache(QObject):
|
||||
try:
|
||||
parts = value_bytes.decode('utf-8').split('|')
|
||||
p1, p2 = parts[0], parts[1]
|
||||
sim = int(parts[2]) if len(parts) > 2 else None
|
||||
sim = int(parts[2]) if len(parts) > 2 and parts[2] else None
|
||||
ts = int(parts[3]) if len(parts) > 3 else 0
|
||||
if os.path.exists(p1) and os.path.exists(p2):
|
||||
results.append(DuplicateResult(p1, p2, None, False, sim))
|
||||
results.append(DuplicateResult(p1, p2, None, False, sim, ts))
|
||||
else:
|
||||
keys_to_delete.append(key)
|
||||
except Exception:
|
||||
@@ -414,6 +423,7 @@ class DuplicateCache(QObject):
|
||||
try:
|
||||
p1, p2 = None, None
|
||||
sim = None
|
||||
ts = 0
|
||||
val_str = value_bytes.decode('utf-8')
|
||||
|
||||
if '|' in val_str:
|
||||
@@ -421,8 +431,12 @@ class DuplicateCache(QObject):
|
||||
parts = val_str.split('|')
|
||||
if len(parts) >= 2:
|
||||
p1, p2 = parts[0], parts[1]
|
||||
if len(parts) > 2:
|
||||
if len(parts) > 2 and parts[2]:
|
||||
sim = int(parts[2])
|
||||
if len(parts) > 3:
|
||||
ts = int(parts[3])
|
||||
else:
|
||||
ts = int(os.path.getmtime(p1)) if os.path.exists(p1) else 0
|
||||
|
||||
if not p1 or not p2:
|
||||
# Legacy format fallback: lookup paths in hash db
|
||||
@@ -438,7 +452,7 @@ class DuplicateCache(QObject):
|
||||
|
||||
if p1 and p2:
|
||||
if os.path.exists(p1) and os.path.exists(p2):
|
||||
results.append(DuplicateResult(p1, p2, None, True, sim))
|
||||
results.append(DuplicateResult(p1, p2, None, True, sim, ts))
|
||||
except Exception:
|
||||
continue
|
||||
return results
|
||||
@@ -606,7 +620,7 @@ class DuplicateDetector(QThread):
|
||||
mtime = stat_info.st_mtime
|
||||
dev, inode = stat_info.st_dev, struct.pack('Q', stat_info.st_ino)
|
||||
|
||||
cached_h = None if self.force_full else \
|
||||
cached_h = \
|
||||
self.duplicate_cache.get_hash_for_path(path, mtime, dev, inode)
|
||||
|
||||
if cached_h:
|
||||
@@ -658,13 +672,6 @@ class DuplicateDetector(QThread):
|
||||
# Signal phase transition to exactly 50%
|
||||
self.progress_update.emit(total_files, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format(filename="..."))
|
||||
|
||||
if not self.force_full and not dirty_paths:
|
||||
# No files changed and no re-scan forced.
|
||||
# We can skip Phase 2 as all results were loaded from the pending cache.
|
||||
self.duplicates_found.emit(found_duplicates)
|
||||
self.detection_finished.emit()
|
||||
return
|
||||
|
||||
# 3. Phase 2: Comparison (Optimized with BK-Tree)
|
||||
hash_map = collections.defaultdict(list)
|
||||
bk_tree = BKTree(lambda a, b: a - b)
|
||||
@@ -715,10 +722,11 @@ class DuplicateDetector(QThread):
|
||||
if canonical not in unique_duplicate_pairs:
|
||||
if not self.duplicate_cache.is_exception(p1, p2):
|
||||
sim = int((1.0 - (distance / MAX_DHASH_DISTANCE)) * 100)
|
||||
res = DuplicateResult(p1, p2, str(h1), False, sim)
|
||||
ts = int(time.time())
|
||||
res = DuplicateResult(p1, p2, str(h1), False, sim, ts)
|
||||
found_duplicates.append(res)
|
||||
unique_duplicate_pairs.add(canonical)
|
||||
self.duplicate_cache.mark_as_pending(p1, p2, True, similarity=sim)
|
||||
self.duplicate_cache.mark_as_pending(p1, p2, True, similarity=sim, timestamp=ts)
|
||||
|
||||
self.duplicates_found.emit(found_duplicates)
|
||||
self.detection_finished.emit()
|
||||
|
||||
Reference in New Issue
Block a user