""" Duplicate Cache and Detection Module for Bagheera. This module provides the core logic for detecting duplicate images using perceptual hashing (dHash) and managing a persistent cache of these hashes and their relationships using LMDB. Classes: DuplicateCache: Manages the LMDB database for hashes and exceptions. DuplicateDetector: Background thread that performs the duplicate analysis. """ import os import logging import struct import time as time_module import collections import shutil import lmdb from pathlib import Path import PIL.Image from PySide6.QtCore import ( QObject, QThread, Signal, QMutex, QSemaphore, QReadWriteLock, QMutexLocker, QReadLocker, QWriteLocker, QRunnable ) import imagehash # For perceptual hashing from constants import MAX_DHASH_DISTANCE from constants import ( DUPLICATE_CACHE_PATH, DUPLICATE_HASH_DB_NAME, DUPLICATE_EXCEPTIONS_DB_NAME, DUPLICATE_PENDING_DB_NAME, DUPLICATE_BKTREE_DB_NAME, DUPLICATE_HASH_TO_FILES_DB_NAME, UITexts ) logger = logging.getLogger(__name__) # Result structure for duplicate detection DuplicateResult = collections.namedtuple( 'DuplicateResult', ['path1', 'path2', 'hash_value', 'is_exception', 'similarity', 'timestamp']) class HashWorker(QRunnable): """Worker to calculate image hash in a thread pool.""" def __init__(self, path, detector, result_dict, mutex, semaphore): super().__init__() self.path = path self.detector = detector self.result_dict = result_dict self.mutex = mutex self.semaphore = semaphore def run(self): if self.detector._is_running: try: # imagehash requires a PIL/Pillow image object. with PIL.Image.open(self.path) as pil_img: # Using dHash from imagehash library as default h = str(imagehash.dhash(pil_img)) with QMutexLocker(self.mutex): self.result_dict[self.path] = h except Exception as e: logger.warning(f"HashWorker failed for {self.path}: {e}") self.semaphore.release() class DuplicateCache(QObject): """ Manages a persistent LMDB cache for perceptual hashes and duplicate relationships. Uses (device_id, inode) as primary keys for robustness against file renames/moves. """ # noqa: E501 def __init__(self): super().__init__() self._lmdb_env = None self._hash_db = None self._exceptions_db = None self._pending_db = None self._bktree_db = None self._hash_to_files_db = None self._db_lock = QMutex() # Protects LMDB transactions # In-memory cache for hashes: (dev, inode) -> (hash_value, mtime, path) self._hash_cache = {} self._hash_cache_lock = QReadWriteLock() self.lmdb_open() def lmdb_open(self): cache_dir = Path(DUPLICATE_CACHE_PATH) cache_dir.mkdir(parents=True, exist_ok=True) try: self._lmdb_env = lmdb.open( DUPLICATE_CACHE_PATH, map_size=10 * 1024 * 1024 * 1024, # 10GB default # Hashes, exceptions, pending, bktree, hash_to_files max_dbs=5, readonly=False, create=True ) self._hash_db = self._lmdb_env.open_db(DUPLICATE_HASH_DB_NAME) self._exceptions_db = self._lmdb_env.open_db(DUPLICATE_EXCEPTIONS_DB_NAME) self._pending_db = self._lmdb_env.open_db(DUPLICATE_PENDING_DB_NAME) self._bktree_db = self._lmdb_env.open_db(DUPLICATE_BKTREE_DB_NAME) self._hash_to_files_db = self._lmdb_env.open_db( DUPLICATE_HASH_TO_FILES_DB_NAME, dupsort=True) logger.info(f"Duplicate LMDB cache opened: {DUPLICATE_CACHE_PATH}") except Exception as e: logger.error(f"Failed to open duplicate LMDB cache: {e}") self._lmdb_env = None def lmdb_close(self): if self._lmdb_env: self._lmdb_env.close() self._lmdb_env = None self._hash_db = None self._exceptions_db = None self._pending_db = None self._bktree_db = None self._hash_to_files_db = None def get_hash_stats(self): """Returns (count, size_bytes) for the hash database.""" count = 0 if not self._lmdb_env: return 0, 0 with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: count = txn.stat(db=self._hash_db)['entries'] size = 0 disk_path = os.path.join(DUPLICATE_CACHE_PATH, "data.mdb") if os.path.exists(disk_path): size = os.path.getsize(disk_path) return count, size def clear_hashes(self): """Clears all hashes from the database by recreating the environment.""" with QWriteLocker(self._hash_cache_lock): self._hash_cache.clear() self.lmdb_close() try: if os.path.exists(DUPLICATE_CACHE_PATH): shutil.rmtree(DUPLICATE_CACHE_PATH) self.lmdb_open() logger.info("Duplicate hash cache cleared.") except Exception as e: logger.error(f"Error clearing duplicate LMDB: {e}") def clear_exceptions(self): """Clears all entries from the exceptions database.""" if not self._lmdb_env or self._exceptions_db is None: return False with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: # Clear the DB but keep the handle valid txn.drop(self._exceptions_db, delete=False) logger.info("Duplicate exceptions database cleared.") return True def __del__(self): self.lmdb_close() @staticmethod def _get_inode_info(path): try: if not path: return 0, None stat_info = os.stat(path) return stat_info.st_dev, struct.pack('Q', stat_info.st_ino) except OSError: return 0, None @staticmethod def _hash_str_to_bytes(hash_str): return struct.pack('>Q', int(hash_str, 16)) @staticmethod def _hamming_distance(h1_bytes, h2_bytes): return bin(struct.unpack('>Q', h1_bytes)[0] ^ struct.unpack('>Q', h2_bytes)[0]).count('1') def _get_lmdb_key(self, dev_id, inode_key_bytes): return f"{dev_id}-{inode_key_bytes.hex()}".encode('utf-8') def get_hash_and_path(self, dev_id, inode_key_bytes): """Retrieves hash, mtime and path for a given (dev_id, inode_key_bytes).""" # Check in-memory cache first with QReadLocker(self._hash_cache_lock): cached_data = self._hash_cache.get((dev_id, inode_key_bytes)) if cached_data: return cached_data # (hash_value, mtime, path) # Check LMDB if not self._lmdb_env: return None, 0, None with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: lmdb_key = self._get_lmdb_key(dev_id, inode_key_bytes) value_bytes = txn.get(lmdb_key, db=self._hash_db) if value_bytes: # Handle format "hash_value_str|mtime|path_str" or old "hash|path" parts = value_bytes.decode('utf-8').split('|', 2) if len(parts) == 3: hash_str = parts[0] mtime_str = parts[1] path_str = os.path.abspath(os.path.normpath(parts[2])) mtime = float(mtime_str) elif len(parts) == 2: hash_str = parts[0] path_str = os.path.abspath(os.path.normpath(parts[1])) mtime = 0.0 # Force re-hash else: return None, 0, None with QWriteLocker(self._hash_cache_lock): self._hash_cache[(dev_id, inode_key_bytes)] = ( hash_str, mtime, path_str) return hash_str, mtime, path_str return None, 0, None def get_hash_info_for_path(self, path, current_mtime, dev_id=None, inode_key_bytes=None): if dev_id is None or inode_key_bytes is None: dev_id, inode_key_bytes = self._get_inode_info(path) if not inode_key_bytes: return None, 0, None hash_value, cached_mtime, cached_path = self.get_hash_and_path( dev_id, inode_key_bytes) # Return hash only if mtime matches (with small float tolerance) if hash_value and abs(cached_mtime - current_mtime) < 0.001: return hash_value, cached_mtime, cached_path return None, 0, None def add_hash_for_path(self, path, hash_value, mtime, dev_id=None, inode_key_bytes=None): if dev_id is None or inode_key_bytes is None: # noqa: E501 dev_id, inode_key_bytes = self._get_inode_info(path) if not inode_key_bytes or not self._lmdb_env: return False hash_bytes = self._hash_str_to_bytes(hash_value) file_id_bytes = f"{dev_id}-{inode_key_bytes.hex()}".encode('utf-8') path = os.path.abspath(os.path.normpath(path)) value_str = f"{hash_value}|{mtime}|{path}" with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: lmdb_key = self._get_lmdb_key(dev_id, inode_key_bytes) # Verificar si el archivo ya tenía un hash distinto (actualización) # Check if the file already had a different hash (update) old_val = txn.get(lmdb_key, db=self._hash_db) if old_val: old_parts = old_val.decode('utf-8').split('|') if old_parts[0] != hash_value: # noqa: E501 old_h_bytes = self._hash_str_to_bytes(old_parts[0]) txn.delete(old_h_bytes, file_id_bytes, db=self._hash_to_files_db) txn.put(lmdb_key, value_str.encode('utf-8'), db=self._hash_db) txn.put(hash_bytes, file_id_bytes, db=self._hash_to_files_db) # Actualización incremental del BK-Tree persistente # Incremental update of the persistent BK-Tree self._persistent_bktree_add(txn, hash_bytes) # noqa: E501 with QWriteLocker(self._hash_cache_lock): self._hash_cache[(dev_id, inode_key_bytes)] = (hash_value, mtime, path) return True def _persistent_bktree_add(self, txn, hash_bytes): root_hash = txn.get(b'__root__', db=self._bktree_db) if root_hash is None: txn.put(b'__root__', hash_bytes, db=self._bktree_db) return curr_hash = root_hash while True: if curr_hash == hash_bytes: return dist = self._hamming_distance(hash_bytes, curr_hash) children = self._decode_children(txn.get(curr_hash, db=self._bktree_db)) if dist in children: curr_hash = children[dist] else: children[dist] = hash_bytes txn.put(curr_hash, self._encode_children(children), db=self._bktree_db) break def persistent_bktree_query(self, hash_str, max_dist): """Busca en el árbol de disco hashes similares al proporcionado.""" hash_bytes = self._hash_str_to_bytes(hash_str) results = [] with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: root = txn.get(b'__root__', db=self._bktree_db) if not root: return [] candidates = [root] while candidates: curr = candidates.pop() dist = self._hamming_distance(hash_bytes, curr) if dist <= max_dist: results.append((curr, dist)) children = self._decode_children(txn.get(curr, db=self._bktree_db)) for d, child_hash in children.items(): if abs(dist - d) <= max_dist: candidates.append(child_hash) return results def regenerate_bktree(self, progress_callback=None): """ Regenerates the BK-Tree and reverse index from the hashes database. Useful if index corruption is suspected. """ if not self._lmdb_env: return False with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: # 1. Clear existing indices (keeps handles valid) txn.drop(self._bktree_db, delete=False) txn.drop(self._hash_to_files_db, delete=False) # 2. Iterate hashes and rebuild cursor = txn.cursor(db=self._hash_db) count = 0 total = txn.stat(db=self._hash_db)['entries'] hashes_added_to_tree = set() for file_id_bytes, value_bytes in cursor: try: val_str = value_bytes.decode('utf-8') parts = val_str.split('|') if not parts: continue hash_str = parts[0] hash_bytes = self._hash_str_to_bytes(hash_str) if hash_bytes not in hashes_added_to_tree: self._persistent_bktree_add(txn, hash_bytes) hashes_added_to_tree.add(hash_bytes) txn.put(hash_bytes, file_id_bytes, db=self._hash_to_files_db) count += 1 if progress_callback and count % 100 == 0: progress_callback(count, total) except Exception as e: logger.error(f"Error re-indexing {file_id_bytes}: {e}") return True def get_files_for_hash(self, hash_bytes): """Returns all files that share a hash using the reverse index. """ # noqa: E501 files = [] with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: cursor = txn.cursor(db=self._hash_to_files_db) if cursor.set_key(hash_bytes): for file_id_bytes in cursor.iternext_dup(): info = txn.get(file_id_bytes, db=self._hash_db) if info: parts = info.decode('utf-8').split('|') if len(parts) >= 3: fid_str = file_id_bytes.decode('utf-8') dev = int(fid_str.split('-')[0]) inode = bytes.fromhex(fid_str.split('-')[1]) files.append((parts[2], dev, inode)) return files def check_reverse_index_integrity(self): """ Performs a diagnostic of the reverse index. Returns a dictionary with the count of total and orphaned entries. """ stats = {'total': 0, 'orphans': 0, 'mismatches': 0} if not self._lmdb_env: return stats with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: cursor = txn.cursor(db=self._hash_to_files_db) for hash_bytes, file_id_bytes in cursor: stats['total'] += 1 info = txn.get(file_id_bytes, db=self._hash_db) if not info: stats['orphans'] += 1 continue try: stored_hash_str = info.decode('utf-8').split('|')[0] if self._hash_str_to_bytes(stored_hash_str) != hash_bytes: stats['mismatches'] += 1 except Exception: stats['orphans'] += 1 return stats def prune_reverse_index_orphans(self): """Removes orphaned records from the reverse index surgically without rebuilding the entire tree. """ removed = 0 if not self._lmdb_env: return 0 with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: cursor = txn.cursor(db=self._hash_to_files_db) for hash_bytes, file_id_bytes in cursor: info = txn.get(file_id_bytes, db=self._hash_db) should_delete = False if not info: should_delete = True else: stored_hash_str = info.decode('utf-8').split('|')[0] if self._hash_str_to_bytes(stored_hash_str) != hash_bytes: should_delete = True if should_delete: cursor.delete() removed += 1 return removed @staticmethod def _decode_children(data): if not data: return {} res = {} for i in range(0, len(data), 9): res[data[i]] = data[i+1:i+9] return res @staticmethod def _encode_children(children_dict): res = bytearray() for d, h in children_dict.items(): res.append(d) res.extend(h) return bytes(res) def remove_hash_for_path(self, path, clear_relationships=True): """ Removes the hash entry for a path. Args: path: File path. clear_relationships: If True, also wipes all entries in pending and exceptions DBs involving this file. """ dev_id, inode_key_bytes = self._get_inode_info(path) if not inode_key_bytes or not self._lmdb_env: return False with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: lmdb_key = self._get_lmdb_key(dev_id, inode_key_bytes) # Safeguard: Do not proceed if identity information is invalid if dev_id == 0 or not inode_key_bytes: return False # Limpiar el índice inverso antes de borrar la entrada principal val_bytes = txn.get(lmdb_key, db=self._hash_db) if val_bytes: try: parts = val_bytes.decode('utf-8').split('|') h_bytes = self._hash_str_to_bytes(parts[0]) txn.delete(h_bytes, lmdb_key, db=self._hash_to_files_db) except Exception: pass txn.delete(lmdb_key, db=self._hash_db) with QWriteLocker(self._hash_cache_lock): self._hash_cache.pop((dev_id, inode_key_bytes), None) # Also remove any exceptions involving this path if clear_relationships: self._remove_pair_entries_for_path( dev_id, inode_key_bytes, self._exceptions_db) self._remove_pair_entries_for_path( dev_id, inode_key_bytes, self._pending_db) return True def _get_pair_lmdb_key_from_ids(self, dev1, inode1, dev2, inode2): # Ensure canonical order for exception keys key_parts = sorted([f"{dev1}-{inode1.hex()}", f"{dev2}-{inode2.hex()}"]) return f"{key_parts[0]}-{key_parts[1]}".encode('utf-8') def _get_pair_lmdb_key(self, path1, path2): dev1, inode1 = self._get_inode_info(path1) dev2, inode2 = self._get_inode_info(path2) if not inode1 or not inode2: return None return self._get_pair_lmdb_key_from_ids(dev1, inode1, dev2, inode2) def mark_as_exception(self, path1, path2, is_exception=True, similarity=None, timestamp=None): if not self._lmdb_env: return False dev1, inode1 = self._get_inode_info(path1) dev2, inode2 = self._get_inode_info(path2) if not inode1 or not inode2: return False exception_key = self._get_pair_lmdb_key_from_ids( dev1, inode1, dev2, inode2) if not exception_key: return False # Store paths in value to make exception recovery independent of hash DB ts = timestamp if timestamp is not None else int(time_module.time()) val_str = f"{path1}|{path2}|{similarity if similarity is not None else ''}|{ts}" value = val_str.encode('utf-8') with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: if is_exception: txn.put(exception_key, value, db=self._exceptions_db) else: txn.delete(exception_key, db=self._exceptions_db) return True def is_exception(self, path1, path2): if not self._lmdb_env: return False dev1, inode1 = self._get_inode_info(path1) dev2, inode2 = self._get_inode_info(path2) if not inode1 or not inode2: return False exception_key = self._get_pair_lmdb_key_from_ids( dev1, inode1, dev2, inode2) if not exception_key: return False with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: return txn.get(exception_key, db=self._exceptions_db) is not None def _remove_pair_entries_for_path(self, target_dev, target_inode, db_handle, txn=None): """Removes all entries involving a specific (dev, inode) pair from a pair-based DB.""" if not self._lmdb_env: return target_inode_hex = target_inode.hex() def do_remove(t): cursor = t.cursor(db=db_handle) keys_to_delete = [] for key_bytes, _ in cursor: # noqa: E501 key_str = key_bytes.decode('utf-8') parts = key_str.split('-') if len(parts) < 4: continue dev1, inode1_hex, dev2, inode2_hex = int( parts[0]), parts[1], int(parts[2]), parts[3] if (dev1 == target_dev and inode1_hex == target_inode_hex) or \ (dev2 == target_dev and inode2_hex == target_inode_hex): keys_to_delete.append(key_bytes) for key in keys_to_delete: t.delete(key, db=db_handle) if txn: do_remove(txn) else: with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as t: do_remove(t) def mark_as_pending(self, path1, path2, is_pending=True, similarity=None, timestamp=None): """Marks a pair as pending review.""" if not self._lmdb_env or self._pending_db is None: return False key = self._get_pair_lmdb_key(path1, path2) if not key: return False # Store paths in value to allow reconstruction without scanning ts = timestamp if timestamp is not None else int(time_module.time()) val_str = f"{path1}|{path2}|{similarity if similarity is not None else ''}|{ts}" value = val_str.encode('utf-8') with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: if is_pending: txn.put(key, value, db=self._pending_db) else: # Check if it exists before deleting to avoid errors if txn.get(key, db=self._pending_db): txn.delete(key, db=self._pending_db) return True def mark_as_pending_batch(self, pairs_data): """ Marks multiple pairs as pending review in a single transaction. pairs_data: list of (path1, path2, similarity, timestamp) """ if not self._lmdb_env or self._pending_db is None or not pairs_data: return False with QMutexLocker(self._db_lock): # noqa: E501 with self._lmdb_env.begin(write=True) as txn: for p1, p2, similarity, timestamp in pairs_data: key = self._get_pair_lmdb_key(p1, p2) if not key: continue ts = timestamp if timestamp is not None else int(time_module.time()) sim_str = str(similarity) if similarity is not None else "" val_str = f"{p1}|{p2}|{sim_str}|{ts}" value = val_str.encode('utf-8') txn.put(key, value, db=self._pending_db) return True def get_all_exceptions_set(self): """Returns a set of canonical pairs (frozenset of inode-IDs) marked as exceptions.""" exceptions = set() if not self._lmdb_env or self._exceptions_db is None: return exceptions with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: cursor = txn.cursor(db=self._exceptions_db) for key_bytes, _ in cursor: # Format is dev1-ino1-dev2-ino2. Since ino hex is 16 chars, # we parse from parts knowing that IDs always end with hex. parts = key_bytes.decode('utf-8').split('-') # Handle potential negative dev_ids or other hyphenated parts # Each ID is [device_part] + "-" + [16 hex chars inode] if len(parts) >= 4: id2 = f"{parts[-2]}-{parts[-1]}" id1 = "-".join(parts[:-2]) exceptions.add(frozenset((id1, id2))) return exceptions def get_all_pending_duplicates(self): """Retrieves all pending duplicate pairs from the database.""" results = [] if not self._lmdb_env or self._pending_db is None: return results keys_to_delete = [] # Keys to delete from pending DB # List to mark exceptions outside the read transaction links_to_ignore = [] # noqa: E501 with QMutexLocker(self._db_lock): # noqa: E501 with self._lmdb_env.begin(write=False) as txn: # noqa: E501 cursor = txn.cursor(db=self._pending_db) for key, value_bytes in cursor: try: parts = value_bytes.decode('utf-8').split('|') p1 = os.path.abspath(os.path.normpath(parts[0])) # noqa: E501 p2 = os.path.abspath(os.path.normpath(parts[1])) # Prueba definitiva de identidad para symlinks/hardlinks try: if os.path.samefile(p1, p2) or \ os.path.realpath(p1) == os.path.realpath(p2): keys_to_delete.append(key) # Move from pending to exception silently if now links links_to_ignore.append((p1, p2)) # noqa: E501 continue except OSError: # Si el archivo no existe, limpiar de pendientes keys_to_delete.append(key) pass sim = int(parts[2]) if len(parts) > 2 and parts[2] else None ts = int(parts[3]) if len(parts) > 3 else 0 # noqa: E501 if os.path.exists(p1) and os.path.exists(p2): # Verificar si ya es una excepción conocida (por inodo) # Check if it's already a known exception (by inode) dev1, ino1 = self._get_inode_info(p1) dev2, ino2 = self._get_inode_info(p2) if ino1 and ino2: ex_key = self._get_pair_lmdb_key_from_ids( dev1, ino1, dev2, ino2) if txn.get(ex_key, db=self._exceptions_db): keys_to_delete.append(key) continue results.append( DuplicateResult(p1, p2, None, False, sim, ts)) else: keys_to_delete.append(key) except Exception: keys_to_delete.append(key) continue if keys_to_delete: try: with self._lmdb_env.begin(write=True) as txn: for k in keys_to_delete: if txn.get(k, db=self._pending_db): txn.delete(k, db=self._pending_db) logger.info(f"Cleaned up {len(keys_to_delete)} invalid " "pending duplicates (files deleted externally)") except Exception as e: logger.error( f"Error cleaning up pending duplicates from DB: {e}") for p1, p2 in links_to_ignore: self.mark_as_exception(p1, p2, True, similarity=100) return results def get_all_exceptions(self): """Retrieves all duplicate pairs marked as exceptions from the database.""" results = [] if not self._lmdb_env or self._exceptions_db is None: return results with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: cursor = txn.cursor(db=self._exceptions_db) for key_bytes, value_bytes in cursor: try: p1, p2 = None, None sim = None ts = 0 # noqa: E501 val_str = value_bytes.decode('utf-8') if '|' in val_str: # New format: paths are stored in the value parts = val_str.split('|') if len(parts) >= 2: p1, p2 = parts[0], parts[1] if len(parts) > 2 and parts[2]: sim = int(parts[2]) if len(parts) > 3: # noqa: E501 ts = int(parts[3]) else: ts = int(os.path.getmtime(p1)) \ if os.path.exists(p1) else 0 if not p1 or not p2: # Legacy format fallback: lookup paths in hash db # Legacy format fallback: lookup paths in hash db key_str = key_bytes.decode('utf-8') # noqa: E501 kp = key_str.split('-') if len(kp) == 4: k1, k2 = f"{kp[0]}-{kp[1]}".encode(), f"{kp[2]}-{kp[3]}".encode() v1, v2 = txn.get(k1, db=self._hash_db), \ txn.get(k2, db=self._hash_db) if v1 and v2: # Format is hash|mtime|path|dist... path is always # index 2 p1 = v1.decode('utf-8').split('|')[2] p2 = v2.decode('utf-8').split('|')[2] if p1 and p2: if os.path.exists(p1) and os.path.exists(p2): results.append( DuplicateResult(p1, p2, None, True, sim, ts)) except Exception: continue return results def clean_stale_hashes(self): """ Removes hash entries from the database for files that no longer exist on disk. """ if not self._lmdb_env or self._hash_db is None: return 0 keys_to_delete = [] with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: cursor = txn.cursor(db=self._hash_db) for key, value_bytes in cursor: try: # value_bytes is "hash|mtime|path|last_dist" parts = value_bytes.decode('utf-8').split('|') if len(parts) >= 3: path = parts[2] if not os.path.exists(path): keys_to_delete.append(key) except Exception: keys_to_delete.append(key) # Corrupted entry continue if keys_to_delete: with self._lmdb_env.begin(write=True) as txn: for k in keys_to_delete: txn.delete(k, db=self._hash_db) logger.info(f"Cleaned up {len(keys_to_delete)} stale hash " "entries (files deleted externally)") return len(keys_to_delete) def get_all_hashes_with_paths(self): """Retrieves all hashes from the database along with their associated paths and inode info.""" # hash_value -> [(path, dev_id, inode_key_bytes)] all_hashes = collections.defaultdict(list) if not self._lmdb_env: return all_hashes with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=False) as txn: cursor = txn.cursor(db=self._hash_db) for key_bytes, value_bytes in cursor: # key_bytes is like "dev_id-inode_hex" key_str = key_bytes.decode('utf-8') parts = key_str.split('-') dev_id = int(parts[0]) inode_key_bytes = bytes.fromhex(parts[1]) # value_bytes is "hash|mtime|path|last_dist" parts_val = value_bytes.decode('utf-8').split('|') if len(parts_val) >= 3: hash_value = parts_val[0] path = parts_val[2] else: continue all_hashes[hash_value].append((path, dev_id, inode_key_bytes)) return all_hashes def rename_entry(self, old_path, new_path): """ Updates the cache entry for a file that has been renamed or moved. This involves deleting the old (dev, inode) entry and adding a new one with the new (dev, inode) and path, preserving the hash value. """ old_dev, old_inode_key_bytes = self._get_inode_info(old_path) new_dev, new_inode_key_bytes = self._get_inode_info(new_path) if not old_inode_key_bytes or not new_inode_key_bytes or not self._lmdb_env: return False # If the (dev, inode) pair is the same, only the path in the value needs # updating. # This happens if the file is renamed within the same filesystem. if (old_dev, old_inode_key_bytes) == (new_dev, new_inode_key_bytes): hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes) if hash_value: # noqa: E501 self.add_hash_for_path(new_path, hash_value, mtime) self._update_pair_paths(old_path, new_path, self._pending_db) self._update_pair_paths(old_path, new_path, self._exceptions_db) return True return False # If (dev, inode) changed (cross-filesystem move), we need to: # 1. Get the hash from the old entry. # 2. Remove the old entry. # 3. Add a new entry with the new (dev, inode) and path, using the old hash. hash_value, mtime, _ = self.get_hash_and_path(old_dev, old_inode_key_bytes) if hash_value: # Avoid deleting relationships when renaming (only update path) # Avoid deleting relationships when renaming (only update path) self.remove_hash_for_path(old_path, clear_relationships=False) # Adds new (dev, inode) entry self.add_hash_for_path(new_path, hash_value, mtime) self._update_pair_paths(old_path, new_path, self._pending_db) self._update_pair_paths(old_path, new_path, self._exceptions_db) return True return False def _update_pair_paths(self, old_path, new_path, db_handle): """Updates stored paths in a pair-based DB value when a file is renamed.""" if not self._lmdb_env or db_handle is None: return # Optimization: Pre-calculate the inode ID to filter keys during iteration. # Scanning only keys is much faster than retrieving associated values from disk. # Scanning only keys is much faster than retrieving associated values from disk. dev, ino = self._get_inode_info(old_path) if not ino: return file_id_bytes = f"{dev}-{ino.hex()}".encode('utf-8') with QMutexLocker(self._db_lock): with self._lmdb_env.begin(write=True) as txn: cursor = txn.cursor(db=db_handle) # iternext(values=False) iterates only the index keys, avoiding # unnecessary I/O. for key in cursor.iternext(values=False): # noqa: E501 if file_id_bytes in key: value_bytes = txn.get(key, db=db_handle) if not value_bytes: continue val_str = value_bytes.decode('utf-8') parts = val_str.split('|') # noqa: E501 if len(parts) >= 2: p1, p2 = parts[0], parts[1] # noqa: E501 if p1 == old_path or p2 == old_path: # Actualizamos solo la ruta que ha cambiado parts[0] = new_path if p1 == old_path else p1 parts[1] = new_path if p2 == old_path else p2 txn.put(key, "|".join(parts).encode('utf-8'), db=db_handle) class DuplicateDetector(QThread): """ Worker thread for detecting duplicate images using perceptual hashing. """ progress_update = Signal(int, int, str) # current, total, message duplicates_found = Signal(list) # List of DuplicateResult detection_finished = Signal() def __init__(self, paths_to_scan, duplicate_cache, pool_manager, method="histogram_hashing", threshold=90, force_full=False): super().__init__() self.paths_to_scan = paths_to_scan self.duplicate_cache = duplicate_cache self.pool_manager = pool_manager self.method = method self.threshold = threshold # Similarity percentage (50-100) self.force_full = force_full self._is_running = True def stop(self): self._is_running = False self.wait() # Add this line def run(self): # Asegurar que todas las rutas sean absolutas y normalizadas al inicio para # total consistency # total consistency self.paths_to_scan = [os.path.abspath(os.path.normpath(p)) for p in self.paths_to_scan] # Pre-load exceptions to avoid UnboundLocalError in Phase 1 and maintain # consistency exceptions_set = self.duplicate_cache.get_all_exceptions_set() scan_paths_set = set(self.paths_to_scan) total_files = len(self.paths_to_scan) found_duplicates = [] # To store frozenset((path1, path2)) for uniqueness unique_inode_pairs = set() last_ui_update_time = 0 pool = self.pool_manager.get_pool() # 0. Deduplicate input paths by physical identity (symlinks to same file) # This avoids comparing a file with its symlink or multiple symlinks to # the same target. unique_paths_to_scan = [] canonical_paths = {} # Initialize canonical_paths dictionary for p in self.paths_to_scan: try: # Resolve symlinks to their real physical location rp = os.path.realpath(p) if rp not in canonical_paths: canonical_paths[rp] = p unique_paths_to_scan.append(p) else: # noqa: E501 # It's a symlink or hardlink to something already in the list. # We mark it as an exception (similarity 100) so it doesn't show up. self.duplicate_cache.mark_as_exception( p, canonical_paths[rp], True, similarity=100) except OSError: unique_paths_to_scan.append(p) total_unique = len(unique_paths_to_scan) processed_initial = 0 # Convert similarity threshold (percentage) to Hamming distance distance_threshold = int(MAX_DHASH_DISTANCE * (100 - self.threshold) / 100) logger.info( f"Duplicate detection: Method={self.method}, " f"Similarity Threshold={self.threshold}%, Hamming " f"Distance Threshold={distance_threshold}") # 2. Phase 1: Hash Collection (Parallelized) path_to_hash = {} dirty_paths = set() paths_to_hash_parallel = [] for i, path in enumerate(unique_paths_to_scan): if not self._is_running: break try: stat_info = os.stat(path) mtime = stat_info.st_mtime dev, inode = stat_info.st_dev, struct.pack('Q', stat_info.st_ino) # Update UI during initial cache check (Phase 1 part A) processed_initial += 1 cached_h, _, cached_path = \ self.duplicate_cache.get_hash_info_for_path(path, mtime, dev, inode) if cached_h: # noqa: E501 if cached_path == path: path_to_hash[path] = (cached_h, dev, inode) else: # El archivo se movió o renombró: actualizar caché y marcar # como sucio self.duplicate_cache.add_hash_for_path( path, cached_h, mtime, dev, inode) dirty_paths.add(path) path_to_hash[path] = (cached_h, dev, inode) else: dirty_paths.add(path) paths_to_hash_parallel.append((path, mtime, dev, inode)) if time_module.perf_counter() - last_ui_update_time > 0.05: # Scale this part to 0-50% of the total bar # Scale this part to 0-50% of the total bar progress = int((processed_initial / total_unique) * total_files) self.progress_update.emit( progress, total_files * 2, UITexts.DUPLICATE_MSG_HASHING.format(filename="...")) last_ui_update_time = time_module.perf_counter() except OSError: continue if paths_to_hash_parallel and self._is_running: batch_size = pool.maxThreadCount() * 2 results_mutex = QMutex() new_hashes = {} sem = QSemaphore(0) # Phase 1 part B: Parallel hashing for new/changed files # Phase 1 part B: Parallel hashing for new/changed files processed_hashing = total_files - len(paths_to_hash_parallel) for i in range(0, len(paths_to_hash_parallel), batch_size): if not self._is_running: break current_batch = paths_to_hash_parallel[i : i + batch_size] for p_data in current_batch: pool.start(HashWorker( p_data[0], self, new_hashes, results_mutex, sem)) for j in range(len(current_batch)): while not sem.tryAcquire(1, 100): if not self._is_running: break if not self._is_running: break processed_hashing += 1 if time_module.perf_counter() - last_ui_update_time > 0.03: self.progress_update.emit( processed_hashing, total_files * 2, UITexts.DUPLICATE_MSG_HASHING.format(filename="...")) last_ui_update_time = time_module.perf_counter() for p, mtime, dev, inode in paths_to_hash_parallel: h = new_hashes.get(p) if h: path_to_hash[p] = (h, dev, inode) self.duplicate_cache.add_hash_for_path(p, h, mtime, dev, inode) # Cargar duplicados pendientes existentes (a menos que sea force_full) # Load existing pending duplicates (unless force_full) if not self.force_full: pending = self.duplicate_cache.get_all_pending_duplicates() for p in pending: # Normalize database paths to ensure match with scan set np1, np2 = (os.path.abspath(os.path.normpath(p.path1)), os.path.abspath(os.path.normpath(p.path2))) if np1 in scan_paths_set and np2 in scan_paths_set: # If any of the files changed (is in dirty_paths), the pending # result is invalid and must be ignored for recalculation. if np1 in dirty_paths or np2 in dirty_paths: continue # Omitir identidades físicas (enlaces) # Skip physical identities (links) # Skip physical identities (links) try: if np1 == np2 or os.path.samefile(np1, np2): # Mover de pendiente a excepción silenciosamente si ahora # son links self.duplicate_cache.mark_as_exception( np1, np2, True, similarity=100) self.duplicate_cache.mark_as_pending(np1, np2, False) continue except OSError: continue # Check if already marked as exception # Check if already marked as exception dev1, ino1 = self.duplicate_cache._get_inode_info(np1) dev2, ino2 = self.duplicate_cache._get_inode_info(np2) if ino1 and ino2: id1, id2 = f"{dev1}-{ino1.hex()}", f"{dev2}-{ino2.hex()}" if frozenset((id1, id2)) in exceptions_set: # Si ya es una excepción, asegurar que no esté en pendientes self.duplicate_cache.mark_as_pending(np1, np2, False) continue if p.similarity is None or p.similarity >= self.threshold: # noqa: E501 # Usar las rutas normalizadas en el resultado res = p._replace(path1=np1, path2=np2) found_duplicates.append(res) # Guardar inodos para evitar recalcular en Phase 2 if ino1 and ino2: unique_inode_pairs.add(frozenset((id1, id2))) if not self._is_running: self.detection_finished.emit() return # --- KEY OPTIMIZATION: EARLY EXIT --- # If there are no new or modified files and no full analysis was forced, # devolvemos los resultados que ya estaban en la base de datos de pendientes. if not dirty_paths and not self.force_full: self.progress_update.emit( total_files * 2, total_files * 2, UITexts.DUPLICATE_FINISHED) self.duplicates_found.emit(found_duplicates) self.detection_finished.emit() return # 3. Phase 2: Incremental Comparison using Persistent BK-Tree # 3. Phase 2: Incremental Comparison using Persistent BK-Tree paths_to_query = list(dirty_paths) if not self.force_full \ else unique_paths_to_scan total_queries = len(paths_to_query) results_to_save = [] for i, p1 in enumerate(paths_to_query): if not self._is_running: break h1_data = path_to_hash.get(p1) if not h1_data: continue h1_str, dev1, ino1 = h1_data if time_module.perf_counter() - last_ui_update_time > 0.05: # Scale Analysis progress to 50% - 100% range of the status bar progress = total_files + int((i / total_queries) * total_files) self.progress_update.emit( progress, total_files * 2, UITexts.DUPLICATE_MSG_ANALYZING.format( filename=os.path.basename(p1))) last_ui_update_time = time_module.perf_counter() # Query the persistent tree for similar hashes (direct from disk) similar_hashes = self.duplicate_cache.persistent_bktree_query( h1_str, distance_threshold) for h2_bytes, distance in similar_hashes: # Find all files sharing this similar hash (reverse index) matches = self.duplicate_cache.get_files_for_hash(h2_bytes) for p2, dev2, ino2 in matches: if not self._is_running: break # Check if p2 is within the current scan scope to avoid # results outside the folders the user is currently browsing. if p2 not in scan_paths_set: continue # 1. Check if it's exactly the same path (already normalized) # 1. Check if it's exactly the same path (already normalized) if p1 == p2: continue # 2. Check memory caches for physical identity (Inodes) # 2. Check memory caches for physical identity (Inodes) id1, id2 = f"{dev1}-{ino1.hex()}", f"{dev2}-{ino2.hex()}" inode_pair = frozenset((id1, id2)) if inode_pair in unique_inode_pairs or inode_pair in exceptions_set: continue # 3. Absolute physical identity (pointers to the same object) # 3. Absolute physical identity (pointers to the same object) try: if (dev1, ino1) == (dev2, ino2) or os.path.samefile(p1, p2): # Silent identity (symlinks): mark and skip self.duplicate_cache.mark_as_exception( p1, p2, True, similarity=100) exceptions_set.add(inode_pair) unique_inode_pairs.add(inode_pair) continue except OSError: pass # 4. Avoid duplicating pairs already processed in this session # (A-B is the same as B-A) # (A-B is the same as B-A) if inode_pair in unique_inode_pairs: continue # 5. Calculate actual similarity # 5. Calculate actual similarity sim = int((1.0 - (distance / MAX_DHASH_DISTANCE)) * 100) if sim < self.threshold: continue ts = int(time_module.time()) res = DuplicateResult(p1, p2, h1_str, False, sim, ts) found_duplicates.append(res) unique_inode_pairs.add(inode_pair) results_to_save.append((p1, p2, sim, ts)) # Periodically flush pending updates to DB if len(results_to_save) >= 50: self.duplicate_cache.mark_as_pending_batch(results_to_save) results_to_save = [] # Final flush of remaining updates if results_to_save: self.duplicate_cache.mark_as_pending_batch(results_to_save) self.duplicates_found.emit(found_duplicates) self.detection_finished.emit()