v1.1.0

2026-05-10 16:37:46 +02:00
parent 6207cab27a
commit af21672b1c
3 changed files with 166 additions and 72 deletions
--- a/bagheera_search_lib/bagheera_search.py
+++ b/bagheera_search_lib/bagheera_search.py
@@ -33,31 +33,42 @@ def expression_contains_tags(text):
 class EvaluateExpression:
    def __init__(self):
        # Pre-define the grammar structure during initialization
        self.grammar = self._build_grammar()
    def _compare_single(self, l_val, op, r_val):
-        """
+        # 1. CASE SENSITIVE (Strict)
-        Atomic comparison logic for individual values.
+        if op == "==":
-        Handles numeric conversion and standard operators.
+            return str(l_val) == str(r_val)
-        """
+
-        # Numeric conversion for mathematical operators
+        # 2. NUMERIC LOGIC
        if op in (">", "<", ">=", "<="):
            try:
-                # Attempt to treat both sides as floats
+                # We use float for numeric magnitude
                curr_l, curr_r = float(l_val), float(r_val)
                if op == ">":
                    return curr_l > curr_r
                if op == "<":
                    return curr_l < curr_r
                if op == ">=":
                    return curr_l >= curr_r
                if op == "<=":
                    return curr_l <= curr_r
            except (ValueError, TypeError):
-                # Fallback to string comparison if conversion fails
+                # Fallback to case-insensitive string if not numeric
-                curr_l, curr_r = str(l_val), str(r_val)
+                pass
-        else:
+
-            # Default to string representation for other operators
+        # 3. CASE INSENSITIVE (Default for =, !=, :)
-            curr_l, curr_r = str(l_val), str(r_val)
+        curr_l = str(l_val).lower()
        curr_r = str(r_val).lower()
        # Standard operator logic
        if op == "=":
-            return l_val == r_val
+            return curr_l == curr_r
        if op == "!=":
-            return l_val != r_val
+            return curr_l != curr_r
        if op == ":":
            return curr_r in curr_l
        # String fallback for magnitude if numeric failed
        if op == ">":
            return curr_l > curr_r
        if op == "<":
@@ -66,44 +77,39 @@ class EvaluateExpression:
            return curr_l >= curr_r
        if op == "<=":
            return curr_l <= curr_r
-        if op == ":":
+
            return str(r_val).lower() in str(l_val).lower()
        return False
    def _compare(self, data, left_key, op, right_val):
-        """
+        # Normalizing keys for lookup, but KEEPING the values intact
        Main comparison router. Checks if the field is a list or a single value.
        """
        # Normalize data keys to lowercase for case-insensitive lookup
        normalized_data = {k.lower(): v for k, v in data.items()}
-        # Extract the left-hand value (the field from the JSON)
+        # Get left value from data or use as literal
        l_val = normalized_data.get(left_key.lower(), left_key)
-        # Extract the right-hand value (check if it's a literal or another field)
+        # Resolve right value: if it's a key in data, use its value.
-        r_val = normalized_data.get(str(right_val).lower(), right_val)
+        # Important: use lower() only for the KEY lookup, not the value itself.
        r_key_lookup = str(right_val).lower()
        if r_key_lookup in normalized_data:
            r_val = normalized_data[r_key_lookup]
        else:
            r_val = right_val
        # IF THE FIELD VALUE IS A LIST
        if isinstance(l_val, list):
            # Return True if ANY item in the list satisfies the condition
            return any(self._compare_single(item, op, r_val) for item in l_val)
        # IF THE FIELD VALUE IS A SINGLE DATA POINT
        return self._compare_single(l_val, op, r_val)
    def _build_grammar(self):
-        """
+        # CRITICAL: '==' must come BEFORE '=' in the list
-        Defines the pyparsing grammar for the expression engine.
+        # We use a list to ensure explicit priority in the parser
-        """
+        operators = one_of(["==", ">=", "<=", "!=", "=", ">", "<", ":"])
-        operators = one_of(">= <= != = > < :")
+
        identifier = Word(alphanums + "_./\\")
        quoted_string = QuotedString("'") | QuotedString('"')
        operand = quoted_string | identifier
        # Define basic condition (e.g., "width > 100" or "word")
        condition = Group((operand + operators + operand) | operand)
        # Attach the parse action to convert tokens into executable functions (lambdas)
        condition.set_parse_action(lambda t: self._create_evaluator_func(t[0]))
        return infix_notation(
@@ -119,25 +125,16 @@ class EvaluateExpression:
        )
    def _create_evaluator_func(self, tokens):
        """
        Creates a closure that captures tokens and waits for the data dictionary.
        """
        if len(tokens) == 1:
            # Rule: Single term -> path CONTAINS term
            return lambda data: self._compare(data, 'path', ':', tokens[0])
        else:
            # Rule: Explicit triplet (key, operator, value)
            return lambda data: self._compare(data, tokens[0], tokens[1], tokens[2])
    def compile(self, expression):
        """
        Parses the expression once and returns a reusable function.
        """
        try:
            return self.grammar.parse_string(expression, parse_all=True)[0]
        except Exception as e:
            print(f"Compilation Error: {e}")
            # Fallback: return a function that always fails gracefully
            return lambda data: False
@@ -250,7 +247,7 @@ class BagheeraSearcher:
            self.ids_processed.add(file_id)
            if exclude_evaluator:
-                file_info = {'path': item["path"]}
+                file_info = {'path': item["path"], 'filename': Path(item["path"]).name}
                if exclude_sources.get('properties'):
                    file_info = file_info | get_info(file_id)
                if exclude_sources.get('tags'):
@@ -325,7 +322,7 @@ class BagheeraSearcher:
            self.ids_processed.add(file_id)
            if exclude_evaluator:
-                file_info = {'path': item["path"]}
+                file_info = {'path': item["path"], 'filename': Path(item["path"]).name}
                if exclude_sources.get('properties'):
                    file_info = file_info | get_info(file_id)
                if exclude_sources.get('tags'):
--- a/bagheerasearch.py
+++ b/bagheerasearch.py
@@ -14,7 +14,7 @@ __status__ = "Production"
 import argparse
 import json
-import signal
+import os
 import sys
 from pathlib import Path
 # from baloo_tools import get_resolution
@@ -26,7 +26,7 @@ PROG_NAME = "Bagheera Search Tool"
 PROG_ID = "bagheerasearch"
 PROG_VERSION = __version__
 PROG_BY = __author__
-PROG_DATE = "2026-05-09"
+PROG_DATE = "2026-05-10"
 CONFIG_DIR = Path.home() / ".config" / PROG_ID
 CONFIG_FILE = CONFIG_DIR / "config.json"
@@ -55,7 +55,7 @@ def save_config(config: dict) -> None:
 def print_help_query() -> None:
    """Prints the detailed help for query syntax."""
-    help_query = f"""Help updated to 2025-01-01.
+    help_query = f"""{PROG_NAME} uses the Baloo search engine, which is part of the KDE ecosystem, to perform file searches. The help below was obtained from the Baloo documentation on 2025-01-01, with some additional information, and may not be up to date with the latest features or changes in Baloo. For the most current information, please refer to the official Baloo documentation or resources.
 Baloo offers a rich syntax for searching through your files. Certain attributes of a file can be searched through.
@@ -63,7 +63,7 @@ For example 'type' can be used to filter for files based on their general type:
  type:Audio OR type:Document
-The following comparison operators are supported, but note that 'not equal' (!=) operator is not available.
+The following comparison operators are supported, but note that 'not equal' (!=) operator is not available in Baloo search engine.
  · :   - contains (only for text comparison)
  · =   - equal
  · >   - greater than
@@ -71,7 +71,7 @@ The following comparison operators are supported, but note that 'not equal' (!=)
  · <   - less than
  · <=  - less than or equal to
-Currently the following types are supported:
+Currently the following types, to use in --type property, are supported:
  · Archive
  · Folder
  · Audio
@@ -90,7 +90,7 @@ The full list of properties which can be searched is listed below. They are grou
 All Files
  · filename
  · mimetype
-  · modified
+  · modified (formatted as yyyy-MM-dd[ hh[:mm[:ss]]])
  · rating
  · tags
  · userComment
@@ -103,7 +103,7 @@ Audio
  · Channels
  · Comment
  · Composer
-  · Duration
+  · Duration (this value must be in seconds, for example use 'duration > 300' to find files longer than 5 minutes)
  · Genre
  · Lyricist
  · ReleaseYear
@@ -113,7 +113,7 @@ Audio
 Documents
  · Author
  · Copyright
-  · CreationDate
+  · CreationDate (formatted as yyyy-MM-dd[ hh[:mm[:ss]]])
  · Generator
  · Keywords
  · Language
@@ -153,8 +153,44 @@ Media
  · PhotoWhiteBalance
  · Width
 Next properties are undocumented but available in source code, may work or not, but worth trying:
  · AssistiveAlternateDescription
  · Arranger
  · AudioCodec
  · ColorSpace
  · Compilation
  · Conductor
  · Description
  · DiscNumber
  · Ensemble
  · Label
  · License
  · Location
  · Lyrics
  · Manufacturer
  · Model
  · Opus
  · OriginUrl
  · OriginEmailSubject
  · OriginEmailSender
  · OriginEmailMessageId
  · Performer
  · PixelFormat
  · ReplayGainAlbumPeak
  · ReplayGainAlbumGain
  · ReplayGainTrackPeak
  · ReplayGainTrackGain
  · TranslationUnitsTotal
  · TranslationUnitsWithTranslation
  · TranslationUnitsWithDraftTranslation
  · TranslationLastAuthor
  · TranslationLastUpDate
  · TranslationTemplateDate
  · VideoCodec
-{PROG_NAME} recognizes some natural language sentences in English, as long as they are capitalized, and transforms them into queries that can be interpreted by the search engine.
+Baloo documentation ends here, but {PROG_NAME} adds some extra features on top of it.
 Search engine recognizes some natural language sentences in English, as long as they are capitalized, and transforms them into queries that can be interpreted by the search engine.
 Supported natural language sentences and patterns for queries are:
  · MODIFIED TODAY
@@ -166,12 +202,18 @@ Supported natural language sentences and patterns for queries are:
 <NUMBER> can be any number or a number text from ONE to TWENTY.
-The --exclude and --recursive-exclude options allow you to filter files out of the results. The syntax for both options supports parentheses and logical operators (AND, OR, and NOT) to combine multiple patterns.
+The --exclude and --recursive-exclude options allow you to filter files out of the results.
 The syntax for both options supports parentheses and logical operators (AND, OR, and NOT) to combine multiple patterns.
 In addition to standard query comparison operators, the not equal (!=) operator is available for comparing properties against specific values. Furthermore, you can compare two properties directly; for example, 'width > height' is a valid expression.
 Remarks:
-  · All text comparison are case insensitive.
+  · Text comparisons are case sensitive with the '==' operator but case insensitive with the '=' and ':' operators. For example, 'filename:report' would match 'report.docx', 'Report.docx', and 'REPORT.docx', while 'filename==report.docx' would only match 'report.docx'.
 · Tags comparisons are performed against both individual full tag string (using the '/' character as a level separator) and each individual level. All individual level values are normalized to lowercase and stripped of accents or diacritics. For example, a file tagged as 'Opera,Person/María Callas,Singer' would match any of the following elements: ['Opera', 'Person/María Callas', 'Singer', 'callas', 'maria', 'opera', 'person', 'singer'].
-  · Only text and numeric data are supported."""
+  · Only text and numeric data are supported, dates are not supported as of now.
  · Baloo limit of at least three characters for property values is not applied in --exclude and --recursive-exclude options, so you can use shorter values in those options.
 For example, if you have a tag named 'Science' and another one named 'Science Fiction', you can't obtain only the results tagged with 'Science', because the Baloo search engine will match both the 'Science' and 'Science Fiction' tags when you use 'tags:Science' in your query. To exclude results tagged with 'Science Fiction' you can use the following query:
    {PROG_ID} --exclude tags:Fiction tags:Science"""
    print(help_query)
@@ -183,19 +225,12 @@ def print_version() -> None:
        "the good people at KDE"
    )
 def signal_handler(sig, frame) -> None:
    """Report the cancellation and exit with status 0.

    Takes the (sig, frame) pair passed to signal handlers; neither argument
    is used.
    """
    print("\nSearch canceled at user request.")
    raise SystemExit(0)
 def main():
    parser = argparse.ArgumentParser(
        description="An improved search tool for Baloo"
    )
    parser.add_argument("query", nargs="?", help="list of words to query for")
-    parser.add_argument("-d", "--directory", help="limit search to specified directory")
+    parser.add_argument("-d", "--directory", help="limit search to specified directory tree")
    parser.add_argument("-e", "--exclude", help="Search exclude pattern")
    parser.add_argument("-i", "--id", action="store_true", help="show document IDs")
    parser.add_argument("-k", "--konsole", action="store_true", help="show files using file:/ and quotes")
@@ -203,7 +238,7 @@ def main():
    parser.add_argument("-o", "--offset", type=int, help="offset from which to start the search")
    parser.add_argument("-r", "--recursive", nargs="?", const="", default=None, help="enable recurse with or without a query")
    parser.add_argument("-n", "--recursive-indent", help="recursive indent character")
-    parser.add_argument("-x", "--recursive-exclude", help="recursion exclude pattern")
+    parser.add_argument("-x", "--recursive-exclude", help="recursion exclude query")
    parser.add_argument("-s", "--sort", help="sorting criteria <auto|none>")
    parser.add_argument("-t", "--type", help="type of Baloo data to be searched")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose mode")
@@ -318,16 +353,30 @@ def main():
    except FileNotFoundError as e:
        print(e)
        sys.exit(1)
    except KeyboardInterrupt:
        # Catch Ctrl+C inside main for an immediate, clean exit
        print("\nSearch canceled at user request.")
        sys.exit(0)
    except BrokenPipeError:
        # Silence errors when output is piped to 'head' or 'less' and the pipe is closed
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)
    except Exception as e:
        print(f"Error executing search: {e}")
        sys.exit(1)
 if __name__ == "__main__":
    # Script entry point: register the Ctrl+C handler, then run main().
    # NOTE(review): this change set drops 'import signal' from the module
    # imports; confirm 'signal' and 'signal_handler' are still in scope here.
    signal.signal(signal.SIGINT, signal_handler)
    try:
        main()
    except KeyboardInterrupt:
        # Fallback in case the interruption happens outside main's own handler
        print("\nSearch canceled at user request.")
        try:
            sys.exit(0)
        except SystemExit:
            # Force immediate process termination instead of propagating SystemExit
            os._exit(0)
    except Exception as e:
        # Last-resort handler: report the error and exit with a failure status
        print(f"Critical error: {e}")
        sys.exit(1)
--- a/baloo_tools/baloo_tools.py
+++ b/baloo_tools/baloo_tools.py
@@ -10,6 +10,7 @@ import lmdb
 import os
 import re
 import sys
 import unicodedata
 from typing import Tuple
 PROPERTIES_ID_MAP = {
@@ -100,6 +101,18 @@ PROPERTIES_ID_MAP = {
 }
 def normalize_text(text):
    """
    Return `text` without accents/diacritics, for string comparison.

    The text is decomposed (NFD), every character in Unicode category 'Mn'
    is dropped, and surrounding whitespace is stripped. Letter case is left
    unchanged. Falsy input (None or "") yields "".
    """
    if not text:
        return ""
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return "".join(kept).strip()
 class BalooTools:
    """Class to interact directly with the Baloo LMDB index."""
@@ -214,11 +227,46 @@ class BalooTools:
                            for p in parts:
                                p = p.strip()
                                if p:
-                                    tag = p.removeprefix('TAG-').removeprefix('TA')
+                                    """ 'TA' elements are tags normalized to lowercase
-                                    tags.append(tag)
+                                    and stripped of accents/diacritics, while 'TAG'
                                    elements are the original tags as they were added by
                                    the user. We need to process both to ensure we can
                                    match tags in a case-insensitive and
                                    accent-insensitive way. But we only want to add the
                                    original tags to the final result, not the
                                    normalized ones, because the normalized ones do
                                    not correctly handle tags with spaces or words with
                                    fewer than three characters.
                                    """
                                    if p.startswith('TAG-'):
                                        tag = p.removeprefix('TAG-')
                                        tags.append(tag)
-                            return {'tags': tags}
+                            result_set = set(tags)
-                            # return {'tags': ",".join(tags)}
+
                            """ Must add individual parts of the tags to the result set
                            to be able to match them with queries like 'tags:callas'
                            or 'tags:maria' for tags "María Callas" or "Person/María
                            Callas". To maintain Baloo tag behaviour with spaces, it's
                            not possible to search for tags="María Callas" and must
                            search for tags=María tags:Callas, items with spaces are
                            not added to avoid confusion."""
                            for item in tags:
                                parts = re.split(r'[ /\n\t]+', item)
                                for part in parts:
                                    if part:
                                        result_set.add(part)
                                        normalize_part = normalize_text(part)
                                        if normalize_part:
                                            result_set.add(normalize_part)
                            tags = sorted(list(result_set))
                            if not tags:
                                return {}
                            else:
                                return {'tags': tags}
        except lmdb.Error as e:
            print(f"Warning: Failed to access Baloo LMDB index: {e}", file=sys.stderr)