Files
BagheeraSearch/bagheera_query_parser_lib/bagheera_query_parser.py.sav
Ignacio Serantes 3fb55ee4f3 First commit
2026-03-22 18:13:22 +01:00

171 lines
6.3 KiB
Python

#!/usr/bin/env python
"""
Bagheera Query Parser
Converts natural language English date expressions into Baloo-compatible queries.
"""
import re
from datetime import datetime, timedelta
from typing import Dict
class BagheeraQueryParser:
    """Translate upper-case English date phrases into Baloo ``modified`` filters.

    Recognised phrases (keywords are case-sensitive and must be upper-case):

    * ``MODIFIED TODAY`` / ``MODIFIED YESTERDAY``
    * ``MODIFIED THIS|LAST YEAR|MONTH|WEEK``
    * ``MODIFIED LAST <N> YEAR|MONTH|WEEK|DAY[S]``
    * ``MODIFIED <N> YEAR|MONTH|WEEK|DAY[S] AGO``

    ``<N>`` is a decimal number or a written number from ONE to TWENTY.
    Weeks start on Monday. Every range is half-open: ``start <= modified < end``.
    """

    # Written numbers understood by _convert_numbers (ONE to TWENTY).
    _NUMBER_WORDS: Dict[str, int] = {
        'ONE': 1, 'TWO': 2, 'THREE': 3, 'FOUR': 4, 'FIVE': 5,
        'SIX': 6, 'SEVEN': 7, 'EIGHT': 8, 'NINE': 9, 'TEN': 10,
        'ELEVEN': 11, 'TWELVE': 12, 'THIRTEEN': 13, 'FOURTEEN': 14,
        'FIFTEEN': 15, 'SIXTEEN': 16, 'SEVENTEEN': 17, 'EIGHTEEN': 18,
        'NINETEEN': 19, 'TWENTY': 20,
    }

    _DATE_FORMAT = '%Y-%m-%d'

    # Compiled once; parse_date applies them in this order.
    _RE_TODAY = re.compile(r'\bMODIFIED TODAY\b')
    _RE_YESTERDAY = re.compile(r'\bMODIFIED YESTERDAY\b')
    _RE_THIS_LAST = re.compile(r"\bMODIFIED (LAST|THIS) (YEAR|MONTH|WEEK)\b")
    _RE_LAST_N = re.compile(r"\bMODIFIED LAST (\d+) (YEAR|MONTH|WEEK|DAY)S?\b")
    _RE_N_AGO = re.compile(r"\bMODIFIED (\d+) (YEAR|MONTH|WEEK|DAY)S? AGO\b")

    def __init__(self):
        # Initial value only: parse_date() refreshes 'today' on every call so
        # a process that stays open for days never works with a stale date.
        self.today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

    def _convert_numbers(self, query: str) -> str:
        """Replace written numbers (ONE to TWENTY) with their digits.

        Only whole, upper-case, whitespace-delimited words are converted, so
        ``two`` or ``TWO,`` are left alone. The query is split on whitespace
        and re-joined with single spaces, which collapses runs of whitespace
        and drops leading/trailing whitespace.
        """
        number_words = self._NUMBER_WORDS
        return " ".join(str(number_words.get(word, word)) for word in query.split())

    def _get_start_of_unit(self, dt, unit, offset=0):
        """Return the first day of the ``unit`` that lies ``offset`` units before ``dt``.

        Args:
            dt: Reference datetime; its time-of-day fields are preserved.
            unit: One of 'YEAR', 'MONTH', 'WEEK' or 'DAY'.
            offset: Whole units to go back (0 = the unit containing ``dt``);
                negative values move forward.

        Raises:
            ValueError: If ``unit`` is not supported or the result is out of
                the range ``datetime`` can represent.
        """
        if unit == 'YEAR':
            return dt.replace(year=dt.year - offset, month=1, day=1)
        if unit == 'MONTH':
            # Count months linearly so any offset (including negative ones)
            # rolls over year boundaries correctly.
            month_count = dt.year * 12 + (dt.month - 1) - offset
            year, month_index = divmod(month_count, 12)
            return dt.replace(year=year, month=month_index + 1, day=1)
        if unit == 'WEEK':
            # weekday() is 0 on Monday, so this lands on the week's Monday.
            return dt - timedelta(days=dt.weekday() + offset * 7)
        if unit == 'DAY':
            return dt - timedelta(days=offset)
        raise ValueError(f"Unsupported unit: {unit!r}")

    def _subtract_units(self, dt, unit, n):
        """Return ``dt`` moved ``n`` units into the past.

        YEAR, WEEK and DAY keep the position inside the unit (29 February is
        clamped to 28 February when the target year is not a leap year).
        MONTH snaps to the first day of the target month, so a
        ``LAST <N> MONTHS`` window always starts on a month boundary.

        Raises:
            ValueError: If ``unit`` is not supported or the result is out of
                the range ``datetime`` can represent.
        """
        if unit == 'YEAR':
            try:
                return dt.replace(year=dt.year - n)
            except ValueError:
                # 29 February does not exist in a non-leap target year.
                if dt.month == 2 and dt.day == 29:
                    return dt.replace(year=dt.year - n, day=28)
                raise
        if unit == 'MONTH':
            return self._get_start_of_unit(dt, 'MONTH', offset=n)
        if unit == 'WEEK':
            return dt - timedelta(weeks=n)
        if unit == 'DAY':
            return dt - timedelta(days=n)
        raise ValueError(f"Unsupported unit: {unit!r}")

    def parse_date(self, query, today=None):
        """Rewrite every recognised date phrase in ``query`` as a Baloo filter.

        Args:
            query: Search text that may contain date phrases.
            today: Optional ``datetime`` used as the reference day instead of
                ``datetime.now()``; its time of day is discarded.

        Returns:
            The query with written numbers converted to digits and each date
            phrase replaced by ``modified=YYYY-MM-DD`` or
            ``(modified>=YYYY-MM-DD AND modified<YYYY-MM-DD)``. A phrase whose
            dates would fall outside the range ``datetime`` supports is left
            as it was.
        """
        reference = datetime.now() if today is None else today
        self.today = reference.replace(hour=0, minute=0, second=0, microsecond=0)
        midnight = self.today
        tomorrow = midnight + timedelta(days=1)
        date_format = self._DATE_FORMAT

        def span(start, end):
            return (f"(modified>={start.strftime(date_format)}"
                    f" AND modified<{end.strftime(date_format)})")

        def guarded(build):
            # Absurdly large counts ("99999 YEARS") overflow datetime; keep
            # the original phrase instead of aborting the whole query.
            def replacement(match):
                try:
                    return build(match)
                except (ValueError, OverflowError):
                    return match.group(0)
            return replacement

        q = self._convert_numbers(query)

        # 1. TODAY / YESTERDAY
        q = self._RE_TODAY.sub(f"modified={midnight.strftime(date_format)}", q)
        yesterday = midnight - timedelta(days=1)
        q = self._RE_YESTERDAY.sub(f"modified={yesterday.strftime(date_format)}", q)

        # 2. (LAST|THIS) (YEAR|MONTH|WEEK)
        def replace_simple(match):
            mod, unit = match.groups()
            current_start = self._get_start_of_unit(midnight, unit)
            if mod == "THIS":
                # From the start of the current unit up to and including today.
                return span(current_start, tomorrow)
            # The whole previous unit.
            previous_start = self._get_start_of_unit(midnight, unit, offset=1)
            return span(previous_start, current_start)

        q = self._RE_THIS_LAST.sub(guarded(replace_simple), q)

        # 3. LAST <N> (YEAR|MONTH|WEEK|DAY): from N units back through today.
        def replace_last_n(match):
            count, unit = match.groups()
            return span(self._subtract_units(midnight, unit, int(count)), tomorrow)

        q = self._RE_LAST_N.sub(guarded(replace_last_n), q)

        # 4. <N> (YEAR|MONTH|WEEK|DAY) AGO: exactly the calendar unit N units
        #    back, so "1 WEEK AGO" covers the same range as "LAST WEEK".
        def replace_ago(match):
            count, unit = match.groups()
            count = int(count)
            start = self._get_start_of_unit(midnight, unit, offset=count)
            end = self._get_start_of_unit(midnight, unit, offset=count - 1)
            return span(start, end)

        q = self._RE_N_AGO.sub(guarded(replace_ago), q)
        return q
if __name__ == '__main__':
    # Basic smoke run for the date parsing: results are printed, not asserted.
    supported_phrases = [
        "MODIFIED TODAY",
        "first MODIFIED YESTERDAY last",
        "MODIFIED ONE DAY AGO",
        "MODIFIED TWO DAYS AGO",
        "MODIFIED THREE DAYS AGO",
        "MODIFIED LAST TWO DAYS",
        "MODIFIED THIS WEEK",
        "MODIFIED LAST WEEK",
        "MODIFIED LAST TWO WEEKS",
        "MODIFIED ONE WEEK AGO",
        "MODIFIED TWO WEEKS AGO",
        "MODIFIED THREE WEEKS AGO",
        "MODIFIED THIS MONTH",
        "MODIFIED LAST MONTH",
        "MODIFIED LAST TWO MONTHS",
        "MODIFIED ONE MONTH AGO",
        "MODIFIED TWO MONTHS AGO",
        "MODIFIED THREE MONTHS AGO",
        "MODIFIED THIS YEAR",
        "MODIFIED LAST YEAR",
        "MODIFIED LAST TWO YEARS",
        "MODIFIED ONE YEAR AGO",
        "MODIFIED TWO YEARS AGO",
        "MODIFIED THREE YEARS AGO",
        "foto MODIFIED LAST 2 YEARS",
    ]
    # Run-together or lower-case keywords: none of the parser's patterns
    # match these, so they exercise the pass-through path.
    passthrough_phrases = [
        "MODIFIED TODAYMODIFIED TODAY",
        "MODIFIED yesterday",
        "MODIFIED THIS MONTHMODIFIED THIS WEEK",
        "MODIFIED LAST YEARMODIFIED YESTERDAY",
        "modified TODAY",
        "modified today",
    ]
    # Each batch gets a fresh parser and its own header line.
    for batch in (supported_phrases, passthrough_phrases):
        demo_parser = BagheeraQueryParser()
        print(f"Testing {__file__}:")
        for phrase in batch:
            print(f" Input: '{phrase}'")
            print(f" Output: '{demo_parser.parse_date(phrase)}'")
            print("-" * 20)