diff --git a/navi/memory/store.py b/navi/memory/store.py index 249d6a1..953628b 100644 --- a/navi/memory/store.py +++ b/navi/memory/store.py @@ -1,11 +1,31 @@ """Persistent memory store — facts about the user, backed by PostgreSQL.""" import asyncio +import re import uuid from datetime import datetime, timezone import asyncpg +_SEPARATORS = re.compile(r'[-_/\\.]') # treat as word boundaries +_NOISE = re.compile(r'[^\w\s]', re.UNICODE) # strip remaining punctuation + + +def _normalize_query(query: str) -> list[str]: + """Return a clean list of search terms from a raw query string. + + - Hyphens, underscores, slashes, dots → spaces (web-search → [web, search]) + - All remaining punctuation stripped (commas, quotes, parens …) + - Lowercased, split on whitespace + - Single-character tokens dropped + """ + q = _SEPARATORS.sub(' ', query) + q = _NOISE.sub(' ', q) + return [t for t in q.lower().split() if len(t) > 1] + + +_AUTO_DUMP_THRESHOLD = 60 # if the DB has ≤ this many facts, skip search and return all + _DDL_STATEMENTS = [ """CREATE TABLE IF NOT EXISTS memory_facts ( id TEXT PRIMARY KEY, @@ -70,29 +90,49 @@ ) async def search_facts(self, query: str, limit: int = 15) -> list[dict]: - terms = [t for t in query.lower().split() if len(t) > 1] + terms = _normalize_query(query) if not terms: return await self.get_all_facts(limit=limit) - params: list = [] - conditions_parts: list[str] = [] + # Small DB — skip search entirely, just return the most recent facts + if await self.fact_count() <= _AUTO_DUMP_THRESHOLD: + return await self.get_all_facts(limit=limit) + + # Build per-term ILIKE conditions and a shared parameter list + base_params: list = [] + term_conds: list[str] = [] for term in terms: like = f"%{term}%" - base = len(params) + 1 - conditions_parts.append( - f"(category ILIKE ${base} OR key ILIKE ${base + 1} OR value ILIKE ${base + 2})" + i = len(base_params) + 1 + term_conds.append( + f"(category ILIKE ${i} OR key ILIKE ${i + 1} OR value ILIKE ${i + 2})" ) - params.extend([like, like, like]) + base_params.extend([like, like, like]) - limit_idx = len(params) + 1 - params.append(limit) + limit_idx = len(base_params) + 1 + and_where = " AND ".join(term_conds) + or_where = " OR ".join(term_conds) pool = await self._get_pool() async with pool.acquire() as conn: + # 1. AND — all terms must match (most precise) rows = await conn.fetch( f"SELECT id, category, key, value, updated_at FROM memory_facts " - f"WHERE {' OR '.join(conditions_parts)} ORDER BY updated_at DESC LIMIT ${limit_idx}", - *params, + f"WHERE {and_where} ORDER BY updated_at DESC LIMIT ${limit_idx}", + *base_params, limit, + ) + if rows: + return [_row_to_dict(r) for r in rows] + + # 2. OR with relevance score — facts matching more terms rank higher + score_expr = " + ".join( + f"CASE WHEN {c} THEN 1 ELSE 0 END" for c in term_conds + ) + rows = await conn.fetch( + f"SELECT id, category, key, value, updated_at, ({score_expr}) AS score " + f"FROM memory_facts WHERE {or_where} " + f"ORDER BY score DESC, updated_at DESC LIMIT ${limit_idx}", + *base_params, limit, ) return [_row_to_dict(r) for r in rows]