fix(search): require library match and preserve raw scores for RRF

Replace OPTIONAL MATCH with MATCH for Library-Collection-Item paths to
ensure results are properly scoped to libraries, and remove per-query
score normalization since RRF fuses results by rank rather than score
magnitude.
This commit is contained in:
2026-04-26 06:35:11 -04:00
parent 4a35aa126f
commit 388b37e471
3 changed files with 55 additions and 360 deletions

View File

@@ -247,7 +247,7 @@ class SearchService:
CALL db.index.vector.queryNodes('chunk_embedding_index', $top_k, $query_vector)
YIELD node AS chunk, score
MATCH (item:Item)-[:HAS_CHUNK]->(chunk)
OPTIONAL MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
WHERE ($library_uid IS NULL OR lib.uid = $library_uid)
AND ($library_type IS NULL OR lib.library_type = $library_type)
AND ($collection_uid IS NULL OR col.uid = $collection_uid)
@@ -352,7 +352,7 @@ class SearchService:
CALL db.index.fulltext.queryNodes('chunk_text_fulltext', $query)
YIELD node AS chunk, score
MATCH (item:Item)-[:HAS_CHUNK]->(chunk)
OPTIONAL MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
WHERE ($library_uid IS NULL OR lib.uid = $library_uid)
AND ($library_type IS NULL OR lib.library_type = $library_type)
AND ($collection_uid IS NULL OR col.uid = $collection_uid)
@@ -374,15 +374,13 @@ class SearchService:
try:
results, _ = db.cypher_query(cypher, params)
# Normalize BM25 scores to 0-1 range
max_score = max((float(r[7]) for r in results if r[7]), default=1.0)
# Keep raw BM25 scores — RRF fuses by rank, not by score magnitude.
for row in results:
uid = row[0]
if not uid:
continue
raw_score = float(row[7]) if row[7] else 0.0
normalized = raw_score / max_score if max_score > 0 else 0.0
if uid not in candidates or normalized > candidates[uid].score:
if uid not in candidates or raw_score > candidates[uid].score:
candidates[uid] = SearchCandidate(
chunk_uid=uid,
text_preview=row[1] or "",
@@ -391,7 +389,7 @@ class SearchService:
item_uid=row[4] or "",
item_title=row[5] or "",
library_type=row[6] or "",
score=normalized,
score=raw_score,
source="fulltext",
)
except Exception as exc:
@@ -409,7 +407,7 @@ class SearchService:
YIELD node AS concept, score AS concept_score
MATCH (chunk:Chunk)-[:MENTIONS]->(concept)
MATCH (item:Item)-[:HAS_CHUNK]->(chunk)
OPTIONAL MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
MATCH (lib:Library)-[:CONTAINS]->(:Collection)-[:CONTAINS]->(item)
WHERE ($library_uid IS NULL OR lib.uid = $library_uid)
AND ($library_type IS NULL OR lib.library_type = $library_type)
RETURN chunk.uid AS chunk_uid, chunk.text_preview AS text_preview,
@@ -430,14 +428,13 @@ class SearchService:
try:
results, _ = db.cypher_query(cypher, params)
max_score = max((float(r[7]) for r in results if r[7]), default=1.0)
# Raw scores already include the 0.8 concept downweight from Cypher.
for row in results:
uid = row[0]
if not uid:
continue
raw_score = float(row[7]) if row[7] else 0.0
normalized = raw_score / max_score if max_score > 0 else 0.0
if uid not in candidates or normalized > candidates[uid].score:
if uid not in candidates or raw_score > candidates[uid].score:
candidates[uid] = SearchCandidate(
chunk_uid=uid,
text_preview=row[1] or "",
@@ -446,7 +443,7 @@ class SearchService:
item_uid=row[4] or "",
item_title=row[5] or "",
library_type=row[6] or "",
score=normalized,
score=raw_score,
source="fulltext",
)
except Exception as exc:
@@ -476,17 +473,17 @@ class SearchService:
LIMIT 10
MATCH (chunk:Chunk)-[:MENTIONS]->(concept)
MATCH (item:Item)-[:HAS_CHUNK]->(chunk)
OPTIONAL MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
MATCH (lib:Library)-[:CONTAINS]->(:Collection)-[:CONTAINS]->(item)
WHERE ($library_uid IS NULL OR lib.uid = $library_uid)
AND ($library_type IS NULL OR lib.library_type = $library_type)
WITH chunk, item, lib, concept, concept_score,
count(DISTINCT concept) AS concept_count
RETURN DISTINCT chunk.uid AS chunk_uid, chunk.text_preview AS text_preview,
WITH chunk, item, lib,
max(concept_score) AS score,
collect(DISTINCT concept.name)[..5] AS concept_names
RETURN chunk.uid AS chunk_uid, chunk.text_preview AS text_preview,
chunk.chunk_s3_key AS chunk_s3_key, chunk.chunk_index AS chunk_index,
item.uid AS item_uid, item.title AS item_title,
lib.library_type AS library_type,
concept_score AS score,
collect(concept.name)[..5] AS concept_names
score, concept_names
ORDER BY score DESC
LIMIT $limit
"""
@@ -504,16 +501,12 @@ class SearchService:
logger.error("Graph search failed: %s", exc)
return []
# Normalize scores
max_score = max((float(r[7]) for r in results if r[7]), default=1.0)
candidates = []
for row in results:
uid = row[0]
if not uid:
continue
raw_score = float(row[7]) if row[7] else 0.0
normalized = raw_score / max_score if max_score > 0 else 0.0
concept_names = row[8] if len(row) > 8 else []
candidates.append(
@@ -525,7 +518,7 @@ class SearchService:
item_uid=row[4] or "",
item_title=row[5] or "",
library_type=row[6] or "",
score=normalized,
score=raw_score,
source="graph",
metadata={"concepts": concept_names},
)
@@ -562,7 +555,7 @@ class SearchService:
YIELD node AS emb_node, score
MATCH (img:Image)-[:HAS_EMBEDDING]->(emb_node)
MATCH (item:Item)-[:HAS_IMAGE]->(img)
OPTIONAL MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
MATCH (lib:Library)-[:CONTAINS]->(:Collection)-[:CONTAINS]->(item)
WHERE ($library_uid IS NULL OR lib.uid = $library_uid)
AND ($library_type IS NULL OR lib.library_type = $library_type)
RETURN img.uid AS image_uid, img.image_type AS image_type,
@@ -642,11 +635,13 @@ class SearchService:
try:
client = RerankerClient(reranker_model, user=self.user)
# Don't pass top_n — let the reranker score every candidate so
# cross-attention can promote items the RRF stage ranked low.
# Final trimming to request.limit happens in search().
reranked = client.rerank(
query=request.query,
candidates=candidates_to_rerank,
instruction=instruction,
top_n=request.limit,
query_image=request.query_image,
)
return reranked, reranker_model.name
@@ -660,22 +655,27 @@ class SearchService:
# Helpers
# ------------------------------------------------------------------
GENERIC_RERANKER_INSTRUCTION = (
"Re-rank these passages by relevance to the query."
)
def _get_reranker_instruction(
self, request: SearchRequest, candidates: list[SearchCandidate]
) -> str:
"""
Get the content-type-aware reranker instruction.
If scoped to a library or library type, use that type's instruction.
If mixed types, use a generic instruction.
Scoped queries (by library or library type) use that type's
instruction. Unscoped queries — even when results happen to
come mostly from one type — use a generic instruction so the
reranker is not biased toward the majority type.
:param request: SearchRequest.
:param candidates: Candidates (used to detect dominant library type).
:param candidates: Candidates (unused; kept for API stability).
:returns: Reranker instruction string.
"""
from library.content_types import get_library_type_config
# Use explicit library type from request
if request.library_type:
try:
config = get_library_type_config(request.library_type)
@@ -683,25 +683,12 @@ class SearchService:
except ValueError:
pass
# Use library UID to look up type
if request.library_uid:
return self._get_library_reranker_instruction(request.library_uid)
instruction = self._get_library_reranker_instruction(request.library_uid)
if instruction:
return instruction
# Detect dominant type from candidates
type_counts: dict[str, int] = {}
for c in candidates:
if c.library_type:
type_counts[c.library_type] = type_counts.get(c.library_type, 0) + 1
if type_counts:
dominant_type = max(type_counts, key=type_counts.get)
try:
config = get_library_type_config(dominant_type)
return config.get("reranker_instruction", "")
except ValueError:
pass
return ""
return self.GENERIC_RERANKER_INSTRUCTION
def _get_library_reranker_instruction(self, library_uid: str) -> str:
"""Get reranker_instruction from a Library node."""
@@ -710,7 +697,12 @@ class SearchService:
lib = Library.nodes.get(uid=library_uid)
return lib.reranker_instruction or ""
except Exception:
except Exception as exc:
logger.warning(
"Failed to load reranker_instruction for library_uid=%s: %s",
library_uid,
exc,
)
return ""
def _get_embedding_instruction(self, library_uid: str) -> str:
@@ -720,7 +712,12 @@ class SearchService:
lib = Library.nodes.get(uid=library_uid)
return lib.embedding_instruction or ""
except Exception:
except Exception as exc:
logger.warning(
"Failed to load embedding_instruction for library_uid=%s: %s",
library_uid,
exc,
)
return ""
def _get_type_embedding_instruction(self, library_type: str) -> str:

View File

@@ -225,8 +225,12 @@ class SearchServiceHelperTest(TestCase):
instruction = service._get_reranker_instruction(request, [])
self.assertIn("fiction", instruction.lower())
def test_get_reranker_instruction_from_candidates(self):
"""Detects dominant library type from candidate list."""
def test_get_reranker_instruction_generic_for_unscoped(self):
"""
Unscoped queries get the generic instruction even when candidates
all share a library_type — type-specific instructions could bias
the reranker against minority-type results.
"""
service = SearchService()
request = SearchRequest(query="test")
candidates = [
@@ -240,10 +244,10 @@ class SearchServiceHelperTest(TestCase):
]
instruction = service._get_reranker_instruction(request, candidates)
self.assertIn("technical", instruction.lower())
self.assertEqual(instruction, SearchService.GENERIC_RERANKER_INSTRUCTION)
def test_get_reranker_instruction_empty_when_no_context(self):
"""Returns empty when no library type context available."""
def test_get_reranker_instruction_generic_when_no_context(self):
"""Returns the generic instruction when no library scope is set."""
service = SearchService()
request = SearchRequest(query="test")
candidates = [
@@ -256,4 +260,4 @@ class SearchServiceHelperTest(TestCase):
]
instruction = service._get_reranker_instruction(request, candidates)
self.assertEqual(instruction, "")
self.assertEqual(instruction, SearchService.GENERIC_RERANKER_INSTRUCTION)