fix(search): require library match and preserve raw scores for RRF
Replace OPTIONAL MATCH with MATCH for Library-Collection-Item paths to ensure results are properly scoped to libraries, and remove per-query score normalization since RRF fuses results by rank rather than score magnitude.
This commit is contained in:
@@ -247,7 +247,7 @@ class SearchService:
|
||||
CALL db.index.vector.queryNodes('chunk_embedding_index', $top_k, $query_vector)
|
||||
YIELD node AS chunk, score
|
||||
MATCH (item:Item)-[:HAS_CHUNK]->(chunk)
|
||||
OPTIONAL MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
|
||||
MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
|
||||
WHERE ($library_uid IS NULL OR lib.uid = $library_uid)
|
||||
AND ($library_type IS NULL OR lib.library_type = $library_type)
|
||||
AND ($collection_uid IS NULL OR col.uid = $collection_uid)
|
||||
@@ -352,7 +352,7 @@ class SearchService:
|
||||
CALL db.index.fulltext.queryNodes('chunk_text_fulltext', $query)
|
||||
YIELD node AS chunk, score
|
||||
MATCH (item:Item)-[:HAS_CHUNK]->(chunk)
|
||||
OPTIONAL MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
|
||||
MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
|
||||
WHERE ($library_uid IS NULL OR lib.uid = $library_uid)
|
||||
AND ($library_type IS NULL OR lib.library_type = $library_type)
|
||||
AND ($collection_uid IS NULL OR col.uid = $collection_uid)
|
||||
@@ -374,15 +374,13 @@ class SearchService:
|
||||
|
||||
try:
|
||||
results, _ = db.cypher_query(cypher, params)
|
||||
# Normalize BM25 scores to 0-1 range
|
||||
max_score = max((float(r[7]) for r in results if r[7]), default=1.0)
|
||||
# Keep raw BM25 scores — RRF fuses by rank, not by score magnitude.
|
||||
for row in results:
|
||||
uid = row[0]
|
||||
if not uid:
|
||||
continue
|
||||
raw_score = float(row[7]) if row[7] else 0.0
|
||||
normalized = raw_score / max_score if max_score > 0 else 0.0
|
||||
if uid not in candidates or normalized > candidates[uid].score:
|
||||
if uid not in candidates or raw_score > candidates[uid].score:
|
||||
candidates[uid] = SearchCandidate(
|
||||
chunk_uid=uid,
|
||||
text_preview=row[1] or "",
|
||||
@@ -391,7 +389,7 @@ class SearchService:
|
||||
item_uid=row[4] or "",
|
||||
item_title=row[5] or "",
|
||||
library_type=row[6] or "",
|
||||
score=normalized,
|
||||
score=raw_score,
|
||||
source="fulltext",
|
||||
)
|
||||
except Exception as exc:
|
||||
@@ -409,7 +407,7 @@ class SearchService:
|
||||
YIELD node AS concept, score AS concept_score
|
||||
MATCH (chunk:Chunk)-[:MENTIONS]->(concept)
|
||||
MATCH (item:Item)-[:HAS_CHUNK]->(chunk)
|
||||
OPTIONAL MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
|
||||
MATCH (lib:Library)-[:CONTAINS]->(:Collection)-[:CONTAINS]->(item)
|
||||
WHERE ($library_uid IS NULL OR lib.uid = $library_uid)
|
||||
AND ($library_type IS NULL OR lib.library_type = $library_type)
|
||||
RETURN chunk.uid AS chunk_uid, chunk.text_preview AS text_preview,
|
||||
@@ -430,14 +428,13 @@ class SearchService:
|
||||
|
||||
try:
|
||||
results, _ = db.cypher_query(cypher, params)
|
||||
max_score = max((float(r[7]) for r in results if r[7]), default=1.0)
|
||||
# Raw scores already include the 0.8 concept downweight from Cypher.
|
||||
for row in results:
|
||||
uid = row[0]
|
||||
if not uid:
|
||||
continue
|
||||
raw_score = float(row[7]) if row[7] else 0.0
|
||||
normalized = raw_score / max_score if max_score > 0 else 0.0
|
||||
if uid not in candidates or normalized > candidates[uid].score:
|
||||
if uid not in candidates or raw_score > candidates[uid].score:
|
||||
candidates[uid] = SearchCandidate(
|
||||
chunk_uid=uid,
|
||||
text_preview=row[1] or "",
|
||||
@@ -446,7 +443,7 @@ class SearchService:
|
||||
item_uid=row[4] or "",
|
||||
item_title=row[5] or "",
|
||||
library_type=row[6] or "",
|
||||
score=normalized,
|
||||
score=raw_score,
|
||||
source="fulltext",
|
||||
)
|
||||
except Exception as exc:
|
||||
@@ -476,17 +473,17 @@ class SearchService:
|
||||
LIMIT 10
|
||||
MATCH (chunk:Chunk)-[:MENTIONS]->(concept)
|
||||
MATCH (item:Item)-[:HAS_CHUNK]->(chunk)
|
||||
OPTIONAL MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
|
||||
MATCH (lib:Library)-[:CONTAINS]->(:Collection)-[:CONTAINS]->(item)
|
||||
WHERE ($library_uid IS NULL OR lib.uid = $library_uid)
|
||||
AND ($library_type IS NULL OR lib.library_type = $library_type)
|
||||
WITH chunk, item, lib, concept, concept_score,
|
||||
count(DISTINCT concept) AS concept_count
|
||||
RETURN DISTINCT chunk.uid AS chunk_uid, chunk.text_preview AS text_preview,
|
||||
WITH chunk, item, lib,
|
||||
max(concept_score) AS score,
|
||||
collect(DISTINCT concept.name)[..5] AS concept_names
|
||||
RETURN chunk.uid AS chunk_uid, chunk.text_preview AS text_preview,
|
||||
chunk.chunk_s3_key AS chunk_s3_key, chunk.chunk_index AS chunk_index,
|
||||
item.uid AS item_uid, item.title AS item_title,
|
||||
lib.library_type AS library_type,
|
||||
concept_score AS score,
|
||||
collect(concept.name)[..5] AS concept_names
|
||||
score, concept_names
|
||||
ORDER BY score DESC
|
||||
LIMIT $limit
|
||||
"""
|
||||
@@ -504,16 +501,12 @@ class SearchService:
|
||||
logger.error("Graph search failed: %s", exc)
|
||||
return []
|
||||
|
||||
# Normalize scores
|
||||
max_score = max((float(r[7]) for r in results if r[7]), default=1.0)
|
||||
|
||||
candidates = []
|
||||
for row in results:
|
||||
uid = row[0]
|
||||
if not uid:
|
||||
continue
|
||||
raw_score = float(row[7]) if row[7] else 0.0
|
||||
normalized = raw_score / max_score if max_score > 0 else 0.0
|
||||
concept_names = row[8] if len(row) > 8 else []
|
||||
|
||||
candidates.append(
|
||||
@@ -525,7 +518,7 @@ class SearchService:
|
||||
item_uid=row[4] or "",
|
||||
item_title=row[5] or "",
|
||||
library_type=row[6] or "",
|
||||
score=normalized,
|
||||
score=raw_score,
|
||||
source="graph",
|
||||
metadata={"concepts": concept_names},
|
||||
)
|
||||
@@ -562,7 +555,7 @@ class SearchService:
|
||||
YIELD node AS emb_node, score
|
||||
MATCH (img:Image)-[:HAS_EMBEDDING]->(emb_node)
|
||||
MATCH (item:Item)-[:HAS_IMAGE]->(img)
|
||||
OPTIONAL MATCH (lib:Library)-[:CONTAINS]->(col:Collection)-[:CONTAINS]->(item)
|
||||
MATCH (lib:Library)-[:CONTAINS]->(:Collection)-[:CONTAINS]->(item)
|
||||
WHERE ($library_uid IS NULL OR lib.uid = $library_uid)
|
||||
AND ($library_type IS NULL OR lib.library_type = $library_type)
|
||||
RETURN img.uid AS image_uid, img.image_type AS image_type,
|
||||
@@ -642,11 +635,13 @@ class SearchService:
|
||||
|
||||
try:
|
||||
client = RerankerClient(reranker_model, user=self.user)
|
||||
# Don't pass top_n — let the reranker score every candidate so
|
||||
# cross-attention can promote items the RRF stage ranked low.
|
||||
# Final trimming to request.limit happens in search().
|
||||
reranked = client.rerank(
|
||||
query=request.query,
|
||||
candidates=candidates_to_rerank,
|
||||
instruction=instruction,
|
||||
top_n=request.limit,
|
||||
query_image=request.query_image,
|
||||
)
|
||||
return reranked, reranker_model.name
|
||||
@@ -660,22 +655,27 @@ class SearchService:
|
||||
# Helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
GENERIC_RERANKER_INSTRUCTION = (
|
||||
"Re-rank these passages by relevance to the query."
|
||||
)
|
||||
|
||||
def _get_reranker_instruction(
|
||||
self, request: SearchRequest, candidates: list[SearchCandidate]
|
||||
) -> str:
|
||||
"""
|
||||
Get the content-type-aware reranker instruction.
|
||||
|
||||
If scoped to a library or library type, use that type's instruction.
|
||||
If mixed types, use a generic instruction.
|
||||
Scoped queries (by library or library type) use that type's
|
||||
instruction. Unscoped queries — even when results happen to
|
||||
come mostly from one type — use a generic instruction so the
|
||||
reranker is not biased toward the majority type.
|
||||
|
||||
:param request: SearchRequest.
|
||||
:param candidates: Candidates (used to detect dominant library type).
|
||||
:param candidates: Candidates (unused; kept for API stability).
|
||||
:returns: Reranker instruction string.
|
||||
"""
|
||||
from library.content_types import get_library_type_config
|
||||
|
||||
# Use explicit library type from request
|
||||
if request.library_type:
|
||||
try:
|
||||
config = get_library_type_config(request.library_type)
|
||||
@@ -683,25 +683,12 @@ class SearchService:
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Use library UID to look up type
|
||||
if request.library_uid:
|
||||
return self._get_library_reranker_instruction(request.library_uid)
|
||||
instruction = self._get_library_reranker_instruction(request.library_uid)
|
||||
if instruction:
|
||||
return instruction
|
||||
|
||||
# Detect dominant type from candidates
|
||||
type_counts: dict[str, int] = {}
|
||||
for c in candidates:
|
||||
if c.library_type:
|
||||
type_counts[c.library_type] = type_counts.get(c.library_type, 0) + 1
|
||||
|
||||
if type_counts:
|
||||
dominant_type = max(type_counts, key=type_counts.get)
|
||||
try:
|
||||
config = get_library_type_config(dominant_type)
|
||||
return config.get("reranker_instruction", "")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return ""
|
||||
return self.GENERIC_RERANKER_INSTRUCTION
|
||||
|
||||
def _get_library_reranker_instruction(self, library_uid: str) -> str:
|
||||
"""Get reranker_instruction from a Library node."""
|
||||
@@ -710,7 +697,12 @@ class SearchService:
|
||||
|
||||
lib = Library.nodes.get(uid=library_uid)
|
||||
return lib.reranker_instruction or ""
|
||||
except Exception:
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"Failed to load reranker_instruction for library_uid=%s: %s",
|
||||
library_uid,
|
||||
exc,
|
||||
)
|
||||
return ""
|
||||
|
||||
def _get_embedding_instruction(self, library_uid: str) -> str:
|
||||
@@ -720,7 +712,12 @@ class SearchService:
|
||||
|
||||
lib = Library.nodes.get(uid=library_uid)
|
||||
return lib.embedding_instruction or ""
|
||||
except Exception:
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"Failed to load embedding_instruction for library_uid=%s: %s",
|
||||
library_uid,
|
||||
exc,
|
||||
)
|
||||
return ""
|
||||
|
||||
def _get_type_embedding_instruction(self, library_type: str) -> str:
|
||||
|
||||
@@ -225,8 +225,12 @@ class SearchServiceHelperTest(TestCase):
|
||||
instruction = service._get_reranker_instruction(request, [])
|
||||
self.assertIn("fiction", instruction.lower())
|
||||
|
||||
def test_get_reranker_instruction_from_candidates(self):
|
||||
"""Detects dominant library type from candidate list."""
|
||||
def test_get_reranker_instruction_generic_for_unscoped(self):
|
||||
"""
|
||||
Unscoped queries get the generic instruction even when candidates
|
||||
all share a library_type — type-specific instructions could bias
|
||||
the reranker against minority-type results.
|
||||
"""
|
||||
service = SearchService()
|
||||
request = SearchRequest(query="test")
|
||||
candidates = [
|
||||
@@ -240,10 +244,10 @@ class SearchServiceHelperTest(TestCase):
|
||||
]
|
||||
|
||||
instruction = service._get_reranker_instruction(request, candidates)
|
||||
self.assertIn("technical", instruction.lower())
|
||||
self.assertEqual(instruction, SearchService.GENERIC_RERANKER_INSTRUCTION)
|
||||
|
||||
def test_get_reranker_instruction_empty_when_no_context(self):
|
||||
"""Returns empty when no library type context available."""
|
||||
def test_get_reranker_instruction_generic_when_no_context(self):
|
||||
"""Returns the generic instruction when no library scope is set."""
|
||||
service = SearchService()
|
||||
request = SearchRequest(query="test")
|
||||
candidates = [
|
||||
@@ -256,4 +260,4 @@ class SearchServiceHelperTest(TestCase):
|
||||
]
|
||||
|
||||
instruction = service._get_reranker_instruction(request, candidates)
|
||||
self.assertEqual(instruction, "")
|
||||
self.assertEqual(instruction, SearchService.GENERIC_RERANKER_INSTRUCTION)
|
||||
|
||||
Reference in New Issue
Block a user