From 75013ebfc3d91c7a588a73e1dab09c186f4a6e9f Mon Sep 17 00:00:00 2001
From: Robert Helewka <r@helu.ca>
Date: Sat, 23 May 2026 21:52:51 -0400
Subject: [PATCH] refactor(concepts): document-level extraction with one chat
 call per item
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Concept extraction was making up to 10 LLM calls per item by sampling
chunks. That produced redundant work (the same concept reappears in
multiple chunks) and context-loss bugs (chunk boundaries cut
mid-thought), and on a 35B model it dominated per-item wall time
(~3 min/item).

Concepts are document-level semantic objects; chunks are retrieval
units. Extract once per item from the first 100,000 characters of
parsed document text, then connect each chunk to the concepts it
explicitly mentions via case-insensitive substring match — no extra
LLM calls. Because a single call now covers the whole document, the
prompt's concept cap rises from 20 to 60 and max_tokens from 1000 to
4000. Drops the _select_sample_indices helper that the old per-chunk
loop relied on.

Stage 7 (concept extraction) is currently dormant in production
because the configured chat model is a reasoning-mode Qwen variant
that returns empty content on every call (its output is stuck in
reasoning_content). The stage re-enables cleanly once a non-reasoning
instruct model is set as is_system_chat_model.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 mnemosyne/library/services/concepts.py | 107 ++++++++++++-------------
 mnemosyne/library/services/pipeline.py |   5 +-
 2 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/mnemosyne/library/services/concepts.py b/mnemosyne/library/services/concepts.py
index fc5c490..b2f397e 100644
--- a/mnemosyne/library/services/concepts.py
+++ b/mnemosyne/library/services/concepts.py
@@ -22,7 +22,7 @@ Return a JSON array of objects, each with:
 - "type": one of "person", "place", "topic", "technique", "theme"
 
 Only extract significant, specific concepts — not generic words.
-Return at most 20 concepts. Return ONLY the JSON array, no other text.
+Return at most 60 concepts. Return ONLY the JSON array, no other text.
 
 Text:
 {text}"""
@@ -49,60 +49,76 @@ class ConceptExtractor:
         item,
         chunk_nodes: list,
         chunk_texts: list[str],
+        document_text: str,
     ) -> int:
         """
-        Extract concepts from all chunks of an item.
+        Extract concepts from a document with one LLM call.
+
+        Concepts are document-level semantic objects; chunks are
+        retrieval units. Extracting per-chunk produced redundant calls
+        for the same entity, mid-thought context loss at chunk boundaries,
+        and 10x the cost. Single document-level call + post-hoc substring
+        matching for chunk linkage instead.
 
         :param item: Item node.
-        :param chunk_nodes: List of Chunk nodes.
-        :param chunk_texts: List of chunk text strings.
+        :param chunk_nodes: List of Chunk nodes (for MENTIONS linkage).
+        :param chunk_texts: List of chunk text strings (for substring match).
+        :param document_text: Full parsed document text.
         :returns: Total number of unique concepts extracted.
         """
-        all_concepts: dict[str, str] = {}  # name -> type
+        # 100k chars is ~25k tokens — fits comfortably in Qwen's 192k window.
+        # Front-loading assumes any concept that matters later in the document
+        # has already been introduced within its first ~30 pages.
+        DOC_EXTRACT_CHARS = 100_000
 
-        # Sample chunks for extraction (don't process every chunk for large docs)
-        sample_indices = self._select_sample_indices(len(chunk_texts), max_samples=10)
+        text_for_extraction = document_text[:DOC_EXTRACT_CHARS]
+        if not text_for_extraction.strip():
+            return 0
 
-        for idx in sample_indices:
-            chunk_text = chunk_texts[idx]
-            chunk_node = chunk_nodes[idx]
-
-            concepts = self._extract_from_text(chunk_text)
-            if not concepts:
+        raw_concepts = self._extract_from_text(text_for_extraction)
+        all_concepts: dict[str, str] = {}
+        for concept_data in raw_concepts:
+            name = concept_data.get("name", "").strip().lower()
+            if not name or len(name) < 2:
                 continue
+            all_concepts[name] = concept_data.get("type", "topic")
 
-            for concept_data in concepts:
-                name = concept_data.get("name", "").strip().lower()
-                concept_type = concept_data.get("type", "topic")
+        if not all_concepts:
+            logger.info("No concepts extracted for item_uid=%s", item.uid)
+            return 0
 
-                if not name or len(name) < 2:
-                    continue
+        # Resolve every concept once, then both link chunks (MENTIONS) and
+        # link the item (REFERENCES). Chunk linkage is substring-based —
+        # zero extra LLM calls, catches every explicit mention.
+        concept_nodes: dict[str, object] = {}
+        for name, concept_type in all_concepts.items():
+            node = self._get_or_create_concept(name, concept_type)
+            if node:
+                concept_nodes[name] = node
 
-                all_concepts[name] = concept_type
-
-                # Connect chunk -> concept via MENTIONS
-                concept_node = self._get_or_create_concept(name, concept_type)
-                if concept_node:
+        for chunk_node, chunk_text in zip(chunk_nodes, chunk_texts):
+            chunk_lower = chunk_text.lower()
+            for name, concept_node in concept_nodes.items():
+                if name in chunk_lower:
                     try:
                         chunk_node.mentions.connect(concept_node)
                     except Exception:
                         pass  # Already connected
 
-        # Connect item -> all concepts via REFERENCES
-        for name, concept_type in all_concepts.items():
-            concept_node = self._get_or_create_concept(name, concept_type)
-            if concept_node:
-                try:
-                    item.concepts.connect(concept_node, {"weight": 1.0})
-                except Exception:
-                    pass  # Already connected
-
-                CONCEPTS_EXTRACTED_TOTAL.labels(concept_type=concept_type).inc()
+        for name, concept_node in concept_nodes.items():
+            try:
+                item.concepts.connect(concept_node, {"weight": 1.0})
+            except Exception:
+                pass  # Already connected
+            CONCEPTS_EXTRACTED_TOTAL.labels(
+                concept_type=all_concepts[name]
+            ).inc()
 
         logger.info(
-            "Extracted %d concepts for item_uid=%s",
+            "Extracted %d concepts for item_uid=%s from %d chars",
             len(all_concepts),
             item.uid,
+            len(text_for_extraction),
         )
         return len(all_concepts)
 
@@ -110,13 +126,9 @@ class ConceptExtractor:
         """
         Call the chat model to extract concepts from text.
 
-        :param text: Text to analyze.
+        :param text: Text to analyze (caller is responsible for sizing).
         :returns: List of concept dicts with 'name' and 'type' keys.
         """
-        # Truncate very long text to avoid token limits
-        if len(text) > 3000:
-            text = text[:3000]
-
         prompt = CONCEPT_EXTRACTION_PROMPT.format(text=text)
 
         try:
@@ -164,7 +176,7 @@ class ConceptExtractor:
                 "model": self.chat_model.name,
                 "messages": [{"role": "user", "content": prompt}],
                 "temperature": 0.1,
-                "max_tokens": 1000,
+                "max_tokens": 4000,
             }
 
         resp = requests.post(
@@ -250,18 +262,3 @@ class ConceptExtractor:
             logger.debug("Failed to get/create concept '%s': %s", name, exc)
             return None
 
-    def _select_sample_indices(
-        self, total: int, max_samples: int = 10
-    ) -> list[int]:
-        """
-        Select evenly-spaced sample indices for concept extraction.
-
-        :param total: Total number of chunks.
-        :param max_samples: Maximum samples to take.
-        :returns: List of chunk indices to process.
-        """
-        if total <= max_samples:
-            return list(range(total))
-
-        step = total / max_samples
-        return [int(i * step) for i in range(max_samples)]
diff --git a/mnemosyne/library/services/pipeline.py b/mnemosyne/library/services/pipeline.py
index 4020508..11e0d10 100644
--- a/mnemosyne/library/services/pipeline.py
+++ b/mnemosyne/library/services/pipeline.py
@@ -274,9 +274,12 @@ class EmbeddingPipeline:
         # --- Stage 7: Concept extraction ---
         chat_model = LLMModel.get_system_chat_model()
         if chat_model and chunk_result.chunks:
+            document_text = "\n\n".join(
+                block.text for block in parse_result.text_blocks
+            )
             extractor = ConceptExtractor(chat_model, user=self.user)
             concepts_count = extractor.extract_for_item(
-                item, chunk_nodes, chunk_result.chunks
+                item, chunk_nodes, chunk_result.chunks, document_text
             )
             result["concepts_extracted"] = concepts_count