From 75013ebfc3d91c7a588a73e1dab09c186f4a6e9f Mon Sep 17 00:00:00 2001 From: Robert Helewka Date: Sat, 23 May 2026 21:52:51 -0400 Subject: [PATCH] refactor(concepts): document-level extraction with one chat call per item MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Concept extraction was making up to 10 LLM calls per item by sampling chunks, which produced redundant work (the same concept reappears in multiple chunks), context-loss bugs (chunk boundaries cut mid-thought), and on a 35B model dominated per-item wall time (~3 min/item). Concepts are document-level semantic objects; chunks are retrieval units. Extract once per item from the first 100KB of parsed document text, then connect each chunk to the concepts it explicitly mentions via case-insensitive substring match — no extra LLM calls. Drops the sample-indices selector that the old per-chunk loop relied on. Stage 7 is currently dormant in production because the configured chat model is a reasoning-mode Qwen variant that returns empty content on every call (output stuck in reasoning_content). Re-enables cleanly once a non-reasoning instruct model is set as is_system_chat_model. Co-Authored-By: Claude Opus 4.7 --- mnemosyne/library/services/concepts.py | 107 ++++++++++++------------- mnemosyne/library/services/pipeline.py | 5 +- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/mnemosyne/library/services/concepts.py b/mnemosyne/library/services/concepts.py index fc5c490..b2f397e 100644 --- a/mnemosyne/library/services/concepts.py +++ b/mnemosyne/library/services/concepts.py @@ -22,7 +22,7 @@ Return a JSON array of objects, each with: - "type": one of "person", "place", "topic", "technique", "theme" Only extract significant, specific concepts — not generic words. -Return at most 20 concepts. Return ONLY the JSON array, no other text. +Return at most 60 concepts. Return ONLY the JSON array, no other text. Text: {text}""" @@ -49,60 +49,76 @@ class ConceptExtractor: item, chunk_nodes: list, chunk_texts: list[str], + document_text: str, ) -> int: """ - Extract concepts from all chunks of an item. + Extract concepts from a document with one LLM call. + + Concepts are document-level semantic objects; chunks are + retrieval units. Extracting per-chunk produced redundant calls + for the same entity, mid-thought context loss at chunk boundaries, + and 10x the cost. Single document-level call + post-hoc substring + matching for chunk linkage instead. :param item: Item node. - :param chunk_nodes: List of Chunk nodes. - :param chunk_texts: List of chunk text strings. + :param chunk_nodes: List of Chunk nodes (for MENTIONS linkage). + :param chunk_texts: List of chunk text strings (for substring match). + :param document_text: Full parsed document text. :returns: Total number of unique concepts extracted. """ - all_concepts: dict[str, str] = {} # name -> type + # 100KB covers ~25k tokens — fits comfortably in Qwen's 192k window + # and front-loads on the assumption that any concept introduced later + # has already appeared in the document's first ~30 pages. + DOC_EXTRACT_CHARS = 100_000 - # Sample chunks for extraction (don't process every chunk for large docs) - sample_indices = self._select_sample_indices(len(chunk_texts), max_samples=10) + text_for_extraction = document_text[:DOC_EXTRACT_CHARS] + if not text_for_extraction.strip(): + return 0 - for idx in sample_indices: - chunk_text = chunk_texts[idx] - chunk_node = chunk_nodes[idx] - - concepts = self._extract_from_text(chunk_text) - if not concepts: + raw_concepts = self._extract_from_text(text_for_extraction) + all_concepts: dict[str, str] = {} + for concept_data in raw_concepts: + name = concept_data.get("name", "").strip().lower() + if not name or len(name) < 2: continue + all_concepts[name] = concept_data.get("type", "topic") - for concept_data in concepts: - name = concept_data.get("name", "").strip().lower() - concept_type = concept_data.get("type", "topic") + if not all_concepts: + logger.info("No concepts extracted for item_uid=%s", item.uid) + return 0 - if not name or len(name) < 2: - continue + # Resolve every concept once, then both link chunks (MENTIONS) and + # link the item (REFERENCES). Chunk linkage is substring-based — + # zero extra LLM calls, catches every explicit mention. + concept_nodes: dict[str, object] = {} + for name, concept_type in all_concepts.items(): + node = self._get_or_create_concept(name, concept_type) + if node: + concept_nodes[name] = node - all_concepts[name] = concept_type - - # Connect chunk -> concept via MENTIONS - concept_node = self._get_or_create_concept(name, concept_type) - if concept_node: + for chunk_node, chunk_text in zip(chunk_nodes, chunk_texts): + chunk_lower = chunk_text.lower() + for name, concept_node in concept_nodes.items(): + if name in chunk_lower: try: chunk_node.mentions.connect(concept_node) except Exception: pass # Already connected - # Connect item -> all concepts via REFERENCES - for name, concept_type in all_concepts.items(): - concept_node = self._get_or_create_concept(name, concept_type) - if concept_node: - try: - item.concepts.connect(concept_node, {"weight": 1.0}) - except Exception: - pass # Already connected - - CONCEPTS_EXTRACTED_TOTAL.labels(concept_type=concept_type).inc() + for name, concept_node in concept_nodes.items(): + try: + item.concepts.connect(concept_node, {"weight": 1.0}) + except Exception: + pass # Already connected + CONCEPTS_EXTRACTED_TOTAL.labels( + concept_type=all_concepts[name] + ).inc() logger.info( - "Extracted %d concepts for item_uid=%s", + "Extracted %d concepts for item_uid=%s from %d chars", len(all_concepts), item.uid, + len(text_for_extraction), ) return len(all_concepts) @@ -110,13 +126,9 @@ class ConceptExtractor: """ Call the chat model to extract concepts from text. - :param text: Text to analyze. + :param text: Text to analyze (caller is responsible for sizing). :returns: List of concept dicts with 'name' and 'type' keys. """ - # Truncate very long text to avoid token limits - if len(text) > 3000: - text = text[:3000] - prompt = CONCEPT_EXTRACTION_PROMPT.format(text=text) try: @@ -164,7 +176,7 @@ class ConceptExtractor: "model": self.chat_model.name, "messages": [{"role": "user", "content": prompt}], "temperature": 0.1, - "max_tokens": 1000, + "max_tokens": 4000, } resp = requests.post( @@ -250,18 +262,3 @@ class ConceptExtractor: logger.debug("Failed to get/create concept '%s': %s", name, exc) return None - def _select_sample_indices( - self, total: int, max_samples: int = 10 - ) -> list[int]: - """ - Select evenly-spaced sample indices for concept extraction. - - :param total: Total number of chunks. - :param max_samples: Maximum samples to take. - :returns: List of chunk indices to process. - """ - if total <= max_samples: - return list(range(total)) - - step = total / max_samples - return [int(i * step) for i in range(max_samples)] diff --git a/mnemosyne/library/services/pipeline.py b/mnemosyne/library/services/pipeline.py index 4020508..11e0d10 100644 --- a/mnemosyne/library/services/pipeline.py +++ b/mnemosyne/library/services/pipeline.py @@ -274,9 +274,12 @@ class EmbeddingPipeline: # --- Stage 7: Concept extraction --- chat_model = LLMModel.get_system_chat_model() if chat_model and chunk_result.chunks: + document_text = "\n\n".join( + block.text for block in parse_result.text_blocks + ) extractor = ConceptExtractor(chat_model, user=self.user) concepts_count = extractor.extract_for_item( - item, chunk_nodes, chunk_result.chunks + item, chunk_nodes, chunk_result.chunks, document_text ) result["concepts_extracted"] = concepts_count