refactor(concepts): document-level extraction with one chat call per item
All checks were successful
CVE Scan & Docker Build / security-scan (push) Successful in 3m20s
Build & Deploy Docs / build-and-deploy (push) Successful in 1m8s
CVE Scan & Docker Build / build-and-push (push) Successful in 2m49s

Concept extraction was making up to 10 LLM calls per item by sampling
chunks. This produced redundant work (the same concept reappears in
multiple chunks) and context-loss bugs (chunk boundaries cut
mid-thought), and on a 35B model it dominated per-item wall time
(~3 min/item).

Concepts are document-level semantic objects; chunks are retrieval
units. Extract once per item from the first 100,000 characters
(~100KB) of parsed document text, then connect each chunk to the
concepts it explicitly mentions via case-insensitive substring match —
no extra LLM calls. This also drops the sample-indices selector that
the old per-chunk loop relied on.

Stage 7 is currently dormant in production because the configured
chat model is a reasoning-mode Qwen variant that returns empty content
on every call (output stuck in reasoning_content). Stage 7 re-enables
cleanly once a non-reasoning instruct model is set as
is_system_chat_model.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-23 21:52:51 -04:00
parent bc80d90b38
commit 75013ebfc3
2 changed files with 56 additions and 56 deletions

View File

@@ -22,7 +22,7 @@ Return a JSON array of objects, each with:
- "type": one of "person", "place", "topic", "technique", "theme" - "type": one of "person", "place", "topic", "technique", "theme"
Only extract significant, specific concepts — not generic words. Only extract significant, specific concepts — not generic words.
Return at most 20 concepts. Return ONLY the JSON array, no other text. Return at most 60 concepts. Return ONLY the JSON array, no other text.
Text: Text:
{text}""" {text}"""
@@ -49,60 +49,76 @@ class ConceptExtractor:
item, item,
chunk_nodes: list, chunk_nodes: list,
chunk_texts: list[str], chunk_texts: list[str],
document_text: str,
) -> int: ) -> int:
""" """
Extract concepts from all chunks of an item. Extract concepts from a document with one LLM call.
Concepts are document-level semantic objects; chunks are
retrieval units. Extracting per-chunk produced redundant calls
for the same entity, mid-thought context loss at chunk boundaries,
and 10x the cost. Single document-level call + post-hoc substring
matching for chunk linkage instead.
:param item: Item node. :param item: Item node.
:param chunk_nodes: List of Chunk nodes. :param chunk_nodes: List of Chunk nodes (for MENTIONS linkage).
:param chunk_texts: List of chunk text strings. :param chunk_texts: List of chunk text strings (for substring match).
:param document_text: Full parsed document text.
:returns: Total number of unique concepts extracted. :returns: Total number of unique concepts extracted.
""" """
all_concepts: dict[str, str] = {} # name -> type # 100KB covers ~25k tokens — fits comfortably in Qwen's 192k window
# and front-loads on the assumption that any concept introduced later
# has already appeared in the document's first ~30 pages.
DOC_EXTRACT_CHARS = 100_000
# Sample chunks for extraction (don't process every chunk for large docs) text_for_extraction = document_text[:DOC_EXTRACT_CHARS]
sample_indices = self._select_sample_indices(len(chunk_texts), max_samples=10) if not text_for_extraction.strip():
return 0
for idx in sample_indices: raw_concepts = self._extract_from_text(text_for_extraction)
chunk_text = chunk_texts[idx] all_concepts: dict[str, str] = {}
chunk_node = chunk_nodes[idx] for concept_data in raw_concepts:
concepts = self._extract_from_text(chunk_text)
if not concepts:
continue
for concept_data in concepts:
name = concept_data.get("name", "").strip().lower() name = concept_data.get("name", "").strip().lower()
concept_type = concept_data.get("type", "topic")
if not name or len(name) < 2: if not name or len(name) < 2:
continue continue
all_concepts[name] = concept_data.get("type", "topic")
all_concepts[name] = concept_type if not all_concepts:
logger.info("No concepts extracted for item_uid=%s", item.uid)
return 0
# Connect chunk -> concept via MENTIONS # Resolve every concept once, then both link chunks (MENTIONS) and
concept_node = self._get_or_create_concept(name, concept_type) # link the item (REFERENCES). Chunk linkage is substring-based —
if concept_node: # zero extra LLM calls, catches every explicit mention.
concept_nodes: dict[str, object] = {}
for name, concept_type in all_concepts.items():
node = self._get_or_create_concept(name, concept_type)
if node:
concept_nodes[name] = node
for chunk_node, chunk_text in zip(chunk_nodes, chunk_texts):
chunk_lower = chunk_text.lower()
for name, concept_node in concept_nodes.items():
if name in chunk_lower:
try: try:
chunk_node.mentions.connect(concept_node) chunk_node.mentions.connect(concept_node)
except Exception: except Exception:
pass # Already connected pass # Already connected
# Connect item -> all concepts via REFERENCES for name, concept_node in concept_nodes.items():
for name, concept_type in all_concepts.items():
concept_node = self._get_or_create_concept(name, concept_type)
if concept_node:
try: try:
item.concepts.connect(concept_node, {"weight": 1.0}) item.concepts.connect(concept_node, {"weight": 1.0})
except Exception: except Exception:
pass # Already connected pass # Already connected
CONCEPTS_EXTRACTED_TOTAL.labels(
CONCEPTS_EXTRACTED_TOTAL.labels(concept_type=concept_type).inc() concept_type=all_concepts[name]
).inc()
logger.info( logger.info(
"Extracted %d concepts for item_uid=%s", "Extracted %d concepts for item_uid=%s from %d chars",
len(all_concepts), len(all_concepts),
item.uid, item.uid,
len(text_for_extraction),
) )
return len(all_concepts) return len(all_concepts)
@@ -110,13 +126,9 @@ class ConceptExtractor:
""" """
Call the chat model to extract concepts from text. Call the chat model to extract concepts from text.
:param text: Text to analyze. :param text: Text to analyze (caller is responsible for sizing).
:returns: List of concept dicts with 'name' and 'type' keys. :returns: List of concept dicts with 'name' and 'type' keys.
""" """
# Truncate very long text to avoid token limits
if len(text) > 3000:
text = text[:3000]
prompt = CONCEPT_EXTRACTION_PROMPT.format(text=text) prompt = CONCEPT_EXTRACTION_PROMPT.format(text=text)
try: try:
@@ -164,7 +176,7 @@ class ConceptExtractor:
"model": self.chat_model.name, "model": self.chat_model.name,
"messages": [{"role": "user", "content": prompt}], "messages": [{"role": "user", "content": prompt}],
"temperature": 0.1, "temperature": 0.1,
"max_tokens": 1000, "max_tokens": 4000,
} }
resp = requests.post( resp = requests.post(
@@ -250,18 +262,3 @@ class ConceptExtractor:
logger.debug("Failed to get/create concept '%s': %s", name, exc) logger.debug("Failed to get/create concept '%s': %s", name, exc)
return None return None
def _select_sample_indices(
self, total: int, max_samples: int = 10
) -> list[int]:
"""
Select evenly-spaced sample indices for concept extraction.
:param total: Total number of chunks.
:param max_samples: Maximum samples to take.
:returns: List of chunk indices to process.
"""
if total <= max_samples:
return list(range(total))
step = total / max_samples
return [int(i * step) for i in range(max_samples)]

View File

@@ -274,9 +274,12 @@ class EmbeddingPipeline:
# --- Stage 7: Concept extraction --- # --- Stage 7: Concept extraction ---
chat_model = LLMModel.get_system_chat_model() chat_model = LLMModel.get_system_chat_model()
if chat_model and chunk_result.chunks: if chat_model and chunk_result.chunks:
document_text = "\n\n".join(
block.text for block in parse_result.text_blocks
)
extractor = ConceptExtractor(chat_model, user=self.user) extractor = ConceptExtractor(chat_model, user=self.user)
concepts_count = extractor.extract_for_item( concepts_count = extractor.extract_for_item(
item, chunk_nodes, chunk_result.chunks item, chunk_nodes, chunk_result.chunks, document_text
) )
result["concepts_extracted"] = concepts_count result["concepts_extracted"] = concepts_count