refactor(concepts): document-level extraction with one chat call per item
Concept extraction was making up to 10 LLM calls per item by sampling chunks, which produced redundant work (the same concept reappears in multiple chunks), caused context-loss bugs (chunk boundaries cut mid-thought), and, on a 35B model, dominated per-item wall time (~3 min/item). Concepts are document-level semantic objects; chunks are retrieval units. Extract once per item from the first 100,000 characters (~100KB) of parsed document text, then connect each chunk to the concepts it explicitly mentions via case-insensitive substring match — no extra LLM calls. Drops the sample-indices selector that the old per-chunk loop relied on. Stage 7 is currently dormant in production because the configured chat model is a reasoning-mode Qwen variant that returns empty content on every call (output stuck in reasoning_content). It re-enables cleanly once a non-reasoning instruct model is set as is_system_chat_model. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -22,7 +22,7 @@ Return a JSON array of objects, each with:
|
||||
- "type": one of "person", "place", "topic", "technique", "theme"
|
||||
|
||||
Only extract significant, specific concepts — not generic words.
|
||||
Return at most 20 concepts. Return ONLY the JSON array, no other text.
|
||||
Return at most 60 concepts. Return ONLY the JSON array, no other text.
|
||||
|
||||
Text:
|
||||
{text}"""
|
||||
@@ -49,60 +49,76 @@ class ConceptExtractor:
|
||||
item,
|
||||
chunk_nodes: list,
|
||||
chunk_texts: list[str],
|
||||
document_text: str,
|
||||
) -> int:
|
||||
"""
|
||||
Extract concepts from all chunks of an item.
|
||||
Extract concepts from a document with one LLM call.
|
||||
|
||||
Concepts are document-level semantic objects; chunks are
|
||||
retrieval units. Extracting per-chunk produced redundant calls
|
||||
for the same entity, mid-thought context loss at chunk boundaries,
|
||||
and 10x the cost. Single document-level call + post-hoc substring
|
||||
matching for chunk linkage instead.
|
||||
|
||||
:param item: Item node.
|
||||
:param chunk_nodes: List of Chunk nodes.
|
||||
:param chunk_texts: List of chunk text strings.
|
||||
:param chunk_nodes: List of Chunk nodes (for MENTIONS linkage).
|
||||
:param chunk_texts: List of chunk text strings (for substring match).
|
||||
:param document_text: Full parsed document text.
|
||||
:returns: Total number of unique concepts extracted.
|
||||
"""
|
||||
all_concepts: dict[str, str] = {} # name -> type
|
||||
# 100KB covers ~25k tokens — fits comfortably in Qwen's 192k window
|
||||
# and front-loads on the assumption that any concept that matters later
|
||||
# in the document has already appeared in its first ~30 pages.
|
||||
DOC_EXTRACT_CHARS = 100_000
|
||||
|
||||
# Sample chunks for extraction (don't process every chunk for large docs)
|
||||
sample_indices = self._select_sample_indices(len(chunk_texts), max_samples=10)
|
||||
text_for_extraction = document_text[:DOC_EXTRACT_CHARS]
|
||||
if not text_for_extraction.strip():
|
||||
return 0
|
||||
|
||||
for idx in sample_indices:
|
||||
chunk_text = chunk_texts[idx]
|
||||
chunk_node = chunk_nodes[idx]
|
||||
|
||||
concepts = self._extract_from_text(chunk_text)
|
||||
if not concepts:
|
||||
raw_concepts = self._extract_from_text(text_for_extraction)
|
||||
all_concepts: dict[str, str] = {}
|
||||
for concept_data in raw_concepts:
|
||||
name = concept_data.get("name", "").strip().lower()
|
||||
if not name or len(name) < 2:
|
||||
continue
|
||||
all_concepts[name] = concept_data.get("type", "topic")
|
||||
|
||||
for concept_data in concepts:
|
||||
name = concept_data.get("name", "").strip().lower()
|
||||
concept_type = concept_data.get("type", "topic")
|
||||
if not all_concepts:
|
||||
logger.info("No concepts extracted for item_uid=%s", item.uid)
|
||||
return 0
|
||||
|
||||
if not name or len(name) < 2:
|
||||
continue
|
||||
# Resolve every concept once, then both link chunks (MENTIONS) and
|
||||
# link the item (REFERENCES). Chunk linkage is substring-based —
|
||||
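# zero extra LLM calls; catches every verbatim, case-insensitive occurrence of a concept name (plain substring test, so short names can also match inside longer words).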
|
||||
concept_nodes: dict[str, object] = {}
|
||||
for name, concept_type in all_concepts.items():
|
||||
node = self._get_or_create_concept(name, concept_type)
|
||||
if node:
|
||||
concept_nodes[name] = node
|
||||
|
||||
all_concepts[name] = concept_type
|
||||
|
||||
# Connect chunk -> concept via MENTIONS
|
||||
concept_node = self._get_or_create_concept(name, concept_type)
|
||||
if concept_node:
|
||||
for chunk_node, chunk_text in zip(chunk_nodes, chunk_texts):
|
||||
chunk_lower = chunk_text.lower()
|
||||
for name, concept_node in concept_nodes.items():
|
||||
if name in chunk_lower:
|
||||
try:
|
||||
chunk_node.mentions.connect(concept_node)
|
||||
except Exception:
|
||||
pass # Already connected
|
||||
|
||||
# Connect item -> all concepts via REFERENCES
|
||||
for name, concept_type in all_concepts.items():
|
||||
concept_node = self._get_or_create_concept(name, concept_type)
|
||||
if concept_node:
|
||||
try:
|
||||
item.concepts.connect(concept_node, {"weight": 1.0})
|
||||
except Exception:
|
||||
pass # Already connected
|
||||
|
||||
CONCEPTS_EXTRACTED_TOTAL.labels(concept_type=concept_type).inc()
|
||||
for name, concept_node in concept_nodes.items():
|
||||
try:
|
||||
item.concepts.connect(concept_node, {"weight": 1.0})
|
||||
except Exception:
|
||||
pass # Already connected
|
||||
CONCEPTS_EXTRACTED_TOTAL.labels(
|
||||
concept_type=all_concepts[name]
|
||||
).inc()
|
||||
|
||||
logger.info(
|
||||
"Extracted %d concepts for item_uid=%s",
|
||||
"Extracted %d concepts for item_uid=%s from %d chars",
|
||||
len(all_concepts),
|
||||
item.uid,
|
||||
len(text_for_extraction),
|
||||
)
|
||||
return len(all_concepts)
|
||||
|
||||
@@ -110,13 +126,9 @@ class ConceptExtractor:
|
||||
"""
|
||||
Call the chat model to extract concepts from text.
|
||||
|
||||
:param text: Text to analyze.
|
||||
:param text: Text to analyze (caller is responsible for sizing).
|
||||
:returns: List of concept dicts with 'name' and 'type' keys.
|
||||
"""
|
||||
# Truncate very long text to avoid token limits
|
||||
if len(text) > 3000:
|
||||
text = text[:3000]
|
||||
|
||||
prompt = CONCEPT_EXTRACTION_PROMPT.format(text=text)
|
||||
|
||||
try:
|
||||
@@ -164,7 +176,7 @@ class ConceptExtractor:
|
||||
"model": self.chat_model.name,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": 1000,
|
||||
"max_tokens": 4000,
|
||||
}
|
||||
|
||||
resp = requests.post(
|
||||
@@ -250,18 +262,3 @@ class ConceptExtractor:
|
||||
logger.debug("Failed to get/create concept '%s': %s", name, exc)
|
||||
return None
|
||||
|
||||
def _select_sample_indices(
|
||||
self, total: int, max_samples: int = 10
|
||||
) -> list[int]:
|
||||
"""
|
||||
Select evenly-spaced sample indices for concept extraction.
|
||||
|
||||
:param total: Total number of chunks.
|
||||
:param max_samples: Maximum samples to take.
|
||||
:returns: List of chunk indices to process.
|
||||
"""
|
||||
if total <= max_samples:
|
||||
return list(range(total))
|
||||
|
||||
step = total / max_samples
|
||||
return [int(i * step) for i in range(max_samples)]
|
||||
|
||||
@@ -274,9 +274,12 @@ class EmbeddingPipeline:
|
||||
# --- Stage 7: Concept extraction ---
|
||||
chat_model = LLMModel.get_system_chat_model()
|
||||
if chat_model and chunk_result.chunks:
|
||||
document_text = "\n\n".join(
|
||||
block.text for block in parse_result.text_blocks
|
||||
)
|
||||
extractor = ConceptExtractor(chat_model, user=self.user)
|
||||
concepts_count = extractor.extract_for_item(
|
||||
item, chunk_nodes, chunk_result.chunks
|
||||
item, chunk_nodes, chunk_result.chunks, document_text
|
||||
)
|
||||
result["concepts_extracted"] = concepts_count
|
||||
|
||||
|
||||
Reference in New Issue
Block a user