Files
mnemosyne/mnemosyne/library/services/concepts.py
Robert Helewka 99bdb4ac92 Add Themis application with custom widgets, views, and utilities
- Implemented custom form widgets for date, time, and datetime fields with DaisyUI styling.
- Created utility functions for formatting dates, times, and numbers according to user preferences.
- Developed views for profile settings, API key management, and notifications, including health check endpoints.
- Added URL configurations for Themis tests and main application routes.
- Established test cases for custom widgets to ensure proper functionality and integration.
- Defined project metadata and dependencies in pyproject.toml for package management.
2026-03-21 02:00:18 +00:00

268 lines
8.7 KiB
Python

"""
LLM-based concept extraction for the knowledge graph.
Uses the system chat model to extract named entities (people, places,
topics, techniques, themes) from document chunks, then creates Concept
nodes and MENTIONS/REFERENCES relationships in Neo4j.
"""
import json
import logging
from typing import Optional
from library.metrics import CONCEPTS_EXTRACTED_TOTAL
logger = logging.getLogger(__name__)
# Prompt template for LLM-based concept extraction. Filled in via
# str.format (see _extract_from_text); instructs the model to reply with a
# bare JSON array so _parse_concept_response can decode it directly.
CONCEPT_EXTRACTION_PROMPT = """Extract named entities and key concepts from the following text.
Return a JSON array of objects, each with:
- "name": the entity/concept name (lowercase, canonical form)
- "type": one of "person", "place", "topic", "technique", "theme"
Only extract significant, specific concepts — not generic words.
Return at most 20 concepts. Return ONLY the JSON array, no other text.
Text:
{text}"""
class ConceptExtractor:
"""
Extracts concepts from text using the system chat model.

Creates or updates Concept nodes in Neo4j and connects them
to Chunk and Item nodes via MENTIONS and REFERENCES relationships.
The heavy lifting happens in :meth:`extract_for_item`; the other
methods are internal helpers for prompting, parsing, and node lookup.
"""
def __init__(self, chat_model, user=None):
"""
:param chat_model: LLMModel instance for chat/completion.
    Must expose ``.name`` and ``.api`` (with ``base_url``,
    ``api_type``, ``api_key``, ``timeout_seconds``) — see
    _call_chat_model for how these are used.
:param user: Optional Django user for usage tracking.
    NOTE(review): not referenced anywhere in this class yet —
    presumably consumed by callers or future tracking code.
"""
self.chat_model = chat_model
self.user = user
def extract_for_item(
    self,
    item,
    chunk_nodes: list,
    chunk_texts: list[str],
) -> int:
    """
    Extract concepts from all chunks of an item.

    Samples up to 10 evenly-spaced chunks, asks the chat model for
    concepts in each, then links chunk -> Concept (MENTIONS) and
    item -> Concept (REFERENCES) in Neo4j.

    :param item: Item node (must expose ``concepts`` and ``uid``).
    :param chunk_nodes: List of Chunk nodes, parallel to ``chunk_texts``.
    :param chunk_texts: List of chunk text strings.
    :returns: Total number of unique concepts extracted.
    """
    # Types advertised in CONCEPT_EXTRACTION_PROMPT. Anything else the
    # model invents is coerced to "topic" so the Prometheus label set
    # (and Concept node types) stay bounded.
    valid_types = {"person", "place", "topic", "technique", "theme"}
    all_concepts: dict[str, str] = {}  # name -> type
    node_cache: dict[str, object] = {}  # name -> Concept node (or None)

    def _concept_node(name: str, concept_type: str):
        # Cache lookups so each concept hits Neo4j at most once per item
        # (previously every concept was fetched twice: chunk + item pass).
        if name not in node_cache:
            node_cache[name] = self._get_or_create_concept(name, concept_type)
        return node_cache[name]

    # Sample chunks for extraction (don't process every chunk for large docs)
    sample_indices = self._select_sample_indices(len(chunk_texts), max_samples=10)
    for idx in sample_indices:
        chunk_text = chunk_texts[idx]
        chunk_node = chunk_nodes[idx]
        for concept_data in self._extract_from_text(chunk_text):
            raw_name = concept_data.get("name", "")
            # The LLM may return non-string values; skip them instead of
            # crashing the whole extraction on .strip().
            if not isinstance(raw_name, str):
                continue
            name = raw_name.strip().lower()
            if not name or len(name) < 2:
                continue
            concept_type = concept_data.get("type", "topic")
            if concept_type not in valid_types:
                concept_type = "topic"
            all_concepts[name] = concept_type
            # Connect chunk -> concept via MENTIONS
            concept_node = _concept_node(name, concept_type)
            if concept_node:
                try:
                    chunk_node.mentions.connect(concept_node)
                except Exception:
                    pass  # Already connected
    # Connect item -> all concepts via REFERENCES
    for name, concept_type in all_concepts.items():
        concept_node = _concept_node(name, concept_type)
        if concept_node:
            try:
                item.concepts.connect(concept_node, {"weight": 1.0})
            except Exception:
                pass  # Already connected
        # Count every extracted concept, even if the Neo4j write failed —
        # the metric tracks extraction volume, not persistence.
        CONCEPTS_EXTRACTED_TOTAL.labels(concept_type=concept_type).inc()
    logger.info(
        "Extracted %d concepts for item_uid=%s",
        len(all_concepts),
        item.uid,
    )
    return len(all_concepts)
def _extract_from_text(self, text: str) -> list[dict]:
    """
    Call the chat model to extract concepts from text.

    The text is capped at 3000 characters before being inserted into the
    extraction prompt; any failure (request or parsing) is logged and
    swallowed so one bad chunk never aborts processing of the item.

    :param text: Text to analyze.
    :returns: List of concept dicts with 'name' and 'type' keys
        (empty list on failure).
    """
    snippet = text[:3000]  # truncate very long text to avoid token limits
    prompt = CONCEPT_EXTRACTION_PROMPT.format(text=snippet)
    try:
        reply = self._call_chat_model(prompt)
        extracted = self._parse_concept_response(reply)
        logger.debug(
            "Extracted %d concepts from text chunk (len=%d)",
            len(extracted),
            len(snippet),
        )
        return extracted
    except Exception as exc:
        logger.warning("Concept extraction failed: %s", exc)
        return []
def _call_chat_model(self, prompt: str) -> str:
    """
    Make a chat completion request to the system chat model.

    Builds either a Bedrock Converse request or an OpenAI-compatible
    chat/completions request depending on ``api.api_type``, posts it,
    then pulls the assistant text out of whichever response shape
    came back.

    :param prompt: User prompt text.
    :returns: Response text from the model.
    :raises ValueError: If the response matches neither known format.
    """
    import requests

    api = self.chat_model.api
    root = api.base_url.rstrip("/")
    if api.api_type == "bedrock":
        # Bedrock Converse endpoint
        request = dict(
            url=f"{root}/model/{self.chat_model.name}/converse",
            headers={
                "Authorization": f"Bearer {api.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "messages": [{"role": "user", "content": [{"text": prompt}]}],
            },
        )
    else:
        # OpenAI-compatible chat/completions endpoint; auth header only
        # when a key is configured.
        auth = {"Authorization": f"Bearer {api.api_key}"} if api.api_key else {}
        request = dict(
            url=f"{root}/chat/completions",
            headers={"Content-Type": "application/json", **auth},
            json={
                "model": self.chat_model.name,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": 1000,
            },
        )
    resp = requests.post(timeout=api.timeout_seconds or 60, **request)
    resp.raise_for_status()
    data = resp.json()
    # Parse response based on format
    if "output" in data:
        # Bedrock Converse format
        return data["output"]["message"]["content"][0]["text"]
    if "choices" in data:
        # OpenAI format
        return data["choices"][0]["message"]["content"]
    raise ValueError(f"Unexpected chat response format: {list(data.keys())}")
def _parse_concept_response(self, response_text: str) -> list[dict]:
"""
Parse the LLM's concept extraction response into structured data.
:param response_text: Raw response text (expected JSON array).
:returns: List of concept dicts.
"""
# Try to extract JSON from the response
text = response_text.strip()
# Handle markdown code blocks
if text.startswith("```"):
lines = text.split("\n")
text = "\n".join(lines[1:-1]) if len(lines) > 2 else text
try:
concepts = json.loads(text)
if isinstance(concepts, list):
return [
c for c in concepts
if isinstance(c, dict) and "name" in c
]
except json.JSONDecodeError:
pass
# Try to find JSON array in the response
import re
match = re.search(r"\[.*\]", text, re.DOTALL)
if match:
try:
concepts = json.loads(match.group())
if isinstance(concepts, list):
return [
c for c in concepts
if isinstance(c, dict) and "name" in c
]
except json.JSONDecodeError:
pass
logger.debug("Could not parse concept response: %s", text[:200])
return []
def _get_or_create_concept(self, name: str, concept_type: str):
"""
Get or create a Concept node by name.

Best-effort: any Neo4j/neomodel failure is logged at debug level and
swallowed, returning None so callers can simply skip the relationship.

NOTE(review): this is a check-then-create sequence, so two concurrent
extractions could both create the same concept — presumably acceptable
or prevented by a uniqueness constraint on Concept.name; verify.
TODO confirm whether neomodel's get_or_create / get_or_none would be
a cleaner fit here.

:param name: Concept name (lowercase).
:param concept_type: Concept type (person, place, topic, etc.).
:returns: Concept node, or None on failure.
"""
# Imported lazily to avoid a module-level Django model import cycle —
# presumably; confirm against the package's import graph.
from library.models import Concept
try:
# Try to get existing
existing = Concept.nodes.filter(name=name)
if existing:
return existing[0]
# Create new
concept = Concept(name=name, concept_type=concept_type)
concept.save()
return concept
except Exception as exc:
logger.debug("Failed to get/create concept '%s': %s", name, exc)
return None
def _select_sample_indices(
self, total: int, max_samples: int = 10
) -> list[int]:
"""
Select evenly-spaced sample indices for concept extraction.
:param total: Total number of chunks.
:param max_samples: Maximum samples to take.
:returns: List of chunk indices to process.
"""
if total <= max_samples:
return list(range(total))
step = total / max_samples
return [int(i * step) for i in range(max_samples)]