- Implemented custom form widgets for date, time, and datetime fields with DaisyUI styling.
- Created utility functions for formatting dates, times, and numbers according to user preferences.
- Developed views for profile settings, API key management, and notifications, including health check endpoints.
- Added URL configurations for Themis tests and main application routes.
- Established test cases for custom widgets to ensure proper functionality and integration.
- Defined project metadata and dependencies in pyproject.toml for package management.
268 lines
8.7 KiB
Python
"""
|
|
LLM-based concept extraction for the knowledge graph.
|
|
|
|
Uses the system chat model to extract named entities (people, places,
|
|
topics, techniques, themes) from document chunks, then creates Concept
|
|
nodes and MENTIONS/REFERENCES relationships in Neo4j.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from typing import Optional
|
|
|
|
from library.metrics import CONCEPTS_EXTRACTED_TOTAL
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Prompt for concept extraction
|
|
CONCEPT_EXTRACTION_PROMPT = """Extract named entities and key concepts from the following text.
|
|
|
|
Return a JSON array of objects, each with:
|
|
- "name": the entity/concept name (lowercase, canonical form)
|
|
- "type": one of "person", "place", "topic", "technique", "theme"
|
|
|
|
Only extract significant, specific concepts — not generic words.
|
|
Return at most 20 concepts. Return ONLY the JSON array, no other text.
|
|
|
|
Text:
|
|
{text}"""
|
|
|
|
|
|
class ConceptExtractor:
    """
    Extracts concepts from text using the system chat model.

    Creates or updates Concept nodes in Neo4j and connects them
    to Chunk and Item nodes via MENTIONS and REFERENCES relationships.
    """

    def __init__(self, chat_model, user=None):
        """
        :param chat_model: LLMModel instance for chat/completion.
        :param user: Optional Django user for usage tracking.
        """
        self.chat_model = chat_model
        self.user = user

    def extract_for_item(
        self,
        item,
        chunk_nodes: list,
        chunk_texts: list[str],
    ) -> int:
        """
        Extract concepts from all chunks of an item.

        Samples at most 10 evenly-spaced chunks so large documents do
        not trigger one LLM call per chunk.

        :param item: Item node.
        :param chunk_nodes: List of Chunk nodes (parallel to chunk_texts).
        :param chunk_texts: List of chunk text strings.
        :returns: Total number of unique concepts extracted.
        """
        all_concepts: dict[str, str] = {}  # name -> type
        # Cache Concept nodes by name so each node is fetched/created at
        # most once per item, instead of once per mention plus once more
        # for the REFERENCES edge (saves redundant Neo4j round-trips).
        node_cache: dict[str, object] = {}

        sample_indices = self._select_sample_indices(len(chunk_texts), max_samples=10)

        for idx in sample_indices:
            chunk_text = chunk_texts[idx]
            chunk_node = chunk_nodes[idx]

            for concept_data in self._extract_from_text(chunk_text):
                name = concept_data.get("name", "").strip().lower()
                concept_type = concept_data.get("type", "topic")

                # Drop empty or single-character noise.
                if not name or len(name) < 2:
                    continue

                all_concepts[name] = concept_type

                # Connect chunk -> concept via MENTIONS
                concept_node = node_cache.get(name)
                if concept_node is None:
                    concept_node = self._get_or_create_concept(name, concept_type)
                    if concept_node is not None:
                        node_cache[name] = concept_node
                if concept_node:
                    try:
                        chunk_node.mentions.connect(concept_node)
                    except Exception:
                        pass  # Already connected

        # Connect item -> all concepts via REFERENCES
        for name, concept_type in all_concepts.items():
            concept_node = node_cache.get(name)
            if concept_node is None:
                # Not cached (earlier creation failed); retry once here.
                concept_node = self._get_or_create_concept(name, concept_type)
            if concept_node:
                try:
                    item.concepts.connect(concept_node, {"weight": 1.0})
                except Exception:
                    pass  # Already connected

            # Incremented even when the node could not be written: the
            # metric counts concepts *extracted*, not nodes persisted.
            CONCEPTS_EXTRACTED_TOTAL.labels(concept_type=concept_type).inc()

        logger.info(
            "Extracted %d concepts for item_uid=%s",
            len(all_concepts),
            item.uid,
        )
        return len(all_concepts)

    def _extract_from_text(self, text: str) -> list[dict]:
        """
        Call the chat model to extract concepts from text.

        Never raises: any model/parse failure is logged and yields ``[]``
        so a single bad chunk cannot abort item processing.

        :param text: Text to analyze.
        :returns: List of concept dicts with 'name' and 'type' keys.
        """
        # Truncate very long text to avoid token limits
        if len(text) > 3000:
            text = text[:3000]

        prompt = CONCEPT_EXTRACTION_PROMPT.format(text=text)

        try:
            response_text = self._call_chat_model(prompt)
            concepts = self._parse_concept_response(response_text)
            logger.debug(
                "Extracted %d concepts from text chunk (len=%d)",
                len(concepts),
                len(text),
            )
            return concepts
        except Exception as exc:
            logger.warning("Concept extraction failed: %s", exc)
            return []

    def _call_chat_model(self, prompt: str) -> str:
        """
        Make a chat completion request to the system chat model.

        Supports two wire formats: the AWS Bedrock Converse API and the
        OpenAI-compatible ``/chat/completions`` endpoint.

        :param prompt: User prompt text.
        :returns: Response text from the model.
        :raises requests.HTTPError: On a non-2xx response.
        :raises ValueError: If the response matches neither known format.
        """
        import requests

        api = self.chat_model.api
        base_url = api.base_url.rstrip("/")

        if api.api_type == "bedrock":
            # Bedrock Converse endpoint
            url = f"{base_url}/model/{self.chat_model.name}/converse"
            headers = {
                "Authorization": f"Bearer {api.api_key}",
                "Content-Type": "application/json",
            }
            # NOTE(review): no inferenceConfig (temperature/max tokens) is
            # sent on this path, unlike the OpenAI path — confirm intended.
            body = {
                "messages": [{"role": "user", "content": [{"text": prompt}]}],
            }
        else:
            # OpenAI-compatible
            url = f"{base_url}/chat/completions"
            headers = {"Content-Type": "application/json"}
            if api.api_key:
                headers["Authorization"] = f"Bearer {api.api_key}"
            body = {
                "model": self.chat_model.name,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": 1000,
            }

        resp = requests.post(
            url, json=body, headers=headers, timeout=api.timeout_seconds or 60
        )
        resp.raise_for_status()
        data = resp.json()

        # Parse response based on format
        if "output" in data:
            # Bedrock Converse format
            return data["output"]["message"]["content"][0]["text"]
        if "choices" in data:
            # OpenAI format
            return data["choices"][0]["message"]["content"]

        raise ValueError(f"Unexpected chat response format: {list(data.keys())}")

    def _parse_concept_response(self, response_text: str) -> list[dict]:
        """
        Parse the LLM's concept extraction response into structured data.

        Accepts a bare JSON array, a markdown-fenced array, or an array
        embedded in surrounding prose. Entries that are not dicts or lack
        a "name" key are dropped.

        :param response_text: Raw response text (expected JSON array).
        :returns: List of concept dicts (empty if nothing parseable).
        """
        text = response_text.strip()

        # Strip a markdown code fence (```json ... ```) if present.
        if text.startswith("```"):
            lines = text.split("\n")
            text = "\n".join(lines[1:-1]) if len(lines) > 2 else text

        concepts = self._load_concept_list(text)
        if concepts is not None:
            return concepts

        # Fall back to the first bracketed span inside the response.
        import re

        match = re.search(r"\[.*\]", text, re.DOTALL)
        if match:
            concepts = self._load_concept_list(match.group())
            if concepts is not None:
                return concepts

        logger.debug("Could not parse concept response: %s", text[:200])
        return []

    @staticmethod
    def _load_concept_list(text: str) -> Optional[list[dict]]:
        """Parse ``text`` as a JSON array of concept dicts; None on failure."""
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            return None
        if isinstance(parsed, list):
            return [c for c in parsed if isinstance(c, dict) and "name" in c]
        return None

    def _get_or_create_concept(self, name: str, concept_type: str):
        """
        Get or create a Concept node by name.

        NOTE(review): get-then-create is not atomic — two concurrent
        extractors could create duplicate Concept nodes for the same name
        unless Neo4j enforces a uniqueness constraint; verify the schema.

        :param name: Concept name (lowercase).
        :param concept_type: Concept type (person, place, topic, etc.).
        :returns: Concept node, or None on failure.
        """
        from library.models import Concept

        try:
            # Try to get existing
            existing = Concept.nodes.filter(name=name)
            if existing:
                return existing[0]

            # Create new
            concept = Concept(name=name, concept_type=concept_type)
            concept.save()
            return concept
        except Exception as exc:
            logger.debug("Failed to get/create concept '%s': %s", name, exc)
            return None

    def _select_sample_indices(
        self, total: int, max_samples: int = 10
    ) -> list[int]:
        """
        Select evenly-spaced sample indices for concept extraction.

        :param total: Total number of chunks.
        :param max_samples: Maximum samples to take.
        :returns: List of chunk indices to process (all of them when
            ``total <= max_samples``).
        """
        if total <= max_samples:
            return list(range(total))

        step = total / max_samples
        return [int(i * step) for i in range(max_samples)]