- Implemented custom form widgets for date, time, and datetime fields with DaisyUI styling.
- Created utility functions for formatting dates, times, and numbers according to user preferences.
- Developed views for profile settings, API key management, and notifications, including health check endpoints.
- Added URL configurations for Themis tests and main application routes.
- Established test cases for custom widgets to ensure proper functionality and integration.
- Defined project metadata and dependencies in pyproject.toml for package management.
268 lines
8.7 KiB
Python
"""
|
|
LLM-based concept extraction for the knowledge graph.
|
|
|
|
Uses the system chat model to extract named entities (people, places,
|
|
topics, techniques, themes) from document chunks, then creates Concept
|
|
nodes and MENTIONS/REFERENCES relationships in Neo4j.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from typing import Optional
|
|
|
|
from library.metrics import CONCEPTS_EXTRACTED_TOTAL
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Prompt for concept extraction
|
|
CONCEPT_EXTRACTION_PROMPT = """Extract named entities and key concepts from the following text.
|
|
|
|
Return a JSON array of objects, each with:
|
|
- "name": the entity/concept name (lowercase, canonical form)
|
|
- "type": one of "person", "place", "topic", "technique", "theme"
|
|
|
|
Only extract significant, specific concepts — not generic words.
|
|
Return at most 20 concepts. Return ONLY the JSON array, no other text.
|
|
|
|
Text:
|
|
{text}"""
|
|
|
|
|
|
class ConceptExtractor:
    """
    Extracts concepts from text using the system chat model.

    Creates or updates Concept nodes in Neo4j and connects them
    to Chunk and Item nodes via MENTIONS and REFERENCES relationships.
    """

    def __init__(self, chat_model, user=None):
        """
        :param chat_model: LLMModel instance for chat/completion.
        :param user: Optional Django user for usage tracking.
        """
        self.chat_model = chat_model
        self.user = user

    def extract_for_item(
        self,
        item,
        chunk_nodes: list,
        chunk_texts: list[str],
    ) -> int:
        """
        Extract concepts from all chunks of an item.

        Samples at most 10 evenly-spaced chunks so large documents do
        not trigger one LLM call per chunk.

        :param item: Item node.
        :param chunk_nodes: List of Chunk nodes (parallel to chunk_texts).
        :param chunk_texts: List of chunk text strings.
        :returns: Total number of unique concepts extracted.
        """
        all_concepts: dict[str, str] = {}  # name -> type
        # Cache Concept nodes by name so each node is fetched/created at
        # most once per item, instead of once per mention plus once more
        # for the REFERENCES edge (saves redundant Neo4j round-trips).
        node_cache: dict[str, object] = {}

        sample_indices = self._select_sample_indices(len(chunk_texts), max_samples=10)

        for idx in sample_indices:
            chunk_text = chunk_texts[idx]
            chunk_node = chunk_nodes[idx]

            for concept_data in self._extract_from_text(chunk_text):
                name = concept_data.get("name", "").strip().lower()
                concept_type = concept_data.get("type", "topic")

                # Drop empty or single-character noise.
                if not name or len(name) < 2:
                    continue

                all_concepts[name] = concept_type

                # Connect chunk -> concept via MENTIONS
                concept_node = node_cache.get(name)
                if concept_node is None:
                    concept_node = self._get_or_create_concept(name, concept_type)
                    if concept_node is not None:
                        node_cache[name] = concept_node
                if concept_node:
                    try:
                        chunk_node.mentions.connect(concept_node)
                    except Exception:
                        pass  # Already connected

        # Connect item -> all concepts via REFERENCES
        for name, concept_type in all_concepts.items():
            concept_node = node_cache.get(name)
            if concept_node is None:
                # Not cached (earlier creation failed); retry once here.
                concept_node = self._get_or_create_concept(name, concept_type)
            if concept_node:
                try:
                    item.concepts.connect(concept_node, {"weight": 1.0})
                except Exception:
                    pass  # Already connected

            # Incremented even when the node could not be written: the
            # metric counts concepts *extracted*, not nodes persisted.
            CONCEPTS_EXTRACTED_TOTAL.labels(concept_type=concept_type).inc()

        logger.info(
            "Extracted %d concepts for item_uid=%s",
            len(all_concepts),
            item.uid,
        )
        return len(all_concepts)

    def _extract_from_text(self, text: str) -> list[dict]:
        """
        Call the chat model to extract concepts from text.

        Never raises: any model/parse failure is logged and yields ``[]``
        so a single bad chunk cannot abort item processing.

        :param text: Text to analyze.
        :returns: List of concept dicts with 'name' and 'type' keys.
        """
        # Truncate very long text to avoid token limits
        if len(text) > 3000:
            text = text[:3000]

        prompt = CONCEPT_EXTRACTION_PROMPT.format(text=text)

        try:
            response_text = self._call_chat_model(prompt)
            concepts = self._parse_concept_response(response_text)
            logger.debug(
                "Extracted %d concepts from text chunk (len=%d)",
                len(concepts),
                len(text),
            )
            return concepts
        except Exception as exc:
            logger.warning("Concept extraction failed: %s", exc)
            return []

    def _call_chat_model(self, prompt: str) -> str:
        """
        Make a chat completion request to the system chat model.

        Supports two wire formats: the AWS Bedrock Converse API and the
        OpenAI-compatible ``/chat/completions`` endpoint.

        :param prompt: User prompt text.
        :returns: Response text from the model.
        :raises requests.HTTPError: On a non-2xx response.
        :raises ValueError: If the response matches neither known format.
        """
        import requests

        api = self.chat_model.api
        base_url = api.base_url.rstrip("/")

        if api.api_type == "bedrock":
            # Bedrock Converse endpoint
            url = f"{base_url}/model/{self.chat_model.name}/converse"
            headers = {
                "Authorization": f"Bearer {api.api_key}",
                "Content-Type": "application/json",
            }
            # NOTE(review): no inferenceConfig (temperature/max tokens) is
            # sent on this path, unlike the OpenAI path — confirm intended.
            body = {
                "messages": [{"role": "user", "content": [{"text": prompt}]}],
            }
        else:
            # OpenAI-compatible
            url = f"{base_url}/chat/completions"
            headers = {"Content-Type": "application/json"}
            if api.api_key:
                headers["Authorization"] = f"Bearer {api.api_key}"
            body = {
                "model": self.chat_model.name,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.1,
                "max_tokens": 1000,
            }

        resp = requests.post(
            url, json=body, headers=headers, timeout=api.timeout_seconds or 60
        )
        resp.raise_for_status()
        data = resp.json()

        # Parse response based on format
        if "output" in data:
            # Bedrock Converse format
            return data["output"]["message"]["content"][0]["text"]
        if "choices" in data:
            # OpenAI format
            return data["choices"][0]["message"]["content"]

        raise ValueError(f"Unexpected chat response format: {list(data.keys())}")

    def _parse_concept_response(self, response_text: str) -> list[dict]:
        """
        Parse the LLM's concept extraction response into structured data.

        Accepts a bare JSON array, a markdown-fenced array, or an array
        embedded in surrounding prose. Entries that are not dicts or lack
        a "name" key are dropped.

        :param response_text: Raw response text (expected JSON array).
        :returns: List of concept dicts (empty if nothing parseable).
        """
        text = response_text.strip()

        # Strip a markdown code fence (```json ... ```) if present.
        if text.startswith("```"):
            lines = text.split("\n")
            text = "\n".join(lines[1:-1]) if len(lines) > 2 else text

        concepts = self._load_concept_list(text)
        if concepts is not None:
            return concepts

        # Fall back to the first bracketed span inside the response.
        import re

        match = re.search(r"\[.*\]", text, re.DOTALL)
        if match:
            concepts = self._load_concept_list(match.group())
            if concepts is not None:
                return concepts

        logger.debug("Could not parse concept response: %s", text[:200])
        return []

    @staticmethod
    def _load_concept_list(text: str) -> Optional[list[dict]]:
        """Parse ``text`` as a JSON array of concept dicts; None on failure."""
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            return None
        if isinstance(parsed, list):
            return [c for c in parsed if isinstance(c, dict) and "name" in c]
        return None

    def _get_or_create_concept(self, name: str, concept_type: str):
        """
        Get or create a Concept node by name.

        NOTE(review): get-then-create is not atomic — two concurrent
        extractors could create duplicate Concept nodes for the same name
        unless Neo4j enforces a uniqueness constraint; verify the schema.

        :param name: Concept name (lowercase).
        :param concept_type: Concept type (person, place, topic, etc.).
        :returns: Concept node, or None on failure.
        """
        from library.models import Concept

        try:
            # Try to get existing
            existing = Concept.nodes.filter(name=name)
            if existing:
                return existing[0]

            # Create new
            concept = Concept(name=name, concept_type=concept_type)
            concept.save()
            return concept
        except Exception as exc:
            logger.debug("Failed to get/create concept '%s': %s", name, exc)
            return None

    def _select_sample_indices(
        self, total: int, max_samples: int = 10
    ) -> list[int]:
        """
        Select evenly-spaced sample indices for concept extraction.

        :param total: Total number of chunks.
        :param max_samples: Maximum samples to take.
        :returns: List of chunk indices to process (all of them when
            ``total <= max_samples``).
        """
        if total <= max_samples:
            return list(range(total))

        step = total / max_samples
        return [int(i * step) for i in range(max_samples)]