diff --git a/.gitignore b/.gitignore index 36b13f1..9883c21 100644 --- a/.gitignore +++ b/.gitignore @@ -174,3 +174,7 @@ cython_debug/ # PyPI configuration file .pypirc +# Mnemosyne-specific +.env.local +/staticfiles/ +/media/ diff --git a/LICENSE b/LICENSE index 34e595d..8a6f8da 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2026 r +Copyright (c) 2026 Helu.ca Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including diff --git a/README.md b/README.md index c1e0a6d..b9361a4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,96 @@ -# mnemosyne +# Mnemosyne + +*"The electric light did not come from the continuous improvement of candles."* — Oren Harari + +**The memory of everything you know.** + +Mnemosyne is a content-type-aware, multimodal personal knowledge management system built on Neo4j knowledge graphs and Qwen3-VL multimodal AI models. Named after the Titan goddess of memory and mother of the nine Muses, Mnemosyne doesn't just store your knowledge — it understands what kind of knowledge it is, connects it through relationships, and makes it all searchable through text, images, and natural language. + +## What Makes This Different + +Every existing knowledge base tool treats all documents identically: text in, chunks out, vectors stored. A novel and a PostgreSQL manual get the same treatment. + +Mnemosyne knows the difference: + +- **A textbook** has chapters, an index, technical terminology, and pedagogical structure. It's chunked accordingly, and when an LLM retrieves results, it knows this is instructional content. +- **A novel** has narrative flow, characters, plot arcs, dialogue. The LLM knows to interpret results as creative fiction. +- **Album artwork** is a visual asset tied to an artist, genre, and era. 
It's embedded multimodally — searchable by both image similarity and text description. +- **A journal entry** is personal, temporal, reflective. The LLM treats it differently than a reference manual. + +This **content-type awareness** flows through every layer: chunking strategy, embedding instructions, re-ranking, and the final LLM prompt. + +## Core Architecture + +| Component | Technology | Purpose | +|-----------|-----------|---------| +| **Knowledge Graph** | Neo4j 5.x | Relationships + vector storage (no dimension limits) | +| **Multimodal Embeddings** | Qwen3-VL-Embedding-8B | Text + image + video in unified vector space (4096d) | +| **Multimodal Re-ranking** | Qwen3-VL-Reranker-8B | Cross-attention precision scoring | +| **Text Fallback** | Qwen3-Reranker (llama.cpp) | Text-only re-ranking via GGUF | +| **Web Framework** | Django 5.x + DRF | Auth, admin, API, content management | +| **Object Storage** | S3/MinIO | Original content + chunk text storage | +| **Async Processing** | Celery + RabbitMQ | Document embedding, graph construction | +| **LLM Interface** | MCP Server | Primary interface for Claude, Copilot, etc. | +| **GPU Serving** | vLLM + llama.cpp | Local model inference | + +## Library Types + +| Library | Example Content | Multimodal? 
| Graph Relationships | +|---------|----------------|-------------|-------------------| +| **Fiction** | Novels, short stories | Cover art | Author → Book → Character → Theme | +| **Technical** | Textbooks, manuals, docs | Diagrams, screenshots | Product → Manual → Section → Procedure | +| **Music** | Lyrics, liner notes | Album artwork | Artist → Album → Track → Genre | +| **Film** | Scripts, synopses | Stills, posters | Director → Film → Scene → Actor | +| **Art** | Descriptions, catalogs | The artwork itself | Artist → Piece → Style → Movement | +| **Journals** | Personal entries | Photos | Date → Entry → Topic → Person/Place | + +## Search Pipeline + +``` +Query → Vector Search (Neo4j) + Graph Traversal (Cypher) + Full-Text Search + → Candidate Fusion → Qwen3-VL Re-ranking → Content-Type Context Injection + → LLM Response with Citations +``` + +## Heritage + +Mnemosyne's RAG pipeline architecture is inspired by [Spelunker](https://git.helu.ca/r/spelunker), an enterprise RFP response platform. The proven patterns — hybrid search, two-stage RAG (responder + reviewer), citation-based retrieval, and async document processing — are carried forward and enhanced with multimodal capabilities and knowledge graph relationships. + +## Running Celery Workers + +Mnemosyne uses Celery with RabbitMQ for async document embedding. 
From the `mnemosyne/` directory: + +```bash +# Development — single worker, all queues +celery -A mnemosyne worker -l info -Q celery,embedding,batch + +# Or skip workers entirely with eager mode (.env): +CELERY_TASK_ALWAYS_EAGER=True +``` + +**Production — separate workers:** +```bash +celery -A mnemosyne worker -l info -Q embedding -c 1 -n embedding@%h # GPU-bound embedding +celery -A mnemosyne worker -l info -Q batch -c 2 -n batch@%h # Batch orchestration +celery -A mnemosyne worker -l info -Q celery -c 2 -n default@%h # LLM API validation +``` + +**Scheduler & Monitoring:** +```bash +celery -A mnemosyne beat -l info # Periodic task scheduler +celery -A mnemosyne flower --port=5555 # Web monitoring UI +``` + +See [Phase 2: Celery Workers & Scheduler](docs/PHASE_2_EMBEDDING_PIPELINE.md#celery-workers--scheduler) for full details on queues, reliability settings, and task progress tracking. + +## Documentation + +- **[Architecture Documentation](docs/mnemosyne.html)** — Full system architecture with diagrams +- **[Phase 1: Foundation](docs/PHASE_1_FOUNDATION.md)** — Project skeleton, Neo4j data model, content-type system +- **[Phase 2: Embedding Pipeline](docs/PHASE_2_EMBEDDING_PIPELINE.md)** — Qwen3-VL multimodal embedding +- **[Phase 3: Search & Re-ranking](docs/PHASE_3_SEARCH_AND_RERANKING.md)** — Hybrid search + re-ranker +- **[Phase 4: RAG Pipeline](docs/PHASE_4_RAG_PIPELINE.md)** — Content-type-aware generation +- **[Phase 5: MCP Server](docs/PHASE_5_MCP_SERVER.md)** — LLM integration interface +- **[Phase 6: Backport to Spelunker](docs/PHASE_6_BACKPORT_TO_SPELUNKER.md)** — Proven patterns flowing back + diff --git a/docs/PHASE_1_FOUNDATION.md b/docs/PHASE_1_FOUNDATION.md new file mode 100644 index 0000000..f059561 --- /dev/null +++ b/docs/PHASE_1_FOUNDATION.md @@ -0,0 +1,254 @@ +# Phase 1: Foundation + +## Objective + +Establish the project skeleton, Neo4j data model, Django integration, and content-type system. 
At the end of this phase, you can create libraries, collections, and items via Django admin, and the Neo4j graph is populated with the correct node/relationship structure. + +## Deliverables + +### 1. Django Project Skeleton + +- Rename configuration module from `mnemosyne/mnemosyne/` to `mnemosyne/config/` per Red Panda Standards +- Create `pyproject.toml` at repo root with floor-pinned dependencies +- Create `.env` / `.env.example` for environment variables (never commit `.env`) +- Use a single `settings.py`, configured from `.env` via `django-environ` +- Configure dual-database: PostgreSQL (Django auth/config) + Neo4j (content graph) +- Install and configure `django-neomodel` for Neo4j OGM integration +- Configure `djangorestframework` for API +- Configure Celery + RabbitMQ (Async Task pattern) +- Configure S3 storage backend via Incus buckets (MinIO-backed, Terraform-provisioned) +- Configure structured logging for Loki integration via Alloy + +### 2. Django Apps + +| App | Purpose | Database | +|-----|---------|----------| +| `themis` (installed) | User profiles, preferences, API key management, navigation, notifications | PostgreSQL | +| `library/` | Libraries, Collections, Items, Chunks, Concepts | Neo4j (neomodel) | +| `llm_manager/` | LLM API/model config, usage tracking | PostgreSQL (ported from Spelunker) | + +> **Note:** Themis replaces `core/`. User profiles, timezone preferences, theme management, API key storage (encrypted, Fernet), and standard navigation are all provided by Themis. No separate `core/` app is needed. If SSO (Casdoor) or Organization models are required in future, they will be added as separate apps following the SSO and Organization patterns. + +### 3. 
Neo4j Graph Model (neomodel) + +```python +# library/models.py + +class Library(StructuredNode): + uid = UniqueIdProperty() + name = StringProperty(unique_index=True, required=True) + library_type = StringProperty(required=True) # fiction, technical, music, film, art, journal + description = StringProperty(default='') + + # Content-type configuration (stored as JSON strings) + chunking_config = JSONProperty(default={}) + embedding_instruction = StringProperty(default='') + reranker_instruction = StringProperty(default='') + llm_context_prompt = StringProperty(default='') + + created_at = DateTimeProperty(default_now=True) + collections = RelationshipTo('Collection', 'CONTAINS') + + +class Collection(StructuredNode): + uid = UniqueIdProperty() + name = StringProperty(required=True) + description = StringProperty(default='') + metadata = JSONProperty(default={}) + + created_at = DateTimeProperty(default_now=True) + items = RelationshipTo('Item', 'CONTAINS') + library = RelationshipTo('Library', 'BELONGS_TO') + + +class Item(StructuredNode): + uid = UniqueIdProperty() + title = StringProperty(required=True) + item_type = StringProperty(default='') + s3_key = StringProperty(default='') + content_hash = StringProperty(index=True) + file_type = StringProperty(default='') + file_size = IntegerProperty(default=0) + metadata = JSONProperty(default={}) + + created_at = DateTimeProperty(default_now=True) + updated_at = DateTimeProperty(default_now=True) + + chunks = RelationshipTo('Chunk', 'HAS_CHUNK') + images = RelationshipTo('Image', 'HAS_IMAGE') + concepts = RelationshipTo('Concept', 'REFERENCES', model=ReferencesRel) + related_items = RelationshipTo('Item', 'RELATED_TO', model=RelatedToRel) + + +class Chunk(StructuredNode): + uid = UniqueIdProperty() + chunk_index = IntegerProperty(required=True) + chunk_s3_key = StringProperty(required=True) + chunk_size = IntegerProperty(default=0) + text_preview = StringProperty(default='') # First 500 chars for full-text index + 
embedding = ArrayProperty(FloatProperty()) # 4096d vector + + created_at = DateTimeProperty(default_now=True) + mentions = RelationshipTo('Concept', 'MENTIONS') + + +class Concept(StructuredNode): + uid = UniqueIdProperty() + name = StringProperty(unique_index=True, required=True) + concept_type = StringProperty(default='') # person, place, topic, technique, theme + embedding = ArrayProperty(FloatProperty()) # 4096d vector + + related_concepts = RelationshipTo('Concept', 'RELATED_TO') + + +class Image(StructuredNode): + uid = UniqueIdProperty() + s3_key = StringProperty(required=True) + image_type = StringProperty(default='') # cover, diagram, artwork, still, photo + description = StringProperty(default='') + metadata = JSONProperty(default={}) + + created_at = DateTimeProperty(default_now=True) + embeddings = RelationshipTo('ImageEmbedding', 'HAS_EMBEDDING') + + +class ImageEmbedding(StructuredNode): + uid = UniqueIdProperty() + embedding = ArrayProperty(FloatProperty()) # 4096d multimodal vector + created_at = DateTimeProperty(default_now=True) +``` + +### 4. Neo4j Index Setup + +Management command: `python manage.py setup_neo4j_indexes` + +Creates vector indexes (4096d cosine), full-text indexes, and constraint indexes. + +### 5. Content-Type System + +Default library type configurations loaded via management command (`python manage.py load_library_types`). A management command is preferred over fixtures because these configurations will evolve across releases, and the command can be re-run idempotently to update defaults without overwriting per-library customizations. + +Default configurations: + +| Library Type | Chunking Strategy | Embedding Instruction | LLM Context | +|-------------|-------------------|----------------------|-------------| +| fiction | chapter_aware | narrative retrieval | "Excerpts from fiction..." | +| technical | section_aware | procedural retrieval | "Excerpts from technical docs..." 
| +| music | song_level | music discovery | "Song lyrics and metadata..." | +| film | scene_level | cinematic retrieval | "Film content..." | +| art | description_level | visual/stylistic retrieval | "Artwork descriptions..." | +| journal | entry_level | temporal/reflective retrieval | "Personal journal entries..." | + +### 6. Admin & Management UI + +`django-neomodel`'s admin support is limited — `StructuredNode` models don't participate in Django's ORM, so standard `ModelAdmin`, filters, search, and inlines don't work. Instead: + +- **Custom admin views** for Library, Collection, and Item CRUD using Cypher/neomodel queries, rendered in Django admin's template structure +- **DRF management API** (`/api/v1/library/`, `/api/v1/collection/`, `/api/v1/item/`) for programmatic access and future frontend consumption +- Library CRUD includes content-type configuration editing +- Collection/Item views support filtering by library, type, and date +- All admin views extend `themis/base.html` for consistent navigation + +### 7. LLM Manager (Port from Spelunker) + +Copy and adapt `llm_manager/` app from Spelunker: +- `LLMApi` model (OpenAI-compatible API endpoints) +- `LLMModel` model (with new `reranker` and `multimodal_embed` model types) +- `LLMUsage` tracking +- **API key storage uses Themis `UserAPIKey`** — LLM Manager does not implement its own encrypted key storage. API credentials for LLM providers are stored via Themis's Fernet-encrypted `UserAPIKey` model with `key_type='api'` and appropriate `service_name` (e.g., "OpenAI", "Arke"). `LLMApi` references credentials by service name lookup against the requesting user's Themis keys. 
+ +Schema additions to Spelunker's `LLMModel`: + +| Field | Change | Purpose | +|-------|--------|---------| +| `model_type` | Add choices: `reranker`, `multimodal_embed` | Support Qwen3-VL reranker and embedding models | +| `supports_multimodal` | New `BooleanField` | Flag models that accept image+text input | +| `vector_dimensions` | New `IntegerField` | Embedding output dimensions (e.g., 4096) | + +### 8. Infrastructure Wiring (Ouranos) + +All connections follow Ouranos DNS conventions — use `.incus` hostnames, never hardcode IPs. + +| Service | Host | Connection | Settings Variable | +|---------|------|------------|-------------------| +| PostgreSQL | `portia.incus:5432` | Database `mnemosyne` (must be provisioned) | `DATABASE_URL` | +| Neo4j (Bolt) | `ariel.incus:25554` | Neo4j 5.26.0 | `NEOMODEL_NEO4J_BOLT_URL` | +| Neo4j (HTTP) | `ariel.incus:25584` | Browser/API access | — | +| RabbitMQ | `oberon.incus:5672` | Message broker | `CELERY_BROKER_URL` | +| S3 (Incus) | Terraform-provisioned Incus bucket | MinIO-backed object storage | `AWS_S3_ENDPOINT_URL`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_STORAGE_BUCKET_NAME` | +| Arke LLM Proxy | `sycorax.incus:25540` | LLM API routing | Configured per `LLMApi` record | +| SMTP (dev) | `oberon.incus:22025` | smtp4dev test server | `EMAIL_HOST` | +| Loki (logs) | `prospero.incus:3100` | Via Alloy agent (host-level, not app-level) | — | +| Casdoor SSO | `titania.incus:22081` | Future: SSO pattern | — | + +**Terraform provisioning required before Phase 1 deployment:** +- PostgreSQL database `mnemosyne` on Portia +- Incus S3 bucket for Mnemosyne content storage +- HAProxy route: `mnemosyne.ouranos.helu.ca` → `puck.incus:` (port TBD, assign next available in 22xxx range) + +**Development environment (local):** +- PostgreSQL for Django ORM on `portia.incus` +- Local Neo4j instance or `ariel.incus` via SSH tunnel +- `django.core.files.storage.FileSystemStorage` in place of S3 (tests/dev) +- 
`CELERY_TASK_ALWAYS_EAGER=True` for synchronous task execution + +### 9. Testing Strategy + +Follows Red Panda Standards: Django `TestCase`, separate test files per module. + +| Test File | Scope | +|-----------|-------| +| `library/tests/test_models.py` | Neo4j node creation, relationships, property validation | +| `library/tests/test_content_types.py` | `load_library_types` command, configuration retrieval per library | +| `library/tests/test_indexes.py` | `setup_neo4j_indexes` command execution | +| `library/tests/test_api.py` | DRF endpoints for Library/Collection/Item CRUD | +| `library/tests/test_admin_views.py` | Custom admin views render and submit correctly | +| `llm_manager/tests/test_models.py` | LLMApi, LLMModel creation, new model types | +| `llm_manager/tests/test_api.py` | LLM Manager API endpoints | + +**Neo4j test strategy:** +- Tests use a dedicated Neo4j test database (separate from development/production) +- `NEOMODEL_NEO4J_BOLT_URL` overridden in test settings to point to test database +- Each test class clears its nodes in `setUp` / `tearDown` using `neomodel.clear_neo4j_database()` +- CI/CD (Gitea Runner on Puck) uses a Docker Neo4j instance for isolated test runs +- For local development without Neo4j, tests that require Neo4j are skipped via `@unittest.skipUnless(neo4j_available(), "Neo4j not available")` + +## Dependencies + +```toml +# pyproject.toml — floor-pinned with ceiling per Red Panda Standards +dependencies = [ + "Django>=5.2,<6.0", + "djangorestframework>=3.14,<4.0", + "django-neomodel>=0.1,<1.0", + "neomodel>=5.3,<6.0", + "neo4j>=5.0,<6.0", + "celery>=5.3,<6.0", + "django-storages[boto3]>=1.14,<2.0", + "django-environ>=0.11,<1.0", + "psycopg[binary]>=3.1,<4.0", + "dj-database-url>=2.1,<3.0", + "shortuuid>=1.0,<2.0", + "gunicorn>=21.0,<24.0", + "cryptography>=41.0,<45.0", + "flower>=2.0,<3.0", + "pymemcache>=4.0,<5.0", + "django-heluca-themis", +] +``` + +## Success Criteria + +- [ ] Config module renamed to `config/`, 
`pyproject.toml` at repo root with floor-pinned deps +- [ ] Settings load from environment variables via `django-environ` (`.env.example` provided) +- [ ] Django project runs with dual PostgreSQL + Neo4j databases +- [ ] Can create Library → Collection → Item through custom admin views +- [ ] DRF API endpoints return Library/Collection/Item data +- [ ] Neo4j graph shows correct node types and relationships +- [ ] Content-type configurations loaded via `load_library_types` and retrievable per library +- [ ] LLM Manager ported from Spelunker; uses Themis `UserAPIKey` for credential storage +- [ ] S3 storage configured against Incus bucket (Terraform-provisioned) and tested +- [ ] Celery worker connects to RabbitMQ on Oberon +- [ ] Structured logging configured (JSON format, compatible with Loki/Alloy) +- [ ] Tests pass for all Phase 1 apps (library, llm_manager) +- [ ] HAProxy route provisioned: `mnemosyne.ouranos.helu.ca` diff --git a/docs/PHASE_2_EMBEDDING_PIPELINE.md b/docs/PHASE_2_EMBEDDING_PIPELINE.md new file mode 100644 index 0000000..a69eb73 --- /dev/null +++ b/docs/PHASE_2_EMBEDDING_PIPELINE.md @@ -0,0 +1,498 @@ +# Phase 2: Embedding Pipeline + +## Objective + +Build the complete document ingestion and embedding pipeline: upload content → parse (text + images) → chunk (content-type-aware) → embed via configurable model → store vectors in Neo4j → extract concepts for knowledge graph. + +## Heritage + +The embedding pipeline adapts proven patterns from [Spelunker](https://git.helu.ca/r/spelunker)'s `rag/services/embeddings.py` — semantic chunking, batch embedding, S3 chunk storage, and progress tracking — enhanced with multimodal capabilities, knowledge graph relationships, and content-type awareness. 
+ +## Architecture Overview + +``` +Upload (API/Admin) + → S3 Storage (original file) + → Document Parsing (PyMuPDF — text + images) + → Content-Type-Aware Chunking (semantic-text-splitter) + → Text Embedding (system embedding model via LLM Manager) + → Image Embedding (multimodal model, if available) + → Neo4j Graph Storage (Chunk nodes, Image nodes, vectors) + → Concept Extraction (system chat model) + → Knowledge Graph (Concept nodes, MENTIONS/REFERENCES edges) +``` + +## Deliverables + +### 1. Document Parsing Service (`library/services/parsers.py`) + +**Primary parser: PyMuPDF** — a single library handling all document formats with unified text + image extraction. + +#### Supported Formats + +| Format | Extensions | Text Extraction | Image Extraction | +|--------|-----------|----------------|-----------------| +| PDF | `.pdf` | Layout-preserving text | Embedded images, diagrams | +| EPUB | `.epub` | Chapter-structured HTML | Cover art, illustrations | +| DOCX | `.docx` | Via HTML conversion | Inline images, diagrams | +| PPTX | `.pptx` | Via HTML conversion | Slide images, charts | +| XLSX | `.xlsx` | Via HTML conversion | Embedded charts | +| XPS | `.xps` | Native | Native | +| MOBI | `.mobi` | Native | Native | +| FB2 | `.fb2` | Native | Native | +| CBZ | `.cbz` | Native | Native (comic pages) | +| Plain text | `.txt`, `.md` | Direct read | N/A | +| HTML | `.html`, `.htm` | PyMuPDF or direct | Inline images | +| Images | `.jpg`, `.png`, etc. 
| N/A (OCR future) | The image itself | + +#### Text Sanitization + +Ported from Spelunker's `text_utils.py`: +- Remove null bytes and control characters +- Remove zero-width characters +- Normalize Unicode to NFC +- Replace invalid UTF-8 sequences +- Clean PDF ligatures and artifacts +- Normalize whitespace + +#### Image Extraction + +For each document page/section, extract embedded images via `page.get_images()` → `doc.extract_image(xref)`: +- Raw image bytes (PNG/JPEG) +- Dimensions (width × height) +- Source page/position for chunk-image association +- Store in S3: `images/{item_uid}/{image_index}.{ext}` + +#### Parse Result Structure + +```python +@dataclass +class TextBlock: + text: str + page: int + metadata: dict # {heading_level, section_name, etc.} + +@dataclass +class ExtractedImage: + data: bytes + ext: str # png, jpg, etc. + width: int + height: int + source_page: int + source_index: int + +@dataclass +class ParseResult: + text_blocks: list[TextBlock] + images: list[ExtractedImage] + metadata: dict # {page_count, title, author, etc.} + file_type: str +``` + +### 2. Content-Type-Aware Chunking Service (`library/services/chunker.py`) + +Uses `semantic-text-splitter` with HuggingFace tokenizer (proven in Spelunker). 
+ +#### Strategy Dispatch + +Based on `Library.chunking_config`: + +| Strategy | Library Type | Boundary Markers | Chunk Size | Overlap | +|----------|-------------|-----------------|-----------|---------| +| `chapter_aware` | Fiction | chapter, scene, paragraph | 1024 | 128 | +| `section_aware` | Technical | section, subsection, code_block, list | 512 | 64 | +| `song_level` | Music | song, verse, chorus | 512 | 32 | +| `scene_level` | Film | scene, act, sequence | 768 | 64 | +| `description_level` | Art | artwork, description, analysis | 512 | 32 | +| `entry_level` | Journal | entry, date, paragraph | 512 | 32 | + +#### Chunk-Image Association + +Track which images appeared near which text chunks: +- PDF: image bounding boxes on specific pages +- DOCX/PPTX: images associated with slides/sections +- EPUB: images referenced from specific chapters + +Creates `Chunk -[HAS_NEARBY_IMAGE]-> Image` relationships with proximity metadata. + +#### Chunk Storage + +- Chunk text stored in S3: `chunks/{item_uid}/chunk_{index}.txt` +- `text_preview` (first 500 chars) stored on Chunk node for full-text indexing + +### 3. Embedding Client (`library/services/embedding_client.py`) + +Multi-backend embedding client dispatching by `LLMApi.api_type`. 
+ +#### Backend Support + +| API Type | Protocol | Auth | Batch Support | +|----------|---------|------|---------------| +| `openai` | HTTP POST `/embeddings` | API key header | Native batch | +| `vllm` | HTTP POST `/embeddings` | API key header | Native batch | +| `llama-cpp` | HTTP POST `/embeddings` | API key header | Native batch | +| `ollama` | HTTP POST `/embeddings` | None | Native batch | +| `bedrock` | HTTP POST `/model/{id}/invoke` | Bearer token | Client-side loop | + +#### Bedrock Integration + +Uses Amazon Bedrock API keys (Bearer token auth) — no boto3 SDK required: + +``` +POST https://bedrock-runtime.{region}.amazonaws.com/model/{model_id}/invoke +Authorization: Bearer {bedrock_api_key} +Content-Type: application/json + +{"inputText": "text to embed", "dimensions": 1024, "normalize": true} +→ {"embedding": [float, ...], "inputTextTokenCount": 42} +``` + +**LLMApi setup for Bedrock embeddings:** +- `api_type`: `"bedrock"` +- `base_url`: `https://bedrock-runtime.us-east-1.amazonaws.com` +- `api_key`: Bedrock API key (encrypted) + +**LLMApi setup for Bedrock chat (Claude, etc.):** +- `api_type`: `"openai"` (Mantle endpoint is OpenAI-compatible) +- `base_url`: `https://bedrock-mantle.us-east-1.api.aws/v1` +- `api_key`: Same Bedrock API key + +#### Embedding Instruction Prefix + +Before embedding, prepend the library's `embedding_instruction` to each chunk: +``` +"{embedding_instruction}\n\n{chunk_text}" +``` + +#### Image Embedding + +For multimodal models (`model.supports_multimodal`): +- Send base64-encoded image to the embedding endpoint +- Create `ImageEmbedding` node with the resulting vector +- If no multimodal model available, skip (images stored but not embedded) + +#### Model Matching + +Track embedded model by **name** (not UUID). Multiple APIs can serve the same model — matching by name allows provider switching without re-embedding. + +### 4. 
Pipeline Orchestrator (`library/services/pipeline.py`) + +Coordinates the full flow: parse → chunk → embed → store → graph. + +#### Pipeline Stages + +1. **Parse**: Extract text blocks + images from document +2. **Chunk**: Split text using content-type-aware strategy +3. **Store chunks**: S3 + Chunk nodes in Neo4j +4. **Embed text**: Generate vectors for all chunks +5. **Store images**: S3 + Image nodes in Neo4j +6. **Embed images**: Multimodal vectors (if available) +7. **Extract concepts**: Named entities from chunk text (via system chat model) +8. **Build graph**: Create Concept nodes, MENTIONS/REFERENCES edges + +#### Idempotency + +- Check `Item.content_hash` — skip if already processed with same hash +- Re-embedding deletes existing Chunk/Image nodes before re-processing + +#### Dimension Compatibility + +- Validate that the system embedding model's `vector_dimensions` matches the Neo4j vector index dimensions +- Warn at embed time if mismatch detected + +### 5. Concept Extraction (`library/services/concepts.py`) + +Uses the system chat model for LLM-based named entity recognition. + +- Extract: people, places, topics, techniques, themes +- Create/update `Concept` nodes (deduplicated by name via unique_index) +- Connect: `Chunk -[MENTIONS]-> Concept`, `Item -[REFERENCES]-> Concept` +- Embed concept names for vector search +- If no system chat model configured, concept extraction is skipped + +### 6. Celery Tasks (`library/tasks.py`) + +All tasks pass IDs (not model instances) per Red Panda Standards. 
+ +| Task | Queue | Purpose | +|------|-------|---------| +| `embed_item(item_uid)` | `embedding` | Full pipeline for single item | +| `embed_collection(collection_uid)` | `batch` | All items in a collection | +| `embed_library(library_uid)` | `batch` | All items in a library | +| `batch_embed_items(item_uids)` | `batch` | Specific items | +| `reembed_item(item_uid)` | `embedding` | Delete + re-embed | + +Tasks are idempotent, include retry logic, and track progress via Memcached: `library:task:{task_id}:progress`. + +### 7. Prometheus Metrics (`library/metrics.py`) + +Custom metrics for pipeline observability: + +| Metric | Type | Labels | Purpose | +|--------|------|--------|---------| +| `mnemosyne_documents_parsed_total` | Counter | file_type, status | Parse throughput | +| `mnemosyne_document_parse_duration_seconds` | Histogram | file_type | Parse latency | +| `mnemosyne_images_extracted_total` | Counter | file_type | Image extraction volume | +| `mnemosyne_chunks_created_total` | Counter | library_type, strategy | Chunk throughput | +| `mnemosyne_chunk_size_tokens` | Histogram | — | Chunk size distribution | +| `mnemosyne_embeddings_generated_total` | Counter | model_name, api_type, content_type | Embedding throughput | +| `mnemosyne_embedding_batch_duration_seconds` | Histogram | model_name, api_type | API latency | +| `mnemosyne_embedding_api_errors_total` | Counter | model_name, api_type, error_type | API failures | +| `mnemosyne_embedding_tokens_total` | Counter | model_name | Token consumption | +| `mnemosyne_pipeline_items_total` | Counter | status | Pipeline throughput | +| `mnemosyne_pipeline_item_duration_seconds` | Histogram | — | End-to-end latency | +| `mnemosyne_pipeline_items_in_progress` | Gauge | — | Concurrent processing | +| `mnemosyne_concepts_extracted_total` | Counter | concept_type | Concept extraction volume | + +### 8. 
Model Changes + +#### Item Node — New Fields + +| Field | Type | Purpose | +|-------|------|---------| +| `embedding_status` | StringProperty | pending / processing / completed / failed | +| `embedding_model_name` | StringProperty | Name of model that generated embeddings | +| `chunk_count` | IntegerProperty | Number of chunks created | +| `image_count` | IntegerProperty | Number of images extracted | +| `error_message` | StringProperty | Last error message (if failed) | + +#### New Relationship Model + +```python +class NearbyImageRel(StructuredRel): + proximity = StringProperty(default="same_page") # same_page, inline, same_slide, same_chapter +``` + +#### Chunk Node — New Relationship + +```python +nearby_images = RelationshipTo('Image', 'HAS_NEARBY_IMAGE', model=NearbyImageRel) +``` + +#### LLMApi Model — New API Type + +Add `("bedrock", "Amazon Bedrock")` to `api_type` choices. + +### 9. API Enhancements + +- `POST /api/v1/library/items/` — File upload with auto-trigger of `embed_item` task +- `POST /api/v1/library/items/<item_uid>/reembed/` — Re-embed endpoint +- `GET /api/v1/library/items/<item_uid>/status/` — Embedding status check +- Admin views: File upload field on item create, embedding status display + +### 10. Management Commands + +| Command | Purpose | +|---------|---------| +| `embed_item <item_uid>` | CLI embedding for testing | +| `embed_collection <collection_uid>` | CLI batch embedding | +| `embedding_status` | Show embedding progress/statistics | + +### 11. Dynamic Vector Index Dimensions + +Update `setup_neo4j_indexes` to read dimensions from `LLMModel.get_system_embedding_model().vector_dimensions` instead of hardcoding 4096. 
+ +## Celery Workers & Scheduler + +### Prerequisites + +- RabbitMQ running on `oberon.incus:5672` with `mnemosyne` vhost and user +- `.env` configured with `CELERY_BROKER_URL=amqp://mnemosyne:password@oberon.incus:5672/mnemosyne` +- Virtual environment activated: `source ~/env/mnemosyne/bin/activate` + +### Queues + +Mnemosyne uses three Celery queues with task routing configured in `settings.py`: + +| Queue | Tasks | Purpose | Recommended Concurrency | +|-------|-------|---------|------------------------| +| `celery` (default) | `llm_manager.validate_all_llm_apis`, `llm_manager.validate_single_api` | LLM API validation & model discovery | 2 | +| `embedding` | `library.tasks.embed_item`, `library.tasks.reembed_item` | Single-item embedding pipeline (GPU-bound) | 1 | +| `batch` | `library.tasks.embed_collection`, `library.tasks.embed_library`, `library.tasks.batch_embed_items` | Batch orchestration (dispatches to embedding queue) | 2 | + +Task routing (`settings.py`) — routes are listed per task because a `library.tasks.embed_*` glob would also capture `embed_collection` / `embed_library` and would miss `reembed_item`: +```python +CELERY_TASK_ROUTES = { + "library.tasks.embed_item": {"queue": "embedding"}, + "library.tasks.reembed_item": {"queue": "embedding"}, + "library.tasks.embed_collection": {"queue": "batch"}, + "library.tasks.embed_library": {"queue": "batch"}, + "library.tasks.batch_embed_items": {"queue": "batch"}, +} +``` + +### Starting Workers + +All commands run from the Django project root (`mnemosyne/`): + +**Development — single worker, all queues:** +```bash +cd mnemosyne +celery -A mnemosyne worker -l info -Q celery,embedding,batch +``` + +**Development — eager mode (no worker needed):** + +Set `CELERY_TASK_ALWAYS_EAGER=True` in `.env`. All tasks execute synchronously in the web process. Useful for debugging but does not test async behavior. + +**Production — separate workers per queue:** +```bash +# Embedding worker (single concurrency — GPU is sequential) +celery -A mnemosyne worker \ + -l info \ + -Q embedding \ + -c 1 \ + -n embedding@%h \ + --max-tasks-per-child=100 + +# Batch orchestration worker +celery -A mnemosyne worker \ + -l info \ + -Q batch \ + -c 2 \ + -n batch@%h + +# Default queue worker (LLM API validation, etc.) 
+celery -A mnemosyne worker \ + -l info \ + -Q celery \ + -c 2 \ + -n default@%h +``` + +### Celery Beat (Periodic Scheduler) + +Celery Beat runs scheduled tasks (e.g., periodic LLM API validation): + +```bash +# File-based scheduler (simple, stores schedule in celerybeat-schedule file) +celery -A mnemosyne beat -l info + +# Or with Django database scheduler (if django-celery-beat is installed) +celery -A mnemosyne beat -l info --scheduler django_celery_beat.schedulers:DatabaseScheduler +``` + +Example periodic task schedule (add to `settings.py` if needed): +```python +from celery.schedules import crontab + +CELERY_BEAT_SCHEDULE = { + "validate-llm-apis-daily": { + "task": "llm_manager.validate_all_llm_apis", + "schedule": crontab(hour=6, minute=0), # Daily at 6 AM + }, +} +``` + +### Flower (Task Monitoring) + +[Flower](https://flower.readthedocs.io/) provides a real-time web UI for monitoring Celery workers and tasks: + +```bash +celery -A mnemosyne flower --port=5555 +``` + +Access at `http://localhost:5555`. 
Shows: +- Active/completed/failed tasks +- Worker status and resource usage +- Task execution times and retry counts +- Queue depths + +### Reliability Configuration + +The following settings are already configured in `settings.py`: + +| Setting | Value | Purpose | +|---------|-------|---------| +| `CELERY_TASK_ACKS_LATE` | `True` | Acknowledge tasks after execution (not on receipt) — prevents task loss on worker crash | +| `CELERY_WORKER_PREFETCH_MULTIPLIER` | `1` | Workers fetch one task at a time — ensures fair distribution across workers | +| `CELERY_ACCEPT_CONTENT` | `["json"]` | Only accept JSON-serialized tasks | +| `CELERY_TASK_SERIALIZER` | `"json"` | Serialize task arguments as JSON | + +### Task Progress Tracking + +Embedding tasks report progress via Memcached using the key pattern: +``` +library:task:{task_id}:progress → {"percent": 45, "message": "Embedded 12/27 chunks"} +``` + +Tasks also update Celery's native state: +```python +# Query task progress from Python +from celery.result import AsyncResult +result = AsyncResult(task_id) +result.state # "PROGRESS", "SUCCESS", "FAILURE" +result.info # {"percent": 45, "message": "..."} +``` + +## Dependencies + +```toml +# New additions to pyproject.toml +"PyMuPDF>=1.24,<2.0", +"pymupdf4llm>=0.0.17,<1.0", +"semantic-text-splitter>=0.20,<1.0", +"tokenizers>=0.20,<1.0", +"Pillow>=10.0,<12.0", +"django-prometheus>=2.3,<3.0", +``` + +### License Note + +PyMuPDF is AGPL-3.0 licensed. Acceptable for self-hosted personal use. Commercial distribution would require Artifex's commercial license. 
+ +## File Structure + +``` +mnemosyne/library/ +├── services/ +│ ├── __init__.py +│ ├── parsers.py # PyMuPDF universal document parsing +│ ├── text_utils.py # Text sanitization (from Spelunker) +│ ├── chunker.py # Content-type-aware chunking +│ ├── embedding_client.py # Multi-backend embedding API client +│ ├── pipeline.py # Orchestration: parse → chunk → embed → graph +│ └── concepts.py # LLM-based concept extraction +├── metrics.py # Prometheus metrics definitions +├── tasks.py # Celery tasks for async embedding +├── management/commands/ +│ ├── embed_item.py +│ ├── embed_collection.py +│ └── embedding_status.py +└── tests/ + ├── test_parsers.py + ├── test_text_utils.py + ├── test_chunker.py + ├── test_embedding_client.py + ├── test_pipeline.py + ├── test_concepts.py + └── test_tasks.py +``` + +## Testing Strategy + +All tests use Django `TestCase`. External services (LLM APIs, Neo4j) are mocked. + +| Test File | Scope | +|-----------|-------| +| `test_parsers.py` | PyMuPDF parsing for each file type, image extraction, text sanitization | +| `test_text_utils.py` | Sanitization functions, PDF artifact cleaning, Unicode normalization | +| `test_chunker.py` | Content-type strategies, boundary detection, chunk-image association | +| `test_embedding_client.py` | OpenAI-compat + Bedrock backends (mocked HTTP), batch processing, usage tracking | +| `test_pipeline.py` | Full pipeline integration (mocked), S3 storage, idempotency | +| `test_concepts.py` | Concept extraction, deduplication, graph relationships | +| `test_tasks.py` | Celery tasks (eager mode), retry logic, error handling | + +## Success Criteria + +- [ ] Upload a document (PDF, EPUB, DOCX, PPTX, TXT) via API or admin → file stored in S3 +- [ ] Images extracted from documents and stored as Image nodes in Neo4j +- [ ] Document automatically chunked using content-type-aware strategy +- [ ] Chunks embedded via system embedding model and vectors stored in Neo4j Chunk nodes +- [ ] Images embedded multimodally 
into ImageEmbedding nodes (when multimodal model available) +- [ ] Chunk-image proximity relationships established in graph +- [ ] Concepts extracted and graph populated with MENTIONS/REFERENCES relationships +- [ ] Neo4j vector indexes usable for similarity queries on stored embeddings +- [ ] Celery tasks handle async embedding with progress tracking +- [ ] Re-embedding works (delete old chunks, re-process) +- [ ] Content hash prevents redundant re-embedding +- [ ] Prometheus metrics exposed at `/metrics` for pipeline monitoring +- [ ] All tests pass with mocked LLM/embedding APIs +- [ ] Bedrock embedding works via Bearer token HTTP (no boto3) diff --git a/docs/Pattern_Async-TASK_V1-00.md b/docs/Pattern_Async-TASK_V1-00.md new file mode 100644 index 0000000..9b81991 --- /dev/null +++ b/docs/Pattern_Async-TASK_V1-00.md @@ -0,0 +1,673 @@ +# Async Task Pattern v1.0.0 + +Defines how Spelunker Django apps implement background task processing using Celery, RabbitMQ, Memcached, and Flower — covering fire-and-forget tasks, long-running batch jobs, signal-triggered tasks, and periodic scheduled tasks. + +## 🐾 Red Panda Approval™ + +This pattern follows Red Panda Approval standards. + +--- + +## Why a Pattern, Not a Shared Implementation + +Long-running work in Spelunker spans multiple domains, each with distinct progress-tracking and state requirements: + +- A `solution_library` document embedding task needs to update `review_status` on a `Document` and count vector chunks created. +- An `rfp_manager` batch job tracks per-question progress, per-question errors, and the Celery task ID on an `RFPBatchJob` record. +- An `llm_manager` API-validation task iterates over all active APIs and accumulates model sync statistics. +- A `solution_library` documentation-source sync task fires from a View, stores `celery_task_id` on a `SyncJob`, and reports incremental progress via a callback. 
+ +A single shared task implementation cannot accommodate this variability. + +Instead, this pattern defines: + +- **Required task interface** — every task must have a namespaced name, a structured return dict, and structured logging. +- **Recommended job-tracking fields** — most tasks that represent a significant unit of work should have a corresponding DB job record. +- **Error handling conventions** — how to catch, log, and reflect failures back to the record. +- **Dispatch variants** — signal-triggered, admin action, view-triggered, and periodic (Beat). +- **Infrastructure conventions** — broker, result backend, serialization, and cache settings. + +--- + +## Required Task Interface + +Every Celery task in Spelunker **must** follow this shape: + +```python +from celery import shared_task +import logging + +logger = logging.getLogger(__name__) + +@shared_task(name='<app_name>.<task_name>') +def my_task(primary_id: int, user_id: int = None) -> dict: + """One-line description of what this task does.""" + try: + # ... do work ... + logger.info(f"Task succeeded for {primary_id}") + return {'success': True, 'id': primary_id} + + except Exception as e: + logger.error( + f"Task failed for {primary_id}: {type(e).__name__}: {e}", + extra={'id': primary_id, 'error': str(e)}, + exc_info=True, + ) + return {'success': False, 'id': primary_id, 'error': str(e)} +``` + +| Requirement | Rule | +|---|---| +| `name` | Must be `'<app_name>.<task_name>'`, e.g., `'solution_library.embed_document'` | +| Return value | Always a dict with at minimum `{'success': bool}` | +| Logging | Use structured `extra={}` kwargs; never swallow exceptions without logging them | +| Import style | Use `@shared_task`, not direct `app.task` references | +| Idempotency | Tasks **must** be safe to re-execute with the same arguments (broker redelivery, worker crash). Use `update_or_create`, check-before-write, or guard with the job record's status before re-processing. | +| Arguments | Pass only JSON-serialisable primitives (PKs, strings, numbers). Never pass ORM instances. 
| + +--- + +## Retry & Time-Limit Policy + +Tasks that call external services (LLM APIs, S3, remote URLs) should declare automatic retries for transient failures. Tasks must also set time limits to prevent hung workers. + +### Recommended Retry Decorator + +```python +@shared_task( + name='.', + bind=True, + autoretry_for=(ConnectionError, TimeoutError), + retry_backoff=60, # first retry after 60 s, then 120 s, 240 s … + retry_backoff_max=600, # cap at 10 minutes + retry_jitter=True, # add randomness to avoid thundering herd + max_retries=3, + soft_time_limit=1800, # raise SoftTimeLimitExceeded after 30 min + time_limit=2100, # hard-kill after 35 min +) +def my_task(self, primary_id: int, ...): + ... +``` + +| Setting | Purpose | Guideline | +|---|---|---| +| `autoretry_for` | Exception classes that trigger an automatic retry | Use for **transient** errors only (network, timeout). Never for `ValueError` or business-logic errors. | +| `retry_backoff` | Seconds before first retry (doubles each attempt) | 60 s is a reasonable default for external API calls. | +| `max_retries` | Maximum retry attempts | 3 for API calls; 0 (no retry) for user-triggered batch jobs that track their own progress. | +| `soft_time_limit` | Raises `SoftTimeLimitExceeded` — allows graceful cleanup | Set on every task. Catch it to mark the job record as failed. | +| `time_limit` | Hard `SIGKILL` — last resort | Set 5–10 min above `soft_time_limit`. | + +### Handling `SoftTimeLimitExceeded` + +```python +from celery.exceptions import SoftTimeLimitExceeded + +@shared_task(bind=True, soft_time_limit=1800, time_limit=2100, ...) 
+def long_running_task(self, job_id: int): + job = MyJob.objects.get(id=job_id) + try: + for item in items: + process(item) + except SoftTimeLimitExceeded: + logger.warning(f"Job {job_id} hit soft time limit — marking as failed") + job.status = 'failed' + job.completed_at = timezone.now() + job.save() + return {'success': False, 'job_id': job_id, 'error': 'Time limit exceeded'} +``` + +> **Note:** Batch jobs in `rfp_manager` do **not** use `autoretry_for` because they track per-question progress and should not re-run the entire batch. Instead, individual question failures are logged and the batch continues. + +--- + +## Standard Values / Conventions + +### Task Name Registry + +| App | Task name | Trigger | +|---|---|---| +| `solution_library` | `solution_library.embed_document` | Signal / admin action | +| `solution_library` | `solution_library.embed_documents_batch` | Admin action | +| `solution_library` | `solution_library.sync_documentation_source` | View / admin action | +| `solution_library` | `solution_library.sync_all_documentation_sources` | Celery Beat (periodic) | +| `rfp_manager` | `rfp_manager.summarize_information_document` | Admin action | +| `rfp_manager` | `rfp_manager.batch_generate_responder_answers` | View | +| `rfp_manager` | `rfp_manager.batch_generate_reviewer_answers` | View | +| `llm_manager` | `llm_manager.validate_all_llm_apis` | Celery Beat (periodic) | +| `llm_manager` | `llm_manager.validate_single_api` | Admin action | + +### Job Status Choices (DB Job Records) + +```python +STATUS_PENDING = 'pending' +STATUS_PROCESSING = 'processing' +STATUS_COMPLETED = 'completed' +STATUS_FAILED = 'failed' +STATUS_CANCELLED = 'cancelled' # optional — used by rfp_manager +``` + +--- + +## Recommended Job-Tracking Fields + +Tasks that represent a significant unit of work should write their state to a DB model. 
These are the recommended fields: + +```python +class MyJobModel(models.Model): + # Celery linkage + celery_task_id = models.CharField( + max_length=255, blank=True, + help_text="Celery task ID for Flower monitoring" + ) + + # Status lifecycle + status = models.CharField( + max_length=20, choices=STATUS_CHOICES, default=STATUS_PENDING + ) + started_at = models.DateTimeField(null=True, blank=True) + completed_at = models.DateTimeField(null=True, blank=True) + + # Audit + started_by = models.ForeignKey( + User, on_delete=models.PROTECT, related_name='+' + ) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + # Error accumulation + errors = models.JSONField(default=list) + + class Meta: + indexes = [ + models.Index(fields=['celery_task_id']), + models.Index(fields=['-created_at']), + ] +``` + +For batch jobs that process many items, add counter fields: + +```python + total_items = models.IntegerField(default=0) + processed_items = models.IntegerField(default=0) + successful_items = models.IntegerField(default=0) + failed_items = models.IntegerField(default=0) + + def get_progress_percentage(self) -> int: + if self.total_items == 0: + return 0 + return int((self.processed_items / self.total_items) * 100) + + def is_stale(self, timeout_minutes: int = 30) -> bool: + """True if stuck in pending/processing without recent updates.""" + if self.status not in (self.STATUS_PENDING, self.STATUS_PROCESSING): + return False + return (timezone.now() - self.updated_at).total_seconds() > (timeout_minutes * 60) +``` + +--- + +## Variant 1 — Fire-and-Forget (Signal-Triggered) + +Automatically dispatch a task whenever a model record is saved. Used by `solution_library` to kick off embedding whenever a `Document` is created. 
+ +```python +# solution_library/signals.py +from django.db.models.signals import post_save +from django.dispatch import receiver +from django.conf import settings + +@receiver(post_save, sender=Document) +def trigger_document_embedding(sender, instance, created, **kwargs): + if not created: + return + if not getattr(settings, 'AUTO_EMBED_DOCUMENTS', True): + return + + from solution_library.tasks import embed_document_task # avoid circular import + + from django.db import transaction + + def _dispatch(): + try: + task = embed_document_task.delay( + document_id=instance.id, + embedding_model_id=instance.embedding_model_id or None, + user_id=None, + ) + logger.info(f"Queued embedding task {task.id} for document {instance.id}") + except Exception as e: + logger.error(f"Failed to queue embedding task for document {instance.id}: {e}") + + # Dispatch AFTER the transaction commits so the worker can read the row + transaction.on_commit(_dispatch) +``` + +The corresponding task updates the record's status field at start and completion: + +```python +@shared_task(name='solution_library.embed_document') +def embed_document_task(document_id: int, embedding_model_id: int = None, user_id: int = None): + document = Document.objects.get(id=document_id) + document.review_status = 'processing' + document.save(update_fields=['review_status', 'embedding_model']) + + # ... perform work ... + + document.review_status = 'pending' + document.save(update_fields=['review_status']) + return {'success': True, 'document_id': document_id, 'chunks_created': count} +``` + +--- + +## Variant 2 — Long-Running Batch Job (View or Admin Triggered) + +Used by `rfp_manager` for multi-hour batch RAG processing. The outer transaction creates the DB job record first, then dispatches the Celery task, passing the job's PK. 
+ +```python +# rfp_manager/views.py (dispatch) +from django.db import transaction + +job = RFPBatchJob.objects.create( + rfp=rfp, + started_by=request.user, + job_type=RFPBatchJob.JOB_TYPE_RESPONDER, + status=RFPBatchJob.STATUS_PENDING, +) + +def _dispatch(): + task = batch_generate_responder_answers.delay(rfp.pk, request.user.pk, job.pk) + # Save the Celery task ID for Flower cross-reference + job.celery_task_id = task.id + job.save(update_fields=['celery_task_id']) + +# IMPORTANT: dispatch after the transaction commits so the worker +# can read the job row. Without this, the worker may receive the +# message before the row is visible, causing DoesNotExist. +transaction.on_commit(_dispatch) +``` + +Inside the task, use `bind=True` to get the Celery task ID: + +```python +@shared_task(bind=True, name='rfp_manager.batch_generate_responder_answers') +def batch_generate_responder_answers(self, rfp_id: int, user_id: int, job_id: int): + job = RFPBatchJob.objects.get(id=job_id) + job.status = RFPBatchJob.STATUS_PROCESSING + job.started_at = timezone.now() + job.celery_task_id = self.request.id # authoritative Celery ID + job.save() + + for item in items_to_process: + try: + # ... process item ... + job.processed_questions += 1 + job.successful_questions += 1 + job.save(update_fields=['processed_questions', 'successful_questions', 'updated_at']) + except Exception as e: + job.add_error(item, str(e)) + + job.status = RFPBatchJob.STATUS_COMPLETED + job.completed_at = timezone.now() + job.save() + return {'success': True, 'job_id': job_id} +``` + +--- + +## Variant 3 — Progress-Callback Task (View or Admin Triggered) + +Used by `solution_library`'s `sync_documentation_source_task` when an underlying synchronous service needs to stream incremental progress updates back to the DB. 
+ +```python +@shared_task(bind=True, name='solution_library.sync_documentation_source') +def sync_documentation_source_task(self, source_id: int, user_id: int, job_id: int): + job = SyncJob.objects.get(id=job_id) + job.status = SyncJob.STATUS_PROCESSING + job.started_at = timezone.now() + job.celery_task_id = self.request.id + job.save(update_fields=['status', 'started_at', 'celery_task_id', 'updated_at']) + + def update_progress(created, updated, skipped, processed, total): + job.documents_created = created + job.documents_updated = updated + job.documents_skipped = skipped + job.save(update_fields=['documents_created', 'documents_updated', + 'documents_skipped', 'updated_at']) + + result = sync_documentation_source(source_id, user_id, progress_callback=update_progress) + + job.status = SyncJob.STATUS_COMPLETED if result.status == 'completed' else SyncJob.STATUS_FAILED + job.completed_at = timezone.now() + job.save() + return {'success': True, 'job_id': job_id} +``` + +--- + +## Variant 4 — Periodic Task (Celery Beat) + +Used by `llm_manager` for hourly/daily API validation and by `solution_library` for nightly source syncs. Schedule via django-celery-beat in Django admin (no hardcoded schedules in code). 
+ +```python +@shared_task(name='llm_manager.validate_all_llm_apis') +def validate_all_llm_apis(): + """Periodic task: validate all active LLM APIs and refresh model lists.""" + active_apis = LLMApi.objects.filter(is_active=True) + results = {'tested': 0, 'successful': 0, 'failed': 0, 'details': []} + + for api in active_apis: + results['tested'] += 1 + try: + result = test_llm_api(api) + if result['success']: + results['successful'] += 1 + else: + results['failed'] += 1 + except Exception as e: + results['failed'] += 1 + logger.error(f"Error validating {api.name}: {e}", exc_info=True) + + return results + + +@shared_task(name='solution_library.sync_all_documentation_sources') +def sync_all_sources_task(): + """Periodic task: queue a sync for every active documentation source.""" + sources = DocumentationSource.objects.all() + system_user = User.objects.filter(is_superuser=True).first() + + for source in sources: + # Skip if an active sync job already exists + if SyncJob.objects.filter(source=source, + status__in=[SyncJob.STATUS_PENDING, + SyncJob.STATUS_PROCESSING]).exists(): + continue + + job = SyncJob.objects.create(source=source, started_by=system_user, + status=SyncJob.STATUS_PENDING) + sync_documentation_source_task.delay(source.id, system_user.id, job.id) + + return {'queued': queued, 'skipped': skipped} +``` + +--- + +## Infrastructure Configuration + +### `spelunker/celery.py` — App Entry Point + +```python +import os +from celery import Celery + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "spelunker.settings") + +app = Celery("spelunker") +app.config_from_object("django.conf:settings", namespace="CELERY") +app.autodiscover_tasks() # auto-discovers tasks.py in every INSTALLED_APP +``` + +### `settings.py` — Celery Settings + +```python +# Broker and result backend — supplied via environment variables +CELERY_BROKER_URL = env('CELERY_BROKER_URL') # amqp://spelunker:@rabbitmq:5672/spelunker +CELERY_RESULT_BACKEND = env('CELERY_RESULT_BACKEND') # rpc:// 
+ +# Serialization — JSON only (no pickle) +CELERY_ACCEPT_CONTENT = ['json'] +CELERY_TASK_SERIALIZER = 'json' +CELERY_RESULT_SERIALIZER = 'json' +CELERY_TIMEZONE = env('TIME_ZONE') + +# Result expiry — critical when using rpc:// backend. +# Uncollected results accumulate in worker memory without this. +CELERY_RESULT_EXPIRES = 3600 # 1 hour; safe because we store state in DB job records + +# Global time limits (can be overridden per-task with decorator args) +CELERY_TASK_SOFT_TIME_LIMIT = 1800 # 30 min soft limit → SoftTimeLimitExceeded +CELERY_TASK_TIME_LIMIT = 2100 # 35 min hard kill + +# Late ack: acknowledge messages AFTER task completes, not before. +# If a worker crashes mid-task, the broker redelivers the message. +CELERY_TASK_ACKS_LATE = True +CELERY_WORKER_PREFETCH_MULTIPLIER = 1 # fetch one task at a time per worker slot + +# Separate logging level for Celery vs. application code +CELERY_LOGGING_LEVEL = env('CELERY_LOGGING_LEVEL', default='INFO') +``` + +> **`CELERY_TASK_ACKS_LATE`**: Combined with idempotent tasks, this provides at-least-once delivery. If a worker process is killed (OOM, deployment), the message returns to the queue and another worker picks it up. This is why idempotency is a hard requirement. + +### `settings.py` — Memcached (Django Cache) + +Memcached is the Django HTTP-layer cache (sessions, view caching). It is **not** used as a Celery result backend. + +```python +CACHES = { + "default": { + "BACKEND": "django.core.cache.backends.memcached.PyMemcacheCache", + "LOCATION": env('KVDB_LOCATION'), # memcached:11211 + "KEY_PREFIX": env('KVDB_PREFIX'), # spelunker + "TIMEOUT": 300, + } +} +``` + +### `INSTALLED_APPS` — Required + +```python +INSTALLED_APPS = [ + ... + 'django_celery_beat', # DB-backed periodic task scheduler (Beat) + ... 
+] +``` + +### `docker-compose.yml` — Service Topology + +| Service | Image | Purpose | +|---|---|---| +| `rabbitmq` | `rabbitmq:3-management-alpine` | AMQP message broker | +| `memcached` | `memcached:1.6-alpine` | Django HTTP cache | +| `worker` | `spelunker:latest` | Celery worker (`--concurrency=4`) | +| `scheduler` | `spelunker:latest` | Celery Beat with `DatabaseScheduler` | +| `flower` | `mher/flower:latest` | Task monitoring UI (port 5555) | + +### Task Routing / Queues (Recommended) + +By default all tasks run in the `celery` default queue. For production deployments, separate CPU-heavy work from I/O-bound work: + +```python +# settings.py +CELERY_TASK_ROUTES = { + 'solution_library.embed_document': {'queue': 'embedding'}, + 'solution_library.embed_documents_batch': {'queue': 'embedding'}, + 'rfp_manager.batch_generate_*': {'queue': 'batch'}, + 'llm_manager.validate_*': {'queue': 'default'}, +} +``` + +```yaml +# docker-compose.yml — separate workers per queue +worker-default: + command: celery -A spelunker worker -Q default --concurrency=4 + +worker-embedding: + command: celery -A spelunker worker -Q embedding --concurrency=2 + +worker-batch: + command: celery -A spelunker worker -Q batch --concurrency=2 +``` + +This prevents a burst of embedding tasks from starving time-sensitive API validation, and lets you scale each queue independently. + +### Database Connection Management + +Celery workers are long-lived processes. Django DB connections can become stale between tasks. Set `CONN_MAX_AGE` to `0` (the Django default) so connections are closed after each request cycle, or use a connection pooler like PgBouncer. Celery's `worker_pool_restarts` and Django's `close_old_connections()` (called automatically by Celery's Django fixup) handle cleanup between tasks. + +--- + +## Domain Extension Examples + +### `solution_library` App + +Three task types: single-document embed, batch embed, and documentation-source sync. 
The single-document task is also triggered by a `post_save` signal for automatic processing on upload. + +```python +# Auto-embed on create (signal) +embed_document_task.delay(document_id=instance.id, ...) + +# Manual batch from admin action +embed_documents_batch_task.delay(document_ids=[1, 2, 3], ...) + +# Source sync from view (with progress callback) +sync_documentation_source_task.delay(source_id=..., user_id=..., job_id=...) +``` + +### `rfp_manager` App + +Two-stage pipeline: responder answers first, reviewer answers second. Each stage is a separate Celery batch job. Both check for an existing active job before dispatching to prevent duplicate runs. + +```python +# Guard against duplicate jobs before dispatch +if RFPBatchJob.objects.filter( + rfp=rfp, + job_type=RFPBatchJob.JOB_TYPE_RESPONDER, + status__in=[RFPBatchJob.STATUS_PENDING, RFPBatchJob.STATUS_PROCESSING] +).exists(): + # surface error to user + ... + +# Stage 1 +batch_generate_responder_answers.delay(rfp.pk, user.pk, job.pk) + +# Stage 2 (after Stage 1 is complete) +batch_generate_reviewer_answers.delay(rfp.pk, user.pk, job.pk) +``` + +### `llm_manager` App + +Stateless periodic task — no DB job record needed because results are written directly to the `LLMApi` and `LLMModel` objects. + +```python +# Triggered by Celery Beat; schedule managed via django-celery-beat admin +validate_all_llm_apis.delay() + +# Triggered from admin action for a single API +validate_single_api.delay(api_id=api.pk) +``` + +--- + +## Anti-Patterns + +- ❌ Don't use `rpc://` result backend for tasks where the caller never retrieves the result — the result accumulates in memory. Spelunker mitigates this by storing state in DB job records rather than reading Celery results. Always set `CELERY_RESULT_EXPIRES`. +- ❌ Don't pass full model instances as task arguments — pass PKs only. Celery serialises arguments as JSON; ORM objects are not JSON serialisable. 
+- ❌ Don't treat the `celery_task_id` saved by the dispatch call as authoritative. The dispatch `AsyncResult.id` and the in-task `self.request.id` are the same value; re-save it from **inside** the task using `bind=True` as the authoritative source. +- ❌ Don't silence exceptions with bare `except: pass` — always log errors and reflect failure status onto the DB record. +- ❌ Don't skip the duplicate-job guard when the task is triggered from a view or admin action. Without it, double-clicking a submit button can queue two identical jobs. +- ❌ Don't use `CELERY_TASK_SERIALIZER = 'pickle'` — JSON only, to prevent arbitrary code execution via crafted task payloads. +- ❌ Don't hardcode periodic task schedules in code via `app.conf.beat_schedule` — use `django_celery_beat` and manage schedules in Django admin so they survive deployments. +- ❌ Don't call `.delay()` inside a database transaction — use `transaction.on_commit()`. The worker may receive the message before the row is committed, causing `DoesNotExist`. +- ❌ Don't write non-idempotent tasks — workers may crash and brokers may redeliver. A re-executed task must produce the same result (or safely no-op). +- ❌ Don't omit time limits — a hung external API call (LLM, S3) will block a worker slot forever. Always set `soft_time_limit` and `time_limit`. +- ❌ Don't retry business-logic errors with `autoretry_for` — only retry **transient** failures (network errors, timeouts). A `ValueError` or `DoesNotExist` will never succeed on retry. + +--- + +## Migration / Adoption + +When adding a new Celery task to an existing app: + +1. Create `<app_name>/tasks.py` using `@shared_task`, not `@app.task`. +2. Name the task `'<app_name>.<task_name>'`. +3. If the task is long-running, create a DB job model with the recommended fields above. +4. Register the app in `INSTALLED_APPS` (required for `autodiscover_tasks`). +5. 
For periodic tasks, add a schedule record via Django admin → Periodic Tasks (django-celery-beat) rather than in code. +6. Add a test that confirms the task can be called synchronously with `CELERY_TASK_ALWAYS_EAGER = True`. + +--- + +## Settings + +```python +# settings.py + +# Required — broker and result backend +CELERY_BROKER_URL = env('CELERY_BROKER_URL') # amqp://user:pw@host:5672/vhost +CELERY_RESULT_BACKEND = env('CELERY_RESULT_BACKEND') # rpc:// + +# Serialization (do not change) +CELERY_ACCEPT_CONTENT = ['json'] +CELERY_TASK_SERIALIZER = 'json' +CELERY_RESULT_SERIALIZER = 'json' +CELERY_TIMEZONE = env('TIME_ZONE') # must match Django TIME_ZONE + +# Result expiry — prevents unbounded memory growth with rpc:// backend +CELERY_RESULT_EXPIRES = 3600 # seconds (1 hour) + +# Time limits — global defaults, overridable per-task +CELERY_TASK_SOFT_TIME_LIMIT = 1800 # SoftTimeLimitExceeded after 30 min +CELERY_TASK_TIME_LIMIT = 2100 # hard SIGKILL after 35 min + +# Reliability — late ack + single prefetch for at-least-once delivery +CELERY_TASK_ACKS_LATE = True +CELERY_WORKER_PREFETCH_MULTIPLIER = 1 + +# Logging +CELERY_LOGGING_LEVEL = env('CELERY_LOGGING_LEVEL', default='INFO') # separate from app/Django level + +# Optional — disable for production +# AUTO_EMBED_DOCUMENTS = True # set False to suppress signal-triggered embedding + +# Optional — task routing (see Infrastructure Configuration for queue examples) +# CELERY_TASK_ROUTES = { ... 
} +``` + +--- + +## Testing + +```python +from django.test import TestCase, override_settings + + +@override_settings(CELERY_TASK_ALWAYS_EAGER=True, CELERY_TASK_EAGER_PROPAGATES=True) +class EmbedDocumentTaskTest(TestCase): + def test_happy_path(self): + """Task embeds a document and returns success.""" + # arrange: create Document, LLMModel fixtures + result = embed_document_task(document_id=doc.id) + self.assertTrue(result['success']) + self.assertGreater(result['chunks_created'], 0) + doc.refresh_from_db() + self.assertEqual(doc.review_status, 'pending') + + def test_document_not_found(self): + """Task returns success=False for a missing document ID.""" + result = embed_document_task(document_id=999999) + self.assertFalse(result['success']) + self.assertIn('not found', result['error']) + + def test_no_embedding_model(self): + """Task returns success=False when no embedding model is available.""" + # arrange: no LLMModel with is_system_default=True + result = embed_document_task(document_id=doc.id) + self.assertFalse(result['success']) + + +@override_settings(CELERY_TASK_ALWAYS_EAGER=True, CELERY_TASK_EAGER_PROPAGATES=True) +class BatchJobTest(TestCase): + def test_job_reaches_completed_status(self): + """Batch job transitions from pending → processing → completed.""" + job = RFPBatchJob.objects.create(...) 
+ batch_generate_responder_answers(rfp_id=rfp.pk, user_id=user.pk, job_id=job.pk) + job.refresh_from_db() + self.assertEqual(job.status, RFPBatchJob.STATUS_COMPLETED) + + def test_duplicate_job_guard(self): + """A second dispatch when a job is already active is rejected by the view.""" + # arrange: one active job + response = self.client.post(dispatch_url) + self.assertContains(response, 'already running', status_code=400) +``` diff --git a/docs/Pattern_Django-MCP_V1-00.md b/docs/Pattern_Django-MCP_V1-00.md new file mode 100644 index 0000000..460c3fc --- /dev/null +++ b/docs/Pattern_Django-MCP_V1-00.md @@ -0,0 +1,1045 @@ +# Django MCP Pattern v1.0.0 + +Standardizes embedding a FastMCP server inside a Django ASGI process with token-based authentication, modular tool registration, and dual transport (Streamable HTTP + SSE). Used by Angelia 2 (Wagtail CMS); applicable to any Django project. + +## 🐾 Red Panda Approval™ + +This pattern follows Red Panda Approval standards. + +--- + +## Why a Pattern, Not a Shared App + +Every Django project that exposes MCP tools has different domain models, different permission requirements, and different admin UX needs. 
A single reusable Django app cannot accommodate this variability: + +- A **CMS platform** needs page-tree tools, media upload tools, and editorial permissions scoped to Wagtail collections +- An **e-commerce project** needs product-catalog tools, order-status tools, and Stripe-scoped API tokens +- An **internal dashboard** needs reporting tools, data-export tools, and LDAP-group-scoped permissions +- A **DevOps platform** needs deployment tools, log-query tools, and service-account tokens + +The tools, models, and admin surfaces differ — but the **wiring** is always the same. + +Instead of a shared app, this pattern defines: + +- **Required components** — token model, auth middleware, context helpers, server instance, ASGI mount +- **Recommended behaviors** — audit trail, metrics, masked display, dev-mode bypass +- **Extension guidelines** — tool registration, resource registration, admin UI, CLI management +- **Standard constants** — token length, state keys, settings names, metric names + +--- + +## Required Components + +The non-negotiable minimum every implementation must provide. 
+ +### MCPToken Model + +```python +import secrets +from django.conf import settings +from django.db import models +from django.utils import timezone + +class MCPToken(models.Model): + user = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.CASCADE, + related_name='mcp_tokens', + ) + token = models.CharField(max_length=64, unique=True, db_index=True) + name = models.CharField(max_length=100) + is_active = models.BooleanField(default=True) + expires_at = models.DateTimeField(null=True, blank=True) + last_used_at = models.DateTimeField(null=True, blank=True) + allowed_tools = models.JSONField(default=list, blank=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + def save(self, **kwargs): + if not self.token: + self.token = secrets.token_urlsafe(48) # 64-char URL-safe string + super().save(**kwargs) + + @property + def is_valid(self): + if not self.is_active: + return False + if self.expires_at and self.expires_at < timezone.now(): + return False + return True + + def can_use_tool(self, tool_name: str) -> bool: + if not self.allowed_tools: + return True # Empty list = unrestricted + return tool_name in self.allowed_tools +``` + +**Required fields:** + +| Field | Type | Purpose | +|-------|------|---------| +| `user` | FK → User | Auth scope — all operations run as this user | +| `token` | CharField(64) | Auto-generated bearer token, unique + indexed | +| `name` | CharField(100) | Friendly label ("Claude Desktop", "CI Script") | +| `is_active` | BooleanField | Revocation flag | +| `expires_at` | DateTimeField | Optional expiry (null = never) | +| `last_used_at` | DateTimeField | Audit trail — updated on each request | +| `allowed_tools` | JSONField | Tool whitelist (empty list = all tools) | +| `created_at` | DateTimeField | Auto-set on creation | +| `updated_at` | DateTimeField | Auto-set on save | + +**Required methods:** + +| Method | Returns | Purpose | +|--------|---------|---------| 
+| `is_valid` | bool | Checks active + not expired | +| `can_use_tool(name)` | bool | Whitelist check (empty = permit all) | + +### Auth Resolution + +```python +from django.contrib.auth import get_user_model +from django.utils import timezone + +User = get_user_model() + +class MCPAuthError(Exception): + pass + +def resolve_mcp_user(token_string: str) -> tuple: + try: + token = ( + MCPToken.objects + .select_related('user') + .get(token=token_string) + ) + except MCPToken.DoesNotExist: + raise MCPAuthError("Invalid MCP token.") + + if not token.is_active: + raise MCPAuthError("Token has been deactivated.") + if token.expires_at and token.expires_at < timezone.now(): + raise MCPAuthError("Token has expired.") + if not token.user.is_active: + raise MCPAuthError("User account is disabled.") + + token.record_usage() + return token.user, token + +def check_tool_permission(token: MCPToken, tool_name: str) -> bool: + return token.can_use_tool(tool_name) +``` + +**Validation chain (order matters):** + +1. Token exists in database (`select_related('user')` for single query) +2. Token is active (`is_active=True`) +3. Token is not expired (`expires_at` is null or in the future) +4. Bound user is active (`user.is_active=True`) +5. 
Record usage for audit trail + +### Auth Middleware + +```python +from asgiref.sync import sync_to_async +from fastmcp.server.middleware import Middleware, MiddlewareContext +from fastmcp.server.dependencies import get_http_request + +STATE_KEY_USER = "mcp_user" +STATE_KEY_TOKEN = "mcp_token" + +class MCPAuthMiddleware(Middleware): + async def __call__(self, context: MiddlewareContext, call_next): + require_auth = getattr(settings, 'MCP_REQUIRE_AUTH', True) + + token_string = self._extract_token(context) + + user = None + token = None + + if token_string: + try: + user, token = await sync_to_async( + resolve_mcp_user, thread_sensitive=True + )(token_string) + except MCPAuthError as e: + if require_auth: + raise PermissionError(str(e)) + elif require_auth and context.method == "tools/call": + raise PermissionError( + "Authentication required. Provide a Bearer token." + ) + + # Tool-level permission check + if token and context.method == "tools/call": + tool_name = self._extract_tool_name(context) + if tool_name and not check_tool_permission(token, tool_name): + raise PermissionError( + f"Token does not have permission to call '{tool_name}'." 
+ ) + + # Store on request-scoped state + fastmcp_ctx = context.fastmcp_context + if fastmcp_ctx and user: + await fastmcp_ctx.set_state( + STATE_KEY_USER, user, serializable=False + ) + await fastmcp_ctx.set_state( + STATE_KEY_TOKEN, token, serializable=False + ) + + return await call_next(context) + + def _extract_token(self, context: MiddlewareContext) -> str | None: + try: + request = get_http_request() + auth_header = request.headers.get("Authorization", "") + if auth_header.startswith("Bearer "): + return auth_header[7:] + except RuntimeError: + pass # No HTTP request (e.g., stdio transport) + return None + + def _extract_tool_name(self, context: MiddlewareContext) -> str | None: + msg = context.message + if hasattr(msg, 'params') and hasattr(msg.params, 'name'): + return msg.params.name + return None +``` + +**Middleware responsibilities:** + +1. Extract Bearer token from HTTP `Authorization` header +2. Resolve token to Django User via `sync_to_async` (ORM is synchronous) +3. Check tool-level permissions on `tools/call` requests +4. Store user and token on FastMCP's request-scoped state (`serializable=False`) +5. Handle dev-mode bypass when `MCP_REQUIRE_AUTH=False` +6. 
Gracefully skip auth when no HTTP request exists (stdio transport) + +### Context Helpers + +```python +from fastmcp.server.context import Context + +async def get_mcp_user(ctx: Context): + return await ctx.get_state(STATE_KEY_USER) + +async def get_mcp_token(ctx: Context): + return await ctx.get_state(STATE_KEY_TOKEN) +``` + +Tools use these to access the authenticated user: + +```python +@mcp.tool +async def create_item(title: str, ctx: Context = None) -> dict: + user = await get_mcp_user(ctx) + # ORM operations run as this user +``` + +### FastMCP Server Instance + +```python +from fastmcp import FastMCP + +mcp = FastMCP( + "my-project", + instructions="System prompt describing your domain for LLMs.", + middleware=[MCPAuthMiddleware()], +) + +# Register tools by domain +register_product_tools(mcp) +register_order_tools(mcp) +``` + +**Requirements:** + +- Single global `FastMCP` instance created at module import time +- Auth middleware injected at server level (not per-tool) +- `instructions` string guides LLMs on domain concepts (page types, field meanings, workflows) +- Tool registration via modular functions, one per domain + +### ASGI Mount + +```python +import os +import django + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'myproject.settings') +django.setup() + +from contextlib import asynccontextmanager +from starlette.applications import Starlette +from starlette.routing import Mount, Route +from starlette.responses import JSONResponse + +from mcp_server.server import mcp + +mcp_http_app = mcp.http_app(path="/", transport="streamable-http") +mcp_sse_app = mcp.http_app(path="/", transport="sse") + +async def health(request): + return JSONResponse({"status": "ok"}) + +@asynccontextmanager +async def lifespan(app): + async with mcp_http_app.lifespan(app): + async with mcp_sse_app.lifespan(app): + yield + +app = Starlette( + routes=[ + Route("/mcp/health", health), + Mount("/mcp/sse", app=mcp_sse_app), # More specific path first + Mount("/mcp", 
app=mcp_http_app), + ], + lifespan=lifespan, +) +``` + +**Requirements:** + +- `django.setup()` before any Django imports (ORM, models) +- Health check endpoint at `/mcp/health` +- `/mcp/sse` route listed before `/mcp` (Starlette matches first hit) +- Lifespan combines both transport apps + +--- + +## Standard Constants + +Use these values for consistency across implementations. + +### Token Generation + +```python +# 64-character URL-safe token via stdlib +token = secrets.token_urlsafe(48) +``` + +### State Keys + +```python +STATE_KEY_USER = "mcp_user" +STATE_KEY_TOKEN = "mcp_token" +``` + +### Settings + +```python +# settings.py +MCP_REQUIRE_AUTH = env.bool('MCP_REQUIRE_AUTH', default=True) +``` + +### Metric Names + +```python +mcp_tool_invocations_total # Counter — labels: tool, status +mcp_tool_duration_seconds # Histogram — labels: tool +mcp_auth_failures_total # Counter — labels: reason +mcp_active_sessions # Gauge +``` + +### Tool Description Limit + +The MCP specification requires tool descriptions ≤ **1024 characters**. Use a stricter internal limit of **750 characters** to leave headroom for protocol overhead. Validate with: + +```python +import inspect +assert len(inspect.cleandoc(tool_fn.__doc__)) <= 750 +``` + +--- + +## Recommended Behaviors + +Most implementations should include these, but they are not strictly required. + +### Audit Trail + +```python +def record_usage(self): + self.last_used_at = timezone.now() + self.save(update_fields=['last_used_at']) +``` + +Called by `resolve_mcp_user()` on every authenticated request. Provides admin visibility into token activity. + +### Masked Token Display + +```python +def get_masked_token(self): + if len(self.token) > 8: + return f"{'*' * (len(self.token) - 8)}{self.token[-8:]}" + return "********" +``` + +Used in admin list views and logs. Never expose full tokens after initial creation. 
+ +### Prometheus Metrics + +```python +from prometheus_client import Counter, Histogram, Gauge + +mcp_tool_invocations_total = Counter( + 'mcp_tool_invocations_total', + 'Total MCP tool invocations', + ['tool', 'status'], +) + +mcp_tool_duration_seconds = Histogram( + 'mcp_tool_duration_seconds', + 'MCP tool execution duration in seconds', + ['tool'], + buckets=(0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0), +) + +mcp_auth_failures_total = Counter( + 'mcp_auth_failures_total', + 'Total MCP authentication failures', + ['reason'], +) + +def record_tool_call(tool_name: str, status: str, duration: float): + mcp_tool_invocations_total.labels(tool=tool_name, status=status).inc() + mcp_tool_duration_seconds.labels(tool=tool_name).observe(duration) +``` + +### sync_to_async for ORM + +All Django ORM calls inside async tool functions **must** be wrapped with `sync_to_async`: + +```python +from asgiref.sync import sync_to_async + +@mcp.tool +async def create_item(title: str, ctx: Context = None) -> dict: + user = await get_mcp_user(ctx) + + def _do_create(): + item = MyModel(title=title, created_by=user) + item.save() + return {"id": item.id, "title": item.title} + + return await sync_to_async(_do_create, thread_sensitive=True)() +``` + +The `thread_sensitive=True` parameter ensures ORM operations run in Django's main thread, avoiding database connection issues. + +### Dev Mode Auth Bypass + +When `MCP_REQUIRE_AUTH=False`, the middleware no longer rejects requests that have a missing or invalid token. A valid token is still resolved and its tool whitelist is still enforced; otherwise tools run without a user context (`get_mcp_user()` returns `None`). This is only for local development — never disable in production. + +--- + +## Tool Registration Pattern + +Tools are organized by domain in separate modules. Each module exports a `register_*_tools(mcp)` factory function that defines tools as closures, capturing the `mcp` instance. 
+ +### File Structure + +``` +mcp_server/ + tools/ + __init__.py + pages.py # register_page_tools(mcp) + media.py # register_media_tools(mcp) + blog.py # register_blog_tools(mcp) + orders.py # register_order_tools(mcp) +``` + +### Factory Function + +```python +# mcp_server/tools/products.py + +import time +from asgiref.sync import sync_to_async +from fastmcp.server.context import Context +from ..context import get_mcp_user +from ..metrics import record_tool_call + +def register_product_tools(mcp): + + @mcp.tool + def list_products(category: str | None = None, limit: int = 20) -> dict: + """List products with optional category filter. Returns id, name, + price, and category for each product.""" + start = time.time() + try: + qs = Product.objects.all() + if category: + qs = qs.filter(category__slug=category) + result = { + "products": [ + {"id": p.id, "name": p.name, "price": str(p.price)} + for p in qs[:limit] + ] + } + record_tool_call("list_products", "success", time.time() - start) + return result + except Exception: + record_tool_call("list_products", "error", time.time() - start) + raise + + @mcp.tool + async def create_product( + name: str, price: str, description: str, + ctx: Context = None, + ) -> dict: + """Create a new product. Price as decimal string (e.g. 
'29.99').""" + start = time.time() + try: + user = await get_mcp_user(ctx) if ctx else None + + def _do_create(): + product = Product( + name=name, + price=Decimal(price), + description=description, + created_by=user, + ) + product.save() + return {"id": product.id, "name": product.name} + + result = await sync_to_async(_do_create, thread_sensitive=True)() + record_tool_call("create_product", "success", time.time() - start) + return result + except Exception: + record_tool_call("create_product", "error", time.time() - start) + raise +``` + +### Sync vs Async Decision + +| Tool Type | Define As | ORM Access | Use When | +|-----------|-----------|------------|----------| +| Read-only queries | `def tool(...)` | Direct | Simple lookups, listing, search | +| Mutations | `async def tool(..., ctx)` | `sync_to_async` | Create, update, delete — needs user context | + +Read-only tools can be synchronous because FastMCP handles the async bridge. Mutation tools must be async to access the request-scoped user context via `await get_mcp_user(ctx)`. + +### Server Registration + +```python +# mcp_server/server.py + +mcp = FastMCP("my-project", instructions="...", middleware=[...]) + +register_product_tools(mcp) +register_order_tools(mcp) +register_inventory_tools(mcp) +``` + +All `register_*` calls happen at module import time. The tools are available immediately when the ASGI app starts. + +--- + +## ASGI Dual Transport Mount + +Two MCP transports share a single FastMCP instance, served by one Uvicorn process. + +### Streamable HTTP (Standard) + +POST-based JSON-RPC at `/mcp/`. Stateless — could support multiple workers (but single worker is simpler when SSE is also served). + +### SSE (Legacy) + +Server-Sent Events at `/mcp/sse/`. Stateful — session state lives in the worker's memory. **Requires single Uvicorn worker.** Supported for backward compatibility with older MCP clients. + +### Health Check + +`GET /mcp/health` returns `{"status": "ok"}`. 
Used by load balancers, Docker health checks, and monitoring. + +### Deployment + +```bash +# Separate from the Django WSGI server +uvicorn myproject.asgi:app --host 0.0.0.0 --port 8001 --workers 1 +``` + +The MCP server runs on a separate port from Django's WSGI server (Gunicorn). Nginx routes `/mcp/` to Uvicorn: + +```nginx +# Streamable HTTP + SSE +location /mcp/ { + proxy_pass http://mcp:8001; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_buffering off; + proxy_cache off; + proxy_read_timeout 300s; +} +``` + +> ⚠️ `proxy_buffering off` and `proxy_cache off` are required for SSE. Without them, Nginx buffers the event stream and clients see no output. + +--- + +## Admin UI & CLI Token Management + +Two approaches for creating and managing tokens. + +### Django Admin + +Register `MCPToken` with Django's admin. Show the full token **once** after creation — it cannot be retrieved later. + +```python +# admin.py +from django.contrib import admin +from .models import MCPToken + +@admin.register(MCPToken) +class MCPTokenAdmin(admin.ModelAdmin): + list_display = ['name', 'user', 'is_active', 'masked_token_display', + 'expires_at', 'last_used_at', 'created_at'] + list_filter = ['is_active'] + search_fields = ['name', 'user__email'] + readonly_fields = ['token', 'last_used_at', 'created_at', 'updated_at'] +``` + +For frameworks with richer admin UIs (e.g., Wagtail snippets), register as a snippet with grouped tool selection widgets and token-shown-once creation views. + +### Management Command + +For scripted or CI token creation: + +```python +# management/commands/create_mcp_token.py +from django.core.management.base import BaseCommand, CommandError + +class Command(BaseCommand): + help = "Create an MCP token for a user and print the full token." 
+ + def add_arguments(self, parser): + parser.add_argument("--user", required=True, help="User email.") + parser.add_argument("--name", required=True, help="Token name.") + parser.add_argument("--tools", default="", + help="Comma-separated allowed tools (empty = all).") + parser.add_argument("--expires-days", type=int, default=None, + help="Days until expiry (omit for no expiry).") + + def handle(self, *args, **options): + User = get_user_model() + try: + user = User.objects.get(email=options["user"]) + except User.DoesNotExist: + raise CommandError(f'User "{options["user"]}" not found.') + + if not user.is_active: + raise CommandError(f'User "{options["user"]}" is inactive.') + + allowed_tools = [] + if options["tools"]: + allowed_tools = [t.strip() for t in options["tools"].split(",") + if t.strip()] + + expires_at = None + if options["expires_days"] is not None: + if options["expires_days"] < 1: + raise CommandError("--expires-days must be at least 1.") + expires_at = timezone.now() + timedelta(days=options["expires_days"]) + + token = MCPToken.objects.create( + user=user, + name=options["name"], + allowed_tools=allowed_tools, + expires_at=expires_at, + ) + + self.stdout.write(self.style.SUCCESS("✔ MCP token created")) + self.stdout.write(f" Name: {token.name}") + self.stdout.write(f" User: {user.email}") + self.stdout.write(self.style.WARNING(" Token (shown once):")) + self.stdout.write(f" {token.token}") +``` + +**Usage:** + +```bash +# All tools, no expiry +python manage.py create_mcp_token --user admin@example.com --name "Claude Desktop" + +# Restricted tools +python manage.py create_mcp_token --user admin@example.com --name "Read Only" \ + --tools list_products,get_order_status + +# 30-day expiry +python manage.py create_mcp_token --user admin@example.com --name "Temp" \ + --expires-days 30 +``` + +--- + +## MCP Resources + +Read-only reference data registered with `@mcp.resource()`. 
Resources give LLMs context about your domain without requiring tool calls. + +```python +# mcp_server/tools/reference.py +from pathlib import Path + +def register_resources(mcp): + + @mcp.resource("myapp://api-schema") + def api_schema() -> str: + """OpenAPI schema for the public API.""" + return Path("static/openapi.yaml").read_text() + + @mcp.resource("myapp://style-guide") + def style_guide() -> str: + """Content style guide for consistent authoring.""" + return Path("static/style-guide.md").read_text() +``` + +**Good candidates for resources:** + +- Design tokens / CSS custom properties +- Template structure descriptions +- API schemas or field references +- Content style guides +- Image/media specifications + +--- + +## Domain Extension Examples + +### Wagtail CMS (Angelia) + +Angelia adds Wagtail-specific patterns on top of the core: + +- **Page tree tools** — `get_page_tree()`, `get_page_content()` navigate Wagtail's hierarchical page model +- **Type-specific CRUD** — `create_flex_page()`, `create_blog_post()`, `create_event()` with type-aware field handling +- **Media tools** — `search_images()` returns pre-generated rendition URLs; `upload_image()` uses async httpx +- **Wagtail snippet admin** — MCPToken registered as a Wagtail snippet with `GroupedToolWidget` for tool selection +- **Design resources** — CSS custom properties, base template structure, rendition spec reference +- **Permissions** — Token inherits user's Wagtail page permissions, collection permissions, group memberships +- **Audit trail** — All page revisions record `user=token.user` + +### Generic Django (Inventory API) + +A hypothetical inventory system using standard Django: + +```python +# mcp_server/tools/inventory.py + +def register_inventory_tools(mcp): + + @mcp.tool + def search_products(query: str, in_stock: bool = True, limit: int = 20) -> dict: + """Search products by name or SKU. 
Filter by stock availability.""" + qs = Product.objects.filter(name__icontains=query) + if in_stock: + qs = qs.filter(stock_quantity__gt=0) + return {"products": [ + {"id": p.id, "name": p.name, "sku": p.sku, "stock": p.stock_quantity} + for p in qs[:limit] + ]} + + @mcp.tool + async def adjust_stock( + product_id: int, quantity_change: int, reason: str, + ctx: Context = None, + ) -> dict: + """Adjust stock for a product. Positive = restock, negative = deduct.""" + user = await get_mcp_user(ctx) + + def _do_adjust(): + product = Product.objects.get(id=product_id) + product.stock_quantity += quantity_change + product.save(update_fields=['stock_quantity']) + StockAdjustment.objects.create( + product=product, quantity=quantity_change, + reason=reason, user=user, + ) + return {"id": product.id, "new_stock": product.stock_quantity} + + return await sync_to_async(_do_adjust, thread_sensitive=True)() +``` + +- **Django admin** — Standard `MCPTokenAdmin` with `list_display` and `list_filter` +- **Permissions** — Django model permissions (`can_change_product`, `can_view_order`) +- **Resources** — Product category taxonomy, warehouse location reference + +--- + +## Anti-Patterns + +- ❌ Don't run multiple Uvicorn workers with SSE transport — sessions live in the worker's memory and POSTs hit random workers, causing `404 Not Found` on `/mcp/messages/` +- ❌ Don't store tokens hashed — the middleware needs plaintext lookup via `MCPToken.objects.get(token=token_string)`. 
Use `unique=True` + `db_index=True` instead +- ❌ Don't skip `sync_to_async` for ORM calls in async tools — Django raises `SynchronousOnlyOperation` when ORM methods are called from an async context +- ❌ Don't put auth logic inside individual tools — use middleware so auth is enforced uniformly before any tool executes +- ❌ Don't exceed 750-character tool descriptions — the MCP spec allows 1024, but leaving headroom avoids protocol overhead issues +- ❌ Don't inline metrics recording in every tool — extract a shared `record_tool_call(name, status, duration)` helper to keep tools focused on business logic +- ❌ Don't serialize user/token to state — use `serializable=False` to prevent Django model instances from leaking into logs or JSON responses +- ❌ Don't show full tokens after initial creation — display the masked version (`get_masked_token()`) in admin list views and logs + +--- + +## Settings + +```python +# settings.py + +# Require Bearer token authentication for MCP requests. +# Set to False only for local development. +MCP_REQUIRE_AUTH = env.bool('MCP_REQUIRE_AUTH', default=True) +``` + +Additional deployment settings (not MCP-specific, but required for the pattern): + +| Setting | Example | Purpose | +|---------|---------|---------| +| `ASGI_APPLICATION` | `'myproject.asgi.app'` | Uvicorn entrypoint | +| Uvicorn port | `8001` | Separate from WSGI server | +| Uvicorn workers | `1` | Required for SSE transport | +| Nginx proxy target | `http://mcp:8001` | Route `/mcp/` to Uvicorn | + +--- + +## Testing + +Standard test cases every implementation should cover. 
+ +### Token Model Tests + +```python +class MCPTokenModelTest(TestCase): + def setUp(self): + self.user = User.objects.create_user( + email="test@example.com", password="pass123" + ) + + def test_token_auto_generated(self): + """Token is auto-generated on creation.""" + token = MCPToken.objects.create(user=self.user, name="Test") + self.assertIsNotNone(token.token) + self.assertTrue(len(token.token) > 20) + + def test_active_token_is_valid(self): + """Active non-expired token is valid.""" + token = MCPToken.objects.create(user=self.user, name="Valid") + self.assertTrue(token.is_valid) + + def test_inactive_token_not_valid(self): + """Deactivated token is not valid.""" + token = MCPToken.objects.create( + user=self.user, name="Off", is_active=False + ) + self.assertFalse(token.is_valid) + + def test_expired_token_not_valid(self): + """Expired token is not valid.""" + token = MCPToken.objects.create( + user=self.user, name="Old", + expires_at=timezone.now() - timedelta(hours=1), + ) + self.assertFalse(token.is_valid) + + def test_tool_restriction(self): + """Restricted token only permits listed tools.""" + token = MCPToken.objects.create( + user=self.user, name="Limited", + allowed_tools=["list_products"], + ) + self.assertTrue(token.can_use_tool("list_products")) + self.assertFalse(token.can_use_tool("delete_product")) + + def test_unrestricted_permits_all(self): + """Empty allowed_tools permits any tool.""" + token = MCPToken.objects.create(user=self.user, name="Open") + self.assertTrue(token.can_use_tool("anything")) + + def test_record_usage(self): + """record_usage updates last_used_at.""" + token = MCPToken.objects.create(user=self.user, name="Usage") + self.assertIsNone(token.last_used_at) + token.record_usage() + token.refresh_from_db() + self.assertIsNotNone(token.last_used_at) + + def test_masked_token(self): + """Masked token hides most characters.""" + token = MCPToken.objects.create(user=self.user, name="Mask") + masked = token.get_masked_token() + 
self.assertTrue(masked.endswith(token.token[-8:])) + self.assertIn("*", masked) +``` + +### Auth Resolution Tests + +```python +class MCPAuthTest(TestCase): + def setUp(self): + self.user = User.objects.create_user( + email="auth@example.com", password="pass123" + ) + self.token = MCPToken.objects.create( + user=self.user, name="Auth Test" + ) + + def test_resolve_valid_token(self): + """Valid token resolves to user and token.""" + user, token = resolve_mcp_user(self.token.token) + self.assertEqual(user.email, "auth@example.com") + + def test_invalid_token_raises(self): + with self.assertRaises(MCPAuthError): + resolve_mcp_user("invalid-token-string") + + def test_inactive_token_raises(self): + self.token.is_active = False + self.token.save() + with self.assertRaises(MCPAuthError): + resolve_mcp_user(self.token.token) + + def test_expired_token_raises(self): + self.token.expires_at = timezone.now() - timedelta(hours=1) + self.token.save() + with self.assertRaises(MCPAuthError): + resolve_mcp_user(self.token.token) + + def test_disabled_user_raises(self): + self.user.is_active = False + self.user.save() + with self.assertRaises(MCPAuthError): + resolve_mcp_user(self.token.token) +``` + +### Server Registration Tests + +```python +class MCPServerRegistrationTest(TestCase): + def test_expected_tools_registered(self): + """All expected tools are registered on the server.""" + from .server import mcp + tools = asyncio.run(mcp.list_tools()) + tool_names = {t.name for t in tools} + for expected in ["list_products", "create_product", "adjust_stock"]: + self.assertIn(expected, tool_names) + + def test_resources_registered(self): + """All expected resources are registered.""" + from .server import mcp + resources = asyncio.run(mcp.list_resources()) + uris = {str(r.uri) for r in resources} + self.assertIn("myapp://api-schema", uris) +``` + +### Management Command Tests + +```python +class CreateMCPTokenCommandTest(TestCase): + def setUp(self): + self.user = 
User.objects.create_user( + email="cmd@example.com", password="pass123" + ) + + def test_create_basic_token(self): + out = StringIO() + call_command( + "create_mcp_token", + user="cmd@example.com", + name="CLI Test", + stdout=out, + ) + self.assertEqual(MCPToken.objects.count(), 1) + self.assertIn("CLI Test", out.getvalue()) + + def test_invalid_user_raises(self): + with self.assertRaises(CommandError): + call_command( + "create_mcp_token", + user="nobody@example.com", + name="Fail", + ) +``` + +--- + +## Deployment + +### Dual-Worker Architecture + +| Process | Server | Port | Protocol | Purpose | +|---------|--------|------|----------|---------| +| Web | Gunicorn | 8080 | WSGI | Django views, admin, static | +| MCP | Uvicorn | 8001 | ASGI | MCP tools (Streamable HTTP + SSE) | + +Both processes share the same Django codebase and database. Nginx routes traffic: + +- `/mcp/*` → Uvicorn (port 8001) +- Everything else → Gunicorn (port 8080) + +### Docker Compose + +```yaml +services: + web: + build: . + command: gunicorn --bind :8080 --workers 3 myproject.wsgi + ports: + - "8080:8080" + + mcp: + build: . + command: uvicorn myproject.asgi:app --host 0.0.0.0 --port 8001 --workers 1 + ports: + - "8001:8001" + + nginx: + image: nginx:alpine + volumes: + - ./nginx.conf:/etc/nginx/nginx.conf:ro + ports: + - "443:443" + depends_on: + - web + - mcp +``` + +### Entrypoint Pattern + +Wait for dependencies before starting the server: + +```bash +#!/bin/bash +set -e + +# Wait for database +until PGPASSWORD=$DB_PASSWORD psql -h $DB_HOST -U $DB_USER -d $DB_NAME -c '\q'; do + echo "Waiting for database..." 
+ sleep 2 +done + +# Run migrations +python manage.py migrate --noinput + +# Collect static files +python manage.py collectstatic --noinput + +# Start server (command passed via Docker CMD) +exec "$@" +``` + +### Client Configuration + +```json +{ + "mcpServers": { + "my-project": { + "url": "https://my-site.com/mcp/", + "headers": { + "Authorization": "Bearer <your-mcp-token>" + } + } + } +} +``` + +For SSE transport, change URL to `/mcp/sse/`. diff --git a/docs/Pattern_Notification_V1-00.md b/docs/Pattern_Notification_V1-00.md new file mode 100644 index 0000000..5555b4d --- /dev/null +++ b/docs/Pattern_Notification_V1-00.md @@ -0,0 +1,401 @@ +# Notification Trigger Pattern v1.0.0 + +Standard pattern for triggering notifications from domain-specific events in Django applications that use Themis for notification infrastructure. + +## 🐾 Red Panda Approval™ + +This pattern follows Red Panda Approval standards. + +--- + +## Overview + +Themis provides the notification *mailbox* — the model, UI (bell + dropdown + list page), polling, browser notifications, user preferences, and cleanup. What Themis does **not** provide is the *trigger logic* — the rules that decide when a notification should be created. + +Trigger logic is inherently domain-specific: + +- A task tracker sends "Task overdue" notifications +- A calendar sends "Event starting in 15 minutes" reminders +- A finance app sends "Invoice payment received" alerts +- A monitoring system sends "Server CPU above 90%" warnings + +This pattern documents how consuming apps should create notifications using Themis infrastructure. 
+ +--- + +## The Standard Interface + +All notification creation goes through one function: + +```python +from themis.notifications import notify_user + +notify_user( + user=user, # Django User instance + title="Task overdue", # Short headline (max 200 chars) + message="Task 'Deploy v2' was due yesterday.", # Optional body + level="warning", # info | success | warning | danger + url="/tasks/42/", # Optional: where to navigate on click + source_app="tasks", # Your app label (for tracking/cleanup) + source_model="Task", # Model that triggered this + source_id="42", # PK of the source object (as string) + deduplicate=True, # Skip if unread duplicate exists + expires_at=None, # Optional: auto-expire datetime +) +``` + +**Never create `UserNotification` objects directly.** The `notify_user()` function handles: + +- Checking if the user has notifications enabled +- Filtering by the user's minimum notification level +- Deduplication (when `deduplicate=True`) +- Returning `None` when skipped (so callers can check) + +--- + +## Trigger Patterns + +### 1. Signal-Based Triggers + +The most common pattern — listen to Django signals and create notifications: + +```python +# myapp/signals.py +from django.db.models.signals import post_save +from django.dispatch import receiver + +from themis.notifications import notify_user + +from .models import Task + + +@receiver(post_save, sender=Task) +def notify_task_assigned(sender, instance, created, **kwargs): + """Notify user when a task is assigned to them.""" + if not created and instance.assignee and instance.tracker.has_changed("assignee"): + notify_user( + user=instance.assignee, + title=f"Task assigned: {instance.title}", + message=f"You've been assigned to '{instance.title}'", + level="info", + url=instance.get_absolute_url(), + source_app="tasks", + source_model="Task", + source_id=str(instance.pk), + deduplicate=True, + ) +``` + +### 2. 
View-Based Triggers + +Create notifications during request processing: + +```python +# myapp/views.py +from themis.notifications import notify_user + + +@login_required +def approve_request(request, pk): + req = get_object_or_404(Request, pk=pk) + req.status = "approved" + req.save() + + # Notify the requester + notify_user( + user=req.requester, + title="Request approved", + message=f"Your request '{req.title}' has been approved.", + level="success", + url=req.get_absolute_url(), + source_app="requests", + source_model="Request", + source_id=str(req.pk), + ) + + messages.success(request, "Request approved.") + return redirect("request-list") +``` + +### 3. Management Command Triggers + +For scheduled checks (e.g., daily overdue detection): + +```python +# myapp/management/commands/check_overdue.py +from django.core.management.base import BaseCommand +from django.utils import timezone + +from themis.notifications import notify_user + +from myapp.models import Task + + +class Command(BaseCommand): + help = "Send notifications for overdue tasks" + + def handle(self, *args, **options): + overdue = Task.objects.filter( + due_date__lt=timezone.now().date(), + status__in=["open", "in_progress"], + ) + count = 0 + for task in overdue: + result = notify_user( + user=task.assignee, + title=f"Overdue: {task.title}", + message=f"Task was due {task.due_date}", + level="danger", + url=task.get_absolute_url(), + source_app="tasks", + source_model="Task", + source_id=str(task.pk), + deduplicate=True, # Don't send again if unread + ) + if result: + count += 1 + self.stdout.write(f"Sent {count} overdue notification(s)") +``` + +Schedule with cron or Kubernetes CronJob: + +```yaml +# Kubernetes CronJob +apiVersion: batch/v1 +kind: CronJob +metadata: + name: check-overdue-tasks +spec: + schedule: "0 8 * * *" # Daily at 8 AM + jobTemplate: + spec: + template: + spec: + containers: + - name: check-overdue + command: ["python", "manage.py", "check_overdue"] +``` + +### 4. 
Celery Task Triggers + +For apps with background workers: + +```python +# myapp/tasks.py +from celery import shared_task +from django.contrib.auth import get_user_model + +from themis.notifications import notify_user + +User = get_user_model() + + +@shared_task +def notify_report_ready(user_id, report_id): + """Notify user when their report has been generated.""" + from myapp.models import Report + + user = User.objects.get(pk=user_id) + report = Report.objects.get(pk=report_id) + + notify_user( + user=user, + title="Report ready", + message=f"Your {report.report_type} report is ready to download.", + level="success", + url=report.get_absolute_url(), + source_app="reports", + source_model="Report", + source_id=str(report.pk), + ) +``` + +--- + +## Notification Levels + +Choose the appropriate level for each notification type: + +| Level | Weight | Use For | +|---|---|---| +| `info` | 0 | Informational updates (assigned, comment added) | +| `success` | 0 | Positive outcomes (approved, completed, payment received) | +| `warning` | 1 | Needs attention (approaching deadline, low balance) | +| `danger` | 2 | Urgent/error (overdue, failed, system error) | + +Users can set a minimum notification level in their preferences: + +- **info** (default) — receive all notifications +- **warning** — only warnings and errors +- **danger** — only errors + +Note that `info` and `success` have the same weight (0), so setting minimum to "warning" filters out both. + +--- + +## Source Tracking + +The three source tracking fields enable two important features: + +### Deduplication + +When `deduplicate=True`, `notify_user()` checks for existing unread notifications with the same `source_app`, `source_model`, and `source_id`. This prevents notification spam when the same event is checked multiple times (e.g., a daily cron job for overdue tasks). 
+ +### Bulk Cleanup + +When a source object is deleted, clean up its notifications: + +```python +# In your model's delete signal or post_delete: +from themis.models import UserNotification + +@receiver(post_delete, sender=Task) +def cleanup_task_notifications(sender, instance, **kwargs): + UserNotification.objects.filter( + source_app="tasks", + source_model="Task", + source_id=str(instance.pk), + ).delete() +``` + +--- + +## Expiring Notifications + +For time-sensitive notifications, use `expires_at`: + +```python +from datetime import timedelta +from django.utils import timezone + +# Event reminder that expires when the event starts +notify_user( + user=attendee, + title=f"Starting soon: {event.title}", + level="info", + url=event.get_absolute_url(), + expires_at=event.start_time, + source_app="events", + source_model="Event", + source_id=str(event.pk), + deduplicate=True, +) +``` + +Expired notifications are automatically excluded from counts and lists. The `cleanup_notifications` management command deletes them permanently. + +--- + +## Multi-User Notifications + +For events that affect multiple users, call `notify_user()` in a loop: + +```python +def notify_team(team, title, message, **kwargs): + """Send a notification to all members of a team.""" + for member in team.members.all(): + notify_user(user=member, title=title, message=message, **kwargs) +``` + +For large recipient lists, consider using a Celery task to avoid blocking the request. 
+ +--- + +## Notification Cleanup + +Themis provides automatic cleanup via the management command: + +```bash +# Uses THEMIS_NOTIFICATION_MAX_AGE_DAYS (default: 90) +python manage.py cleanup_notifications + +# Override max age +python manage.py cleanup_notifications --max-age-days=60 +``` + +**What gets deleted:** + +- Read notifications older than the max age +- Dismissed notifications older than the max age +- Expired notifications (past their `expires_at`) + +**What is preserved:** + +- Unread notifications (regardless of age) + +Schedule this as a daily cron job or Kubernetes CronJob. + +--- + +## Settings + +Themis recognizes these settings for notification behavior: + +```python +# Polling interval for the notification bell (seconds, 0 = disabled) +THEMIS_NOTIFICATION_POLL_INTERVAL = 60 + +# Hard ceiling for notification cleanup (days) +THEMIS_NOTIFICATION_MAX_AGE_DAYS = 90 +``` + +Users control their own preferences in Settings: + +- **Enable notifications** — master on/off switch +- **Minimum level** — filter low-priority notifications +- **Browser desktop notifications** — opt-in for OS-level alerts +- **Retention days** — how long to keep read notifications + +--- + +## Anti-Patterns + +- ❌ Don't create `UserNotification` objects directly — use `notify_user()` +- ❌ Don't send notifications in tight loops without `deduplicate=True` +- ❌ Don't use notifications for real-time chat — use WebSocket channels +- ❌ Don't store sensitive data in notification messages (they're visible in admin) +- ❌ Don't rely on notifications as the sole delivery mechanism — they may be disabled by the user +- ❌ Don't forget `source_app`/`source_model`/`source_id` — they enable cleanup and dedup + +--- + +## Testing Notifications + +```python +from themis.notifications import notify_user +from themis.models import UserNotification + + +class MyAppNotificationTest(TestCase): + def test_task_overdue_notification(self): + """Overdue task creates a danger notification.""" + user = 
User.objects.create_user(username="test", password="pass") + task = Task.objects.create( + title="Deploy v2", + assignee=user, + due_date=date.today() - timedelta(days=1), + ) + + # Trigger your notification logic + check_overdue_tasks() + + # Verify notification was created + notif = UserNotification.objects.get( + user=user, + source_app="tasks", + source_model="Task", + source_id=str(task.pk), + ) + self.assertEqual(notif.level, "danger") + self.assertIn("Deploy v2", notif.title) + + def test_disabled_user_gets_no_notification(self): + """Users with notifications disabled get nothing.""" + user = User.objects.create_user(username="quiet", password="pass") + user.profile.notifications_enabled = False + user.profile.save() + + result = notify_user(user, "Should be skipped") + self.assertIsNone(result) + self.assertEqual(UserNotification.objects.count(), 0) +``` diff --git a/docs/Pattern_Organization_V1-00.md b/docs/Pattern_Organization_V1-00.md new file mode 100644 index 0000000..7393632 --- /dev/null +++ b/docs/Pattern_Organization_V1-00.md @@ -0,0 +1,275 @@ +# Organization Model Pattern v1.0.0 + +Standard pattern for Organization models across Django applications. Each app implements its own Organization model following this pattern to ensure interoperability and consistent field names. + +## 🐾 Red Panda Approval™ + +This pattern follows Red Panda Approval standards. + +--- + +## Why a Pattern, Not a Shared Model + +Organization requirements vary by domain. A financial app needs stock symbols and ISIN codes. A healthcare app needs provider IDs. An education app needs accreditation fields. Shipping a monolithic Organization model with 40+ fields forces every app to carry fields it does not need. 
+ +Instead, this pattern defines: + +- **Required fields** every Organization model must have +- **Recommended fields** most apps should include +- **Extension guidelines** for domain-specific needs +- **Standard choice values** for interoperability + +--- + +## Required Fields + +Every Organization model must include these fields: + +```python +import uuid +from django.conf import settings +from django.db import models + + +class Organization(models.Model): + # Primary key + id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) + + # Core identity + name = models.CharField(max_length=255, db_index=True, + help_text="Organization display name") + slug = models.SlugField(max_length=255, unique=True, + help_text="URL-friendly identifier") + + # Classification + type = models.CharField(max_length=20, choices=TYPE_CHOICES, + help_text="Organization type") + status = models.CharField(max_length=20, choices=STATUS_CHOICES, + default="active", help_text="Current status") + + # Location + country = models.CharField(max_length=2, help_text="ISO 3166-1 alpha-2 country code") + + # Audit + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.SET_NULL, + null=True, blank=True, related_name="created_organizations") + updated_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.SET_NULL, + null=True, blank=True, related_name="updated_organizations") + + class Meta: + verbose_name = "Organization" + verbose_name_plural = "Organizations" + + def __str__(self): + return self.name + + def get_absolute_url(self): + from django.urls import reverse + return reverse("organization-detail", kwargs={"slug": self.slug}) +``` + +--- + +## Standard Choice Values + +Use these exact values for interoperability between apps: + +### TYPE_CHOICES + +```python +TYPE_CHOICES = [ + ("for-profit", "For-Profit"), + ("non-profit", 
"Non-Profit"), + ("government", "Government"), + ("ngo", "NGO"), + ("educational", "Educational"), + ("healthcare", "Healthcare"), + ("cooperative", "Cooperative"), +] +``` + +### STATUS_CHOICES + +```python +STATUS_CHOICES = [ + ("active", "Active"), + ("inactive", "Inactive"), + ("pending", "Pending"), + ("suspended", "Suspended"), + ("dissolved", "Dissolved"), + ("merged", "Merged"), +] +``` + +### SIZE_CHOICES (recommended) + +```python +SIZE_CHOICES = [ + ("micro", "Micro (1-9)"), + ("small", "Small (10-49)"), + ("medium", "Medium (50-249)"), + ("large", "Large (250-999)"), + ("enterprise", "Enterprise (1000+)"), +] +``` + +### PARENT_RELATIONSHIP_CHOICES (if using hierarchy) + +```python +PARENT_RELATIONSHIP_CHOICES = [ + ("subsidiary", "Subsidiary"), + ("division", "Division"), + ("branch", "Branch"), + ("franchise", "Franchise"), + ("joint-venture", "Joint Venture"), + ("department", "Department"), +] +``` + +--- + +## Recommended Fields + +Most apps should include these fields: + +```python +# Extended identity +legal_name = models.CharField(max_length=255, blank=True, default="", + help_text="Full legal entity name") +abbreviated_name = models.CharField(max_length=50, blank=True, default="", + db_index=True, help_text="Short name/acronym") + +# Classification +size = models.CharField(max_length=20, choices=SIZE_CHOICES, blank=True, default="", + help_text="Organization size") + +# Contact +primary_email = models.EmailField(blank=True, default="", help_text="Primary contact email") +primary_phone = models.CharField(max_length=20, blank=True, default="", help_text="Primary phone") +website = models.URLField(blank=True, default="", help_text="Organization website") + +# Address +address_line1 = models.CharField(max_length=255, blank=True, default="") +address_line2 = models.CharField(max_length=255, blank=True, default="") +city = models.CharField(max_length=100, blank=True, default="") +state_province = models.CharField(max_length=100, blank=True, 
default="") +postal_code = models.CharField(max_length=20, blank=True, default="") + +# Content +overview = models.TextField(blank=True, default="", help_text="Organization description") + +# Metadata +is_active = models.BooleanField(default=True, help_text="Soft delete flag") +tags = models.JSONField(default=list, blank=True, help_text="Flexible tags") +``` + +--- + +## Hierarchy Pattern + +For apps that need parent-child organization relationships: + +```python +# Hierarchical relationships +parent_organization = models.ForeignKey( + "self", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="subsidiaries", + help_text="Parent organization", +) +parent_relationship_type = models.CharField( + max_length=20, + choices=PARENT_RELATIONSHIP_CHOICES, + blank=True, + default="", + help_text="Type of relationship with parent", +) +``` + +### Hierarchy Utility Functions + +```python +def get_ancestors(org): + """Walk up the parent chain. Returns list of Organization instances.""" + ancestors = [] + current = org.parent_organization + while current: + ancestors.append(current) + current = current.parent_organization + return ancestors + + +def get_descendants(org): + """Recursively collect all child organizations.""" + descendants = [] + for child in org.subsidiaries.all(): + descendants.append(child) + descendants.extend(get_descendants(child)) + return descendants +``` + +⚠️ **Warning:** Recursive queries can be expensive. For deep hierarchies, consider using `django-mptt` or `django-treebeard`, or store a materialized path. 
+ +--- + +## Domain Extension Examples + +In these examples, `BaseOrganization` is a placeholder for an app-local abstract model that declares the required (and any recommended) fields above; this pattern does not ship it. + +### Financial App + +```python +class Organization(BaseOrganization): + revenue = models.DecimalField(max_digits=15, decimal_places=2, null=True, blank=True) + revenue_year = models.PositiveIntegerField(null=True, blank=True) + employee_count = models.PositiveIntegerField(null=True, blank=True) + stock_symbol = models.CharField(max_length=10, blank=True, default="", db_index=True) + fiscal_year_end_month = models.PositiveSmallIntegerField(null=True, blank=True) +``` + +### Healthcare App + +```python +class Organization(BaseOrganization): + npi_number = models.CharField(max_length=10, blank=True, default="") + facility_type = models.CharField(max_length=30, choices=FACILITY_CHOICES) + bed_count = models.PositiveIntegerField(null=True, blank=True) + accreditation = models.JSONField(default=list, blank=True) +``` + +### Education App + +```python +class Organization(BaseOrganization): + institution_type = models.CharField(max_length=30, choices=INSTITUTION_CHOICES) + student_count = models.PositiveIntegerField(null=True, blank=True) + accreditation_body = models.CharField(max_length=100, blank=True, default="") +``` + +--- + +## Anti-Patterns + +- ❌ Don't use `null=True` on CharField/TextField — use `blank=True, default=""` +- ❌ Don't put all possible fields in a single model — extend per domain +- ❌ Don't use `Meta.ordering` on Organization — specify in queries +- ❌ Don't override `save()` for hierarchy calculation — use signals or service functions +- ❌ Don't expose sequential IDs in URLs — use slug or short UUID + +--- + +## Indexing Recommendations + +```python +class Meta: + indexes = [ + models.Index(fields=["name"], name="org_name_idx"), + models.Index(fields=["status"], name="org_status_idx"), + models.Index(fields=["type"], name="org_type_idx"), + models.Index(fields=["country"], name="org_country_idx"), + ] +``` + +Add domain-specific indexes as needed (e.g., `stock_symbol` for financial apps).
diff --git a/docs/Pattern_S3-Storage_V1-00.md b/docs/Pattern_S3-Storage_V1-00.md new file mode 100644 index 0000000..c05f1d8 --- /dev/null +++ b/docs/Pattern_S3-Storage_V1-00.md @@ -0,0 +1,434 @@ +# S3/MinIO File Storage Pattern v1.0.0 + +Standardizes how Django apps in Spelunker store, read, and reference files in S3/MinIO, covering upload paths, model metadata fields, storage-agnostic I/O, and test isolation. + +## 🐾 Red Panda Approval™ + +This pattern follows Red Panda Approval standards. + +--- + +## Why a Pattern, Not a Shared Implementation + +Each Django app stores files for a different domain purpose with different path conventions, processing workflows, and downstream consumers, making a single shared model impractical. + +- The **rfp_manager** app needs files scoped under an RFP ID (info docs, question spreadsheets, generated exports), with no embedding — only LLM summarization +- The **solution_library** app needs files tied to vendor/solution hierarchies, with full text embedding and chunk storage, plus scraped documents that have no Django `FileField` at all +- The **rag** app needs to programmatically write chunk texts to S3 during embedding and read them back for search context +- The **core** app needs a simple image upload for organization logos without any processing pipeline + +Instead, this pattern defines: + +- **Required fields** — the minimum every file-backed model must have +- **Recommended fields** — metadata most implementations should track +- **Standard path conventions** — bucket key prefixes each domain owns +- **Storage-agnostic I/O** — how to read and write files so tests work without a real S3 bucket + +--- + +## Required Fields + +Every model that stores a file in S3/MinIO must have at minimum: + +```python +from django.core.validators import FileExtensionValidator +from django.db import models + +def my_domain_upload_path(instance, filename): + """Return a scoped S3 key for this domain.""" + return 
f'my_domain/{instance.parent_id}/{filename}' + +class MyDocument(models.Model): + file = models.FileField( + upload_to=my_domain_upload_path, # or a string prefix + validators=[FileExtensionValidator(allowed_extensions=[...])], + ) + file_type = models.CharField(max_length=100, blank=True) # extension without dot + file_size = models.PositiveIntegerField(null=True, blank=True) # bytes +``` + +--- + +## Standard Path Conventions + +Use these exact key prefixes so buckets stay organized and IAM policies can target prefixes. + +| App / Purpose | S3 Key Prefix | +|--------------------------------|--------------------------------------------| +| Solution library documents | `documents/` | +| Scraped documentation sources | `scraped/{source_id}/{filename}` | +| Embedding chunk texts | `chunks/{document_id}/chunk_{index}.txt` | +| RFP information documents | `rfp_info_documents/{rfp_id}/{filename}` | +| RFP question spreadsheets | `rfp_question_documents/{rfp_id}/{filename}` | +| RFP generated exports | `rfp_exports/{rfp_id}/{filename}` | +| Organization logos | `orgs/logos/` | + +--- + +## Recommended Fields and Behaviors + +Most file-backed models should also include these and populate them automatically. + +```python +class MyDocument(models.Model): + # ... required fields above ... + + # Recommended: explicit S3 key for programmatic access and admin visibility + s3_key = models.CharField(max_length=500, blank=True) + + def save(self, *args, **kwargs): + """Auto-populate file metadata on every save.""" + if self.file: + self.s3_key = self.file.name + if hasattr(self.file, 'size'): + self.file_size = self.file.size + if self.file.name and '.' 
in self.file.name: + self.file_type = self.file.name.rsplit('.', 1)[-1].lower() + super().save(*args, **kwargs) +``` + +--- + +## Pattern Variant 1: FileField Upload (User-Initiated Upload) + +Used by `rfp_manager.RFPInformationDocument`, `rfp_manager.RFPQuestionDocument`, `rfp_manager.RFPExport`, `solution_library.Document`, and `core.Organization`. + +The user (or Celery task generating an export) provides a file. Django's `FileField` handles the upload to S3 automatically via the configured storage backend. + +```python +import os +from django.core.validators import FileExtensionValidator +from django.db import models + + +def rfp_info_document_path(instance, filename): + """Scope uploads under the parent RFP's ID to keep the bucket organized.""" + return f'rfp_info_documents/{instance.rfp.id}/{filename}' + + +class RFPInformationDocument(models.Model): + file = models.FileField( + upload_to=rfp_info_document_path, + validators=[FileExtensionValidator( + allowed_extensions=['pdf', 'doc', 'docx', 'txt', 'md'] + )], + ) + title = models.CharField(max_length=500) + file_type = models.CharField(max_length=100, blank=True) + file_size = models.PositiveIntegerField(null=True, blank=True) + + def save(self, *args, **kwargs): + if self.file: + if hasattr(self.file, 'size'): + self.file_size = self.file.size + if self.file.name: + self.file_type = os.path.splitext(self.file.name)[1].lstrip('.') + super().save(*args, **kwargs) +``` + +--- + +## Pattern Variant 2: Programmatic Write (Code-Generated Content) + +Used by `rag.services.embeddings` (chunk texts) and `solution_library.services.sync` (scraped documents). + +Content is generated or fetched in code and written directly to S3 using `default_storage.save()` with a `ContentFile`. The model records the resulting S3 key for later retrieval. 
+ +```python +from django.core.files.base import ContentFile +from django.core.files.storage import default_storage + + +def store_chunk(document_id: int, chunk_index: int, text: str) -> str: + """ + Store an embedding chunk in S3 and return the saved key. + + Returns: + The actual S3 key (may differ from requested if file_overwrite=False) + """ + s3_key = f'chunks/{document_id}/chunk_{chunk_index}.txt' + saved_key = default_storage.save(s3_key, ContentFile(text.encode('utf-8'))) + return saved_key + + +def store_scraped_document(source_id: int, filename: str, content: str) -> str: + """Store scraped document content in S3 and return the saved key.""" + s3_key = f'scraped/{source_id}/{filename}' + return default_storage.save(s3_key, ContentFile(content.encode('utf-8'))) +``` + +When creating the model record after a programmatic write, use `s3_key` rather than a `FileField`: + +```python +Document.objects.create( + title=filename, + s3_key=saved_key, + file_size=len(content), + file_type='md', + # Note: `file` field is intentionally empty — this is a scraped document +) +``` + +--- + +## Pattern Variant 3: Storage-Agnostic Read + +Used by `rfp_manager.services.excel_processor`, `rag.services.embeddings._read_document_content`, and `solution_library.models.DocumentEmbedding.get_chunk_text`. + +Always read via `default_storage.open()` so the same code works against S3 in production and `FileSystemStorage` in tests. Never construct a filesystem path from `settings.MEDIA_ROOT`. + +```python +from django.core.files.storage import default_storage +from io import BytesIO + + +def load_binary_from_storage(file_path: str) -> BytesIO: + """ + Read a binary file from storage into a BytesIO buffer. + Works against S3/MinIO in production and FileSystemStorage in tests. 
+ """ + with default_storage.open(file_path, 'rb') as f: + return BytesIO(f.read()) + + +def read_text_from_storage(s3_key: str) -> str: + """Read a text file from storage.""" + with default_storage.open(s3_key, 'r') as f: + return f.read() +``` + +When a model has both a `file` field (user upload) and a bare `s3_key` (scraped/programmatic), check which path applies: + +```python +def _read_document_content(self, document) -> str: + if document.s3_key and not document.file: + # Scraped document: no FileField, read by key + with default_storage.open(document.s3_key, 'r') as f: + return f.read() + # Uploaded document: use the FileField + with document.file.open('r') as f: + return f.read() +``` + +--- + +## Pattern Variant 4: S3 Connectivity Validation + +Used by `solution_library.models.Document.clean()` and `solution_library.services.sync.sync_documentation_source`. + +Validate that the bucket is reachable before attempting an upload or sync. This surfaces credential errors with a user-friendly message rather than a cryptic 500. + +```python +from botocore.exceptions import ClientError, NoCredentialsError +from django.core.exceptions import ValidationError +from django.core.files.storage import default_storage + + +def validate_s3_connectivity(): + """ + Raise ValidationError if S3/MinIO bucket is not accessible. + Only call on new uploads or at the start of a background sync. + """ + if not hasattr(default_storage, 'bucket'): + return # Not an S3 backend (e.g., tests), skip validation + + try: + default_storage.bucket.meta.client.head_bucket( + Bucket=default_storage.bucket_name + ) + except ClientError as e: + code = e.response.get('Error', {}).get('Code', '') + if code == '403': + raise ValidationError( + "S3/MinIO credentials are invalid or permissions are insufficient." + ) + elif code == '404': + raise ValidationError( + f"Bucket '{default_storage.bucket_name}' does not exist." 
+ ) + raise ValidationError(f"S3/MinIO error ({code}): {e}") + except NoCredentialsError: + raise ValidationError("S3/MinIO credentials are not configured.") +``` + +In a model's `clean()`, guard with `not self.pk` to avoid checking on every update: + +```python +def clean(self): + super().clean() + if self.file and not self.pk: # New uploads only + validate_s3_connectivity() +``` + +--- + +## Domain Extension Examples + +### rfp_manager App + +RFP documents are scoped under the RFP ID for isolation and easy cleanup. The app uses three document types (info, question, export), each with its own callable path function to keep the bucket navigation clear. + +```python +def rfp_export_path(instance, filename): + return f'rfp_exports/{instance.rfp.id}/{filename}' + +class RFPExport(models.Model): + export_file = models.FileField(upload_to=rfp_export_path) + version = models.CharField(max_length=50) + file_size = models.PositiveIntegerField(null=True, blank=True) + question_count = models.IntegerField() + answered_count = models.IntegerField() + # No s3_key field - export files are always accessed via FileField +``` + +### solution_library App + +Solution library documents track an explicit `s3_key` because the app supports two document origins: user uploads (with `FileField`) and scraped documents (programmatic write only, no `FileField`). For embedding, chunk texts are stored separately in S3 and referenced from `DocumentEmbedding` via `chunk_s3_key`. + +```python +class Document(models.Model): + file = models.FileField(upload_to='documents/', blank=True) # blank=True: scraped docs + s3_key = models.CharField(max_length=500, blank=True) # always populated + content_hash = models.CharField(max_length=64, blank=True, db_index=True) + +class DocumentEmbedding(models.Model): + document = models.ForeignKey(Document, on_delete=models.CASCADE, related_name='embeddings') + chunk_s3_key = models.CharField(max_length=500) # e.g. 
chunks/42/chunk_7.txt + chunk_index = models.IntegerField() + chunk_size = models.PositiveIntegerField() + embedding = VectorField(null=True, blank=True) # pgvector column + + def get_chunk_text(self) -> str: + from django.core.files.storage import default_storage + with default_storage.open(self.chunk_s3_key, 'r') as f: + return f.read() +``` + +--- + +## Anti-Patterns + +- ❌ Don't build filesystem paths with `os.path.join(settings.MEDIA_ROOT, ...)` — always read through `default_storage.open()` +- ❌ Don't store file content as a `TextField` or `BinaryField` in the database +- ❌ Don't use `default_acl='public-read'` — all Spelunker buckets use `private` ACL with `querystring_auth=True` (pre-signed URLs) +- ❌ Don't skip `FileExtensionValidator` on upload fields — it is the first line of defence against unexpected file types +- ❌ Don't call `document.file.storage.size()` or `.exists()` in hot paths — these make network round-trips; use the `s3_key` and metadata fields for display purposes +- ❌ Don't make S3 API calls in tests without first overriding `STORAGES` in `test_settings.py` +- ❌ Don't use `file_overwrite=True` — the global setting `file_overwrite=False` ensures Django auto-appends a unique suffix rather than silently overwriting existing objects + +--- + +## Settings + +```python +# spelunker/settings.py + +STORAGES = { + "default": { + "BACKEND": "storages.backends.s3boto3.S3Boto3Storage", + "OPTIONS": { + "access_key": env('S3_ACCESS_KEY'), + "secret_key": env('S3_SECRET_KEY'), + "bucket_name": env('S3_BUCKET_NAME'), + "endpoint_url": env('S3_ENDPOINT_URL'), # Use for MinIO or non-AWS S3 + "use_ssl": env('S3_USE_SSL'), + "default_acl": env('S3_DEFAULT_ACL'), # Must be 'private' + "region_name": env('S3_REGION_NAME'), + "file_overwrite": False, # Prevent silent overwrites + "querystring_auth": True, # Pre-signed URLs for all access + "verify": env.bool('S3_VERIFY_SSL', default=True), + } + }, + "staticfiles": { + # Static files are served locally (nginx), 
never from S3 + "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage", + }, +} +``` + +Environment variables (see `.env.example`): + +```bash +S3_ACCESS_KEY= +S3_SECRET_KEY= +S3_BUCKET_NAME=spelunker-documents +S3_ENDPOINT_URL=http://localhost:9000 # MinIO local dev +S3_USE_SSL=False +S3_VERIFY_SSL=False +S3_DEFAULT_ACL=private +S3_REGION_NAME=us-east-1 +``` + +Test override (disables all S3 calls): + +```python +# spelunker/test_settings.py + +STORAGES = { + "default": { + "BACKEND": "django.core.files.storage.FileSystemStorage", + "OPTIONS": {"location": "/tmp/test_media/"}, + }, + "staticfiles": { + "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage", + }, +} +``` + +--- + +## Testing + +Standard test cases every file-backed implementation should cover. + +```python +import os +import tempfile +from django.core.files.uploadedfile import SimpleUploadedFile +from django.test import TestCase, override_settings + + +@override_settings( + STORAGES={ + "default": { + "BACKEND": "django.core.files.storage.FileSystemStorage", + "OPTIONS": {"location": tempfile.mkdtemp()}, + }, + "staticfiles": { + "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage", + }, + } +) +class MyDocumentStorageTest(TestCase): + + def test_file_metadata_populated_on_save(self): + """file_type and file_size are auto-populated from the uploaded file.""" + uploaded = SimpleUploadedFile("report.pdf", b"%PDF-1.4 content", content_type="application/pdf") + doc = MyDocument.objects.create(file=uploaded, title="Test") + self.assertEqual(doc.file_type, "pdf") + self.assertGreater(doc.file_size, 0) + + def test_upload_path_includes_parent_id(self): + """upload_to callable scopes the key under the parent ID.""" + uploaded = SimpleUploadedFile("q.xlsx", b"PK content") + doc = MyDocument.objects.create(file=uploaded, title="Questions", rfp=self.rfp) + self.assertIn(str(self.rfp.id), doc.file.name) + + def test_rejected_extension(self): + """FileExtensionValidator 
rejects disallowed file types.""" + from django.core.exceptions import ValidationError + uploaded = SimpleUploadedFile("hack.exe", b"MZ") + doc = MyDocument(file=uploaded, title="Bad") + with self.assertRaises(ValidationError): + doc.full_clean() + + def test_storage_agnostic_read(self): + """Reading via default_storage.open() works against FileSystemStorage.""" + from django.core.files.base import ContentFile + from django.core.files.storage import default_storage + key = default_storage.save("test/hello.txt", ContentFile(b"hello world")) + with default_storage.open(key, 'r') as f: + content = f.read() + self.assertEqual(content, "hello world") + default_storage.delete(key) +``` diff --git a/docs/Pattern_SSO-Allauth-Casdoor_V1-00.md b/docs/Pattern_SSO-Allauth-Casdoor_V1-00.md new file mode 100644 index 0000000..c7d7e89 --- /dev/null +++ b/docs/Pattern_SSO-Allauth-Casdoor_V1-00.md @@ -0,0 +1,736 @@ +# SSO with Allauth & Casdoor Pattern v1.0.0 + +Standardizes OIDC-based Single Sign-On using Django Allauth and Casdoor, covering adapter customization, user provisioning, group mapping, superuser protection, and configurable local-login fallback. Used by the `core` Django application. + +## 🐾 Red Panda Approval™ + +This pattern follows Red Panda Approval standards. 
+ +--- + +## Why a Pattern, Not a Shared Implementation + +Every Django project that adopts SSO has different identity-provider configurations, claim schemas, permission models, and organizational structures: + +- A **project management** app needs role claims mapped to project-scoped permissions +- An **e-commerce** app needs tenant/store claims with purchase-limit groups +- An **RFP tool** (Spelunker) needs organization + group claims mapped to View Only / Staff / SME / Admin groups + +Instead, this pattern defines: + +- **Required components** — every implementation must have +- **Required settings** — Django & Allauth configuration values +- **Standard conventions** — group names, claim mappings, redirect URL format +- **Extension guidelines** — for domain-specific provisioning logic + +--- + +## Required Components + +Every SSO implementation following this pattern must provide these files: + +| Component | Location | Purpose | +|-----------|----------|---------| +| Social account adapter | `/adapters.py` | User provisioning, group mapping, superuser protection | +| Local account adapter | `/adapters.py` | Disable local signup, authentication logging | +| Management command | `/management/commands/create_sso_groups.py` | Idempotent group + permission creation | +| Login template | `templates/account/login.html` | SSO button + conditional local login form | +| Context processor | `/context_processors.py` | Expose `CASDOOR_ENABLED` / `ALLOW_LOCAL_LOGIN` to templates | +| SSL patch (optional) | `/ssl_patch.py` | Development-only SSL bypass | + +### Minimum settings.py configuration + +```python +# INSTALLED_APPS — required entries +INSTALLED_APPS = [ + # ... standard Django apps ... + 'allauth', + 'allauth.account', + 'allauth.socialaccount', + 'allauth.socialaccount.providers.openid_connect', + '', +] + +# MIDDLEWARE — Allauth middleware is required +MIDDLEWARE = [ + # ... standard Django middleware ... 
+ 'allauth.account.middleware.AccountMiddleware', +] + +# AUTHENTICATION_BACKENDS — both local and SSO +AUTHENTICATION_BACKENDS = [ + 'django.contrib.auth.backends.ModelBackend', + 'allauth.account.auth_backends.AuthenticationBackend', +] +``` + +--- + +## Standard Values / Conventions + +### Environment Variables + +Every deployment must set these environment variables (or `.env` entries): + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `CASDOOR_ENABLED` | Yes | — | Enable/disable SSO (`true`/`false`) | +| `CASDOOR_ORIGIN` | Yes | — | Casdoor backend URL for OIDC discovery | +| `CASDOOR_ORIGIN_FRONTEND` | Yes | — | Casdoor frontend URL (may differ behind reverse proxy) | +| `CASDOOR_CLIENT_ID` | Yes | — | OAuth client ID from Casdoor application | +| `CASDOOR_CLIENT_SECRET` | Yes | — | OAuth client secret from Casdoor application | +| `CASDOOR_ORG_NAME` | Yes | — | Default organization slug in Casdoor | +| `ALLOW_LOCAL_LOGIN` | No | `false` | Show local login form for non-superusers | +| `CASDOOR_SSL_VERIFY` | No | `true` | SSL verification (`true`, `false`, or CA-bundle path) | + +### Redirect URL Convention + +The Allauth OIDC callback URL follows a fixed format. Register this URL in Casdoor: + +``` +/accounts/oidc//login/callback/ +``` + +For Spelunker with `provider_id = casdoor`: + +``` +/accounts/oidc/casdoor/login/callback/ +``` + +> **Important:** The path segment is `oidc`, not `openid_connect`. 
+ +### Standard Group Mapping + +Casdoor group names map to Django groups with consistent naming: + +| Casdoor Group | Django Group | `is_staff` | Permissions | +|---------------|-------------|------------|-------------| +| `view_only` | `View Only` | `False` | `view_*` | +| `staff` | `Staff` | `True` | `view_*`, `add_*`, `change_*` | +| `sme` | `SME` | `True` | `view_*`, `add_*`, `change_*` | +| `admin` | `Admin` | `True` | `view_*`, `add_*`, `change_*`, `delete_*` | + +### Standard OIDC Claim Mapping + +| Casdoor Claim | Django Field | Notes | +|---------------|-------------|-------| +| `email` | `User.username`, `User.email` | Full email used as username | +| `given_name` | `User.first_name` | — | +| `family_name` | `User.last_name` | — | +| `name` | Parsed into first/last | Fallback when given/family absent | +| `organization` | Organization lookup/create | Via adapter | +| `groups` | Django Group membership | Via adapter mapping | + +--- + +## Recommended Settings + +Most implementations should include these Allauth settings: + +```python +# Authentication mode +ACCOUNT_LOGIN_METHODS = {'email'} +ACCOUNT_SIGNUP_FIELDS = ['email*', 'password1*', 'password2*'] +ACCOUNT_EMAIL_VERIFICATION = 'optional' +ACCOUNT_SESSION_REMEMBER = True +ACCOUNT_LOGIN_ON_PASSWORD_RESET = True +ACCOUNT_UNIQUE_EMAIL = True + +# Redirects +LOGIN_REDIRECT_URL = '/dashboard/' +ACCOUNT_LOGOUT_REDIRECT_URL = '/' +LOGIN_URL = '/accounts/login/' + +# Social account behavior +SOCIALACCOUNT_AUTO_SIGNUP = True +SOCIALACCOUNT_EMAIL_VERIFICATION = 'none' +SOCIALACCOUNT_QUERY_EMAIL = True +SOCIALACCOUNT_STORE_TOKENS = True +SOCIALACCOUNT_ADAPTER = '.adapters.CasdoorAccountAdapter' +ACCOUNT_ADAPTER = '.adapters.LocalAccountAdapter' + +# Session management +SESSION_COOKIE_AGE = 28800 # 8 hours +SESSION_SAVE_EVERY_REQUEST = True + +# Account linking — auto-connect SSO to an existing local account with +# the same verified email instead of raising a conflict error 
+SOCIALACCOUNT_EMAIL_AUTHENTICATION = True # required: AUTO_CONNECT is ignored unless email authentication is on (only enable when the IdP verifies emails) +SOCIALACCOUNT_EMAIL_AUTHENTICATION_AUTO_CONNECT = True +``` + +### Multi-Factor Authentication (Recommended) + +Add `allauth.mfa` for TOTP/WebAuthn second-factor support: + +```python +INSTALLED_APPS += ['allauth.mfa'] +MFA_ADAPTER = 'allauth.mfa.adapter.DefaultMFAAdapter' +``` + +MFA is enforced per-user inside Django; Casdoor may also enforce its own MFA upstream. + +### Rate Limiting on Local Login (Recommended) + +Protect the local login form from brute-force attacks with `django-axes` or similar: + +```python +# pip install django-axes +INSTALLED_APPS += ['axes'] +MIDDLEWARE += ['axes.middleware.AxesMiddleware'] # required by django-axes; keep it last +AUTHENTICATION_BACKENDS = [ + 'axes.backends.AxesStandaloneBackend', + 'django.contrib.auth.backends.ModelBackend', + 'allauth.account.auth_backends.AuthenticationBackend', +] +AXES_FAILURE_LIMIT = 5 # Lock after 5 failures +AXES_COOLOFF_TIME = 1 # 1-hour cooloff +AXES_LOCKOUT_PARAMETERS = ['ip_address', 'username'] +``` + +--- + +## Social Account Adapter + +The social account adapter is the core of the pattern. It handles user provisioning on SSO login, maps claims to Django fields, enforces superuser protection, and assigns groups. + +```python +from allauth.socialaccount.adapter import DefaultSocialAccountAdapter +from allauth.exceptions import ImmediateHttpResponse +from django.contrib.auth.models import User, Group +from django.contrib import messages +from django.shortcuts import redirect +import logging + +logger = logging.getLogger(__name__) + + +class CasdoorAccountAdapter(DefaultSocialAccountAdapter): + + def is_open_for_signup(self, request, sociallogin): + """Always allow SSO-initiated signup.""" + return True + + def pre_social_login(self, request, sociallogin): + """ + Runs on every SSO login (new and returning users). + + 1. Blocks superusers — they must use local auth. + 2. Re-syncs organization and group claims for returning users + so that IdP changes are reflected immediately. 
+ """ + if sociallogin.user.id: + user = sociallogin.user + + # --- Superuser gate --- + if user.is_superuser: + logger.warning( + f"SSO login blocked for superuser {user.username}. " + "Superusers must use local authentication." + ) + messages.error( + request, + "Superuser accounts must use local authentication." + ) + raise ImmediateHttpResponse(redirect('account_login')) + + # --- Re-sync claims for returning users --- + extra_data = sociallogin.account.extra_data + + org_identifier = extra_data.get('organization', '') + if org_identifier: + self._assign_organization(user, org_identifier) + + groups = extra_data.get('groups', []) + self._assign_groups(user, groups) + + user.is_staff = any( + g in ['staff', 'sme', 'admin'] for g in groups + ) + user.save(update_fields=['is_staff']) + + def populate_user(self, request, sociallogin, data): + """Map Casdoor claims to Django User fields.""" + user = super().populate_user(request, sociallogin, data) + + email = data.get('email', '') + user.username = email + user.email = email + + user.first_name = data.get('given_name', '') + user.last_name = data.get('family_name', '') + + # Fallback: parse full 'name' claim + if not user.first_name and not user.last_name: + full_name = data.get('name', '') + if full_name: + parts = full_name.split(' ', 1) + user.first_name = parts[0] + user.last_name = parts[1] if len(parts) > 1 else '' + + # Security: SSO users are never superusers + user.is_superuser = False + + # Set is_staff from group membership + groups = data.get('groups', []) + user.is_staff = any(g in ['staff', 'sme', 'admin'] for g in groups) + + return user + + def save_user(self, request, sociallogin, form=None): + """Save user and handle organization + group mapping.""" + user = super().save_user(request, sociallogin, form) + extra_data = sociallogin.account.extra_data + + org_identifier = extra_data.get('organization', '') + if org_identifier: + self._assign_organization(user, org_identifier) + + groups = 
extra_data.get('groups', []) + self._assign_groups(user, groups) + return user + + def _assign_organization(self, user, org_identifier): + """Assign (or create) organization from the OIDC claim.""" + # Domain-specific — see Extension Examples below + raise NotImplementedError("Override per project") + + def _assign_groups(self, user, group_names): + """Map Casdoor groups to Django groups.""" + group_mapping = { + 'view_only': 'View Only', + 'staff': 'Staff', + 'sme': 'SME', + 'admin': 'Admin', + } + user.groups.clear() + for casdoor_group in group_names: + django_group_name = group_mapping.get(casdoor_group.lower()) + if django_group_name: + group, _ = Group.objects.get_or_create(name=django_group_name) + user.groups.add(group) + logger.info(f"Added {user.username} to group {django_group_name}") +``` + +--- + +## Local Account Adapter + +Prevents local registration and logs authentication failures: + +```python +from allauth.account.adapter import DefaultAccountAdapter +import logging + +logger = logging.getLogger(__name__) + + +class LocalAccountAdapter(DefaultAccountAdapter): + + def is_open_for_signup(self, request): + """Disable local signup — all users come via SSO or admin.""" + return False + + def authentication_failed(self, request, **kwargs): + """Log failures for security monitoring.""" + logger.warning( + f"Local authentication failed from {request.META.get('REMOTE_ADDR')}" + ) + super().authentication_failed(request, **kwargs) +``` + +--- + +## OIDC Provider Configuration + +Register Casdoor as an OpenID Connect provider in `settings.py`: + +```python +SOCIALACCOUNT_PROVIDERS = { + 'openid_connect': { + 'APPS': [ + { + 'provider_id': 'casdoor', + 'name': 'Casdoor SSO', + 'client_id': CASDOOR_CLIENT_ID, + 'secret': CASDOOR_CLIENT_SECRET, + 'settings': { + 'server_url': f'{CASDOOR_ORIGIN}/.well-known/openid-configuration', + }, + } + ], + 'OAUTH_PKCE_ENABLED': True, + } +} +``` + +--- + +## Management Command — Group Creation + +An idempotent management 
command ensures groups and permissions exist: + +```python +from django.core.management.base import BaseCommand +from django.contrib.auth.models import Group, Permission + + +class Command(BaseCommand): + help = 'Create Django groups for Casdoor SSO integration' + + def handle(self, *args, **options): + groups_config = { + 'View Only': {'permissions': ['view']}, + 'Staff': {'permissions': ['view', 'add', 'change']}, + 'SME': {'permissions': ['view', 'add', 'change']}, + 'Admin': {'permissions': ['view', 'add', 'change', 'delete']}, + } + + # Add your domain-specific model names here + models_to_permission = [ + 'vendor', 'document', 'rfp', 'rfpquestion', + ] + + for group_name, config in groups_config.items(): + group, created = Group.objects.get_or_create(name=group_name) + status = 'Created' if created else 'Exists' + self.stdout.write(f'{status}: {group_name}') + + for perm_prefix in config['permissions']: + for model in models_to_permission: + try: + perm = Permission.objects.get( + codename=f'{perm_prefix}_{model}' + ) + group.permissions.add(perm) + except Permission.DoesNotExist: + pass + + self.stdout.write(self.style.SUCCESS('SSO groups created successfully')) +``` + +--- + +## Login Template + +The login template shows an SSO button when Casdoor is enabled and conditionally reveals the local login form: + +```html +{% load socialaccount %} + + +{% if CASDOOR_ENABLED %} +
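<form method="post" action="{% provider_login_url 'casdoor' %}"> + {% csrf_token %} + <button type="submit">Sign in with SSO</button> +</form>
+{% endif %} + +<!-- Local login form --> +{% if ALLOW_LOCAL_LOGIN or user.is_superuser %} +<form method="post" action="{% url 'account_login' %}">
+ {% csrf_token %} + {{ form.as_p }} + <button type="submit">Sign in</button> +</form>
+{% endif %} +``` + +> **Why POST?** Using a `<a href>` GET link to initiate the OAuth flow skips CSRF +> validation. Allauth's `{% provider_login_url %}` is designed for use inside a +> `<form method="post">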
` so the CSRF token is verified before the redirect. + +--- + +## Context Processor + +Exposes SSO settings to every template: + +```python +from django.conf import settings + + +def user_preferences(request): + context = {} + + # Always expose SSO flags for the login page + context['CASDOOR_ENABLED'] = getattr(settings, 'CASDOOR_ENABLED', False) + context['ALLOW_LOCAL_LOGIN'] = getattr(settings, 'ALLOW_LOCAL_LOGIN', False) + + return context +``` + +Register in `settings.py`: + +```python +TEMPLATES = [{ + 'OPTIONS': { + 'context_processors': [ + # ... standard processors ... + 'your_app.context_processors.user_preferences', # replace your_app with the app package + ], + }, +}] +``` + +--- + +## SSL Bypass (Development Only) + +For sandbox environments with self-signed certificates, an optional SSL patch disables verification at the `requests` library level: + +```python +import os, logging +logger = logging.getLogger(__name__) + + +def apply_ssl_bypass(): + ssl_verify = os.environ.get('CASDOOR_SSL_VERIFY', 'true').lower() + if ssl_verify != 'false': + return + + logger.warning("SSL verification DISABLED — sandbox only") + + import urllib3 + from requests.adapters import HTTPAdapter + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + _original_send = HTTPAdapter.send + + def _patched_send(self, request, stream=False, timeout=None, + verify=True, cert=None, proxies=None): + return _original_send(self, request, stream=stream, + timeout=timeout, verify=False, + cert=cert, proxies=proxies) + + HTTPAdapter.send = _patched_send + +apply_ssl_bypass() +``` + +Load it at the top of `settings.py` **before** any library imports that make HTTP calls: + +```python +import os + +_ssl_verify = os.environ.get('CASDOOR_SSL_VERIFY', 'true').lower() +if _ssl_verify == 'false': + import your_app.ssl_patch # noqa: F401 (replace your_app with the app package) +``` + +--- + +## Logout Flow + +By default, Django's `account_logout` destroys the local session but does **not** terminate the upstream Casdoor session. 
The user remains logged in at the IdP and will be silently re-authenticated on next visit. + +### Options + +| Strategy | Behaviour | Implementation | +|----------|-----------|----------------| +| **Local-only logout** (default) | Destroys Django session; IdP session survives | No extra work | +| **IdP redirect logout** | Redirects to Casdoor's `/api/logout` after local logout | Override `ACCOUNT_LOGOUT_REDIRECT_URL` to point at Casdoor | +| **OIDC back-channel logout** | Casdoor notifies Django to invalidate sessions | Requires Casdoor back-channel support + a Django webhook endpoint | + +### Recommended: IdP redirect logout + +```python +# settings.py +ACCOUNT_LOGOUT_REDIRECT_URL = ( + f'{CASDOOR_ORIGIN}/api/logout' + f'?post_logout_redirect_uri=https://your-app.example.com/' +) +``` + +This ensures the Casdoor session cookie is cleared before the user returns to your app. + +--- + +## Domain Extension Examples + +### Spelunker (RFP Tool) + +Spelunker's adapter creates organizations on first encounter and links them to user profiles: + +```python +def _assign_organization(self, user, org_identifier): + from django.db import models + from django.utils.text import slugify + from core.models import Organization + + try: + org = Organization.objects.filter( + models.Q(slug=org_identifier) | models.Q(name=org_identifier) + ).first() + + if not org: + org = Organization.objects.create( + name=org_identifier, + slug=slugify(org_identifier), + type='for-profit', + legal_country='CA', + status='active', + ) + logger.info(f"Created organization: {org.name}") + + if hasattr(user, 'profile'): + logger.info(f"Assigned {user.username} → {org.name}") + + except Exception as e: + logger.error(f"Organization assignment error: {e}") +``` + +### Multi-Tenant SaaS App + +A multi-tenant app might restrict users to a single tenant and enforce tenant isolation: + +```python +def _assign_organization(self, user, org_identifier): + from tenants.models import Tenant + + tenant = 
Tenant.objects.filter(external_id=org_identifier).first() + if not tenant: + raise ValueError(f"Unknown tenant: {org_identifier}") + + user.tenant = tenant + user.save(update_fields=['tenant']) +``` + +--- + +## Anti-Patterns + +- ❌ Don't allow SSO to grant `is_superuser` — always force `is_superuser = False` in `populate_user` +- ❌ Don't *log-and-continue* for superuser SSO attempts — raise `ImmediateHttpResponse` to actually block the login +- ❌ Don't disable local login for superusers — they need emergency access when SSO is unavailable +- ❌ Don't rely on SSO username claims — use email as the canonical identifier +- ❌ Don't hard-code the OIDC provider URL — always read from environment variables +- ❌ Don't skip the management command — groups and permissions must be idempotent and repeatable +- ❌ Don't use `CASDOOR_SSL_VERIFY=false` in production — only for sandbox environments with self-signed certificates +- ❌ Don't forget PKCE — always set `OAUTH_PKCE_ENABLED: True` for Authorization Code flow +- ❌ Don't sync groups only on first login — re-sync in `pre_social_login` so IdP changes take effect immediately +- ❌ Don't use a GET link (``) to start the OAuth flow — use a POST form so CSRF protection applies +- ❌ Don't assume Django logout kills the IdP session — configure an IdP redirect or back-channel logout +- ❌ Don't leave the local login endpoint unprotected — add rate limiting (e.g. 
`django-axes`) to prevent brute-force attacks + +--- + +## Settings + +All Django settings this pattern recognizes: + +```python +# settings.py + +# --- SSO Provider --- +CASDOOR_ENABLED = env.bool('CASDOOR_ENABLED') # Master SSO toggle +CASDOOR_ORIGIN = env('CASDOOR_ORIGIN') # OIDC discovery base URL +CASDOOR_ORIGIN_FRONTEND = env('CASDOOR_ORIGIN_FRONTEND') # Frontend URL (may differ) +CASDOOR_CLIENT_ID = env('CASDOOR_CLIENT_ID') # OAuth client ID +CASDOOR_CLIENT_SECRET = env('CASDOOR_CLIENT_SECRET') # OAuth client secret +CASDOOR_ORG_NAME = env('CASDOOR_ORG_NAME') # Default organization +CASDOOR_SSL_VERIFY = env('CASDOOR_SSL_VERIFY', default='true') # Optional: true | false | /path/to/ca.pem + +# --- Login Behavior --- +ALLOW_LOCAL_LOGIN = env.bool('ALLOW_LOCAL_LOGIN', default=False) # Show local form + +# --- Allauth --- (replace your_app with the app package) +SOCIALACCOUNT_ADAPTER = 'your_app.adapters.CasdoorAccountAdapter' +ACCOUNT_ADAPTER = 'your_app.adapters.LocalAccountAdapter' +``` + +--- + +## Testing + +Standard test cases every implementation should cover: + +```python +from django.test import TestCase, override_settings +from unittest.mock import MagicMock +from django.contrib.auth.models import User, Group +from .adapters import CasdoorAccountAdapter, LocalAccountAdapter + + +class CasdoorAdapterTest(TestCase): + + def setUp(self): + self.adapter = CasdoorAccountAdapter() + + def test_signup_always_open(self): + """SSO signup must always be permitted.""" + self.assertTrue(self.adapter.is_open_for_signup(MagicMock(), MagicMock())) + + def test_superuser_never_set_via_sso(self): + """populate_user must force is_superuser=False.""" + sociallogin = MagicMock() + data = {'email': 'admin@example.com', 'groups': ['admin']} + user = self.adapter.populate_user(MagicMock(), sociallogin, data) + self.assertFalse(user.is_superuser) + + def test_email_used_as_username(self): + """Username must be the full email address.""" + sociallogin = MagicMock() + data = {'email': 'jane@example.com'} + user = self.adapter.populate_user(MagicMock(), 
sociallogin, data) + self.assertEqual(user.username, 'jane@example.com') + + def test_staff_flag_from_groups(self): + """is_staff must be True when user belongs to staff/sme/admin.""" + sociallogin = MagicMock() + for group in ['staff', 'sme', 'admin']: + data = {'email': 'user@example.com', 'groups': [group]} + user = self.adapter.populate_user(MagicMock(), sociallogin, data) + self.assertTrue(user.is_staff, f"is_staff should be True for group '{group}'") + + def test_name_fallback_parsing(self): + """When given_name/family_name absent, parse 'name' claim.""" + sociallogin = MagicMock() + data = {'email': 'user@example.com', 'name': 'Jane Doe'} + user = self.adapter.populate_user(MagicMock(), sociallogin, data) + self.assertEqual(user.first_name, 'Jane') + self.assertEqual(user.last_name, 'Doe') + + def test_group_mapping(self): + """Casdoor groups must map to correctly named Django groups.""" + Group.objects.create(name='View Only') + Group.objects.create(name='Staff') + user = User.objects.create_user('test@example.com', 'test@example.com') + self.adapter._assign_groups(user, ['view_only', 'staff']) + group_names = set(user.groups.values_list('name', flat=True)) + self.assertEqual(group_names, {'View Only', 'Staff'}) + + def test_superuser_sso_login_blocked(self): + """pre_social_login must raise ImmediateHttpResponse for superusers.""" + from allauth.exceptions import ImmediateHttpResponse + user = User.objects.create_superuser( + 'admin@example.com', 'admin@example.com', 'pass' + ) + sociallogin = MagicMock() + sociallogin.user = user + sociallogin.user.id = user.id + with self.assertRaises(ImmediateHttpResponse): + self.adapter.pre_social_login(MagicMock(), sociallogin) + + def test_groups_resync_on_returning_login(self): + """pre_social_login must re-sync groups for existing users.""" + Group.objects.create(name='Admin') + Group.objects.create(name='Staff') + user = User.objects.create_user('user@example.com', 'user@example.com') + 
user.groups.add(Group.objects.get(name='Staff')) + + sociallogin = MagicMock() + sociallogin.user = user + sociallogin.user.id = user.id + sociallogin.account.extra_data = { + 'groups': ['admin'], + 'organization': '', + } + self.adapter.pre_social_login(MagicMock(), sociallogin) + group_names = set(user.groups.values_list('name', flat=True)) + self.assertEqual(group_names, {'Admin'}) + + +class LocalAdapterTest(TestCase): + + def test_local_signup_disabled(self): + """Local signup must always be disabled.""" + adapter = LocalAccountAdapter() + self.assertFalse(adapter.is_open_for_signup(MagicMock())) +``` diff --git a/docs/Red Panda Django.md b/docs/Red Panda Django.md new file mode 100644 index 0000000..165ce3f --- /dev/null +++ b/docs/Red Panda Django.md @@ -0,0 +1,96 @@ +## Red Panda Approval™ + +This project follows Red Panda Approval standards - our gold standard for Django application quality. Code must be elegant, reliable, and maintainable to earn the approval of our adorable red panda judges. + +### The 5 Sacred Django Criteria + +1. **Fresh Migration Test** - Clean migrations from empty database +2. **Elegant Simplicity** - No unnecessary complexity +3. **Observable & Debuggable** - Proper logging and error handling +4. **Consistent Patterns** - Follow Django conventions +5. **Actually Works** - Passes all checks and serves real user needs + +### Standards + +# Environment +Virtual environment: ~/env/PROJECT/bin/activate +Python version: 3.12 + +# Code Organization +Maximum file length: 1000 lines +CSS: External .css files only (no inline/embedded) +JS: External .js files only (no inline/embedded) + +# Required Packages +- Bootstrap 5.x (no custom CSS unless absolutely necessary) +- Bootstrap Icons (no emojis) +- django-crispy-forms + crispy-bootstrap5 +- django-allauth + +# Testing +Framework: Django TestCase (not pytest) +Minimum coverage: XX%? 
(optional) + +### Database Conventions + +# Development vs Production +- Development: SQLite +- Production: PostgreSQL +- Use dj-database-url for configuration + +# Model Naming +- Model names: singular PascalCase (User, BlogPost, OrderItem) +- Related names: plural snake_case with proper English pluralization + - user.blog_posts, order.items + - category.industries (not industrys) + - person.children (not childs) + - analysis.analyses (not analysiss) +- Through tables: describe relationship (ProjectMembership, CourseEnrollment) + +# Field Naming +- Foreign keys: singular without _id suffix (author, category, parent) +- Boolean fields: use prefixes (is_active, has_permission, can_edit) +- Date fields: use suffixes (created_at, updated_at, published_on) +- Avoid abbreviations (use description, not desc) + +# Required Model Fields +All models should include: +- created_at = models.DateTimeField(auto_now_add=True) +- updated_at = models.DateTimeField(auto_now=True) + +Consider adding: +- id = models.UUIDField(primary_key=True) for public-facing models +- is_active = models.BooleanField(default=True) for soft deletes + +# Indexing +- Add db_index=True to frequently queried fields +- Use Meta.indexes for composite indexes +- Document why each index exists + +# Migrations +- Never edit migrations that have been deployed +- Use meaningful migration names: --name add_email_to_profile +- One logical change per migration when possible +- Test migrations both forward and backward + +# Queries +- Use select_related() for foreign keys +- Use prefetch_related() for reverse relations and M2M +- Avoid queries in loops (N+1 problem) +- Use .only() and .defer() for large models +- Add comments explaining complex querysets + +## Monitoring & Health Check Endpoints +Follow standard Kubernetes health check endpoints for container orchestration: + +### /ready/ - Readiness probe checks if the application is ready to serve traffic +Validates database connectivity +Validates cache 
connectivity +Returns 200 if ready, 503 if dependencies are unavailable +Used by load balancers to determine if pod should receive traffic + +### /live/ - Liveness probe checks if the application process is alive +Simple health check with minimal logic +Returns 200 if Django is responding to requests +Used by Kubernetes to determine if pod should be restarted +Note: For detailed metrics and monitoring, use Prometheus and Alloy integration rather than custom health endpoints. \ No newline at end of file diff --git a/docs/Red Panda Standards_Django_V1-00.md b/docs/Red Panda Standards_Django_V1-00.md new file mode 100644 index 0000000..d65ac4b --- /dev/null +++ b/docs/Red Panda Standards_Django_V1-00.md @@ -0,0 +1,306 @@ +## 🐾 Red Panda Approval™ + +This project follows Red Panda Approval standards — our gold standard for Django application quality. Code must be elegant, reliable, and maintainable to earn the approval of our adorable red panda judges. + +### The 5 Sacred Django Criteria +1. **Fresh Migration Test** — Clean migrations from empty database +2. **Elegant Simplicity** — No unnecessary complexity +3. **Observable & Debuggable** — Proper logging and error handling +4. **Consistent Patterns** — Follow Django conventions +5. **Actually Works** — Passes all checks and serves real user needs + +## Environment Standards +- Virtual environment: ~/env/PROJECT/bin/activate +- Use pyproject.toml for project configuration (no setup.py, no requirements.txt) +- Python version: specified in pyproject.toml +- Dependencies: floor-pinned with ceiling (e.g. 
`Django>=5.2,<6.0`) + +### Dependency Pinning + +```toml +# Correct — floor pin with ceiling +dependencies = [ + "Django>=5.2,<6.0", + "djangorestframework>=3.14,<4.0", + "cryptography>=41.0,<45.0", +] + +# Wrong — exact pins in library packages +dependencies = [ + "Django==5.2.7", # too strict, breaks downstream +] +``` + +Exact pins (`==`) are only appropriate in application-level lock files, not in reusable library packages. + +## Directory Structure +myproject/ # Git repository root +├── .gitignore +├── README.md +├── pyproject.toml # Project configuration (moved to repo root) +├── docker-compose.yml +├── .env # Docker Compose environment (DATABASE_URL=postgres://...) +├── .env.example +│ +├── project/ # Django project root (manage.py lives here) +│ ├── manage.py +│ ├── Dockerfile +│ ├── .env # Local development environment (DATABASE_URL=sqlite:///...) +│ ├── .env.example +│ │ +│ ├── config/ # Django configuration module +│ │ ├── __init__.py +│ │ ├── settings.py +│ │ ├── urls.py +│ │ ├── wsgi.py +│ │ └── asgi.py +│ │ +│ ├── accounts/ # Django app +│ │ ├── __init__.py +│ │ ├── models.py +│ │ ├── views.py +│ │ └── urls.py +│ │ +│ ├── blog/ # Django app +│ │ ├── __init__.py +│ │ ├── models.py +│ │ ├── views.py +│ │ └── urls.py +│ │ +│ ├── static/ +│ │ ├── css/ +│ │ └── js/ +│ │ +│ └── templates/ +│ └── base.html +│ +├── web/ # Nginx configuration +│ └── nginx.conf +│ +├── db/ # PostgreSQL configuration +│ └── postgresql.conf +│ +└── docs/ # Project documentation + └── index.md + +## Settings Structure +- Use a single settings.py file +- Use django-environ or python-dotenv for environment variables +- Never commit .env files to version control +- Provide .env.example with all required variables documented +- Create .gitignore file +- Create a .dockerignore file + +## Code Organization +- Imports: PEP 8 ordering (stdlib, third-party, local) +- Type hints on function parameters +- CSS: External .css files only (no inline styles, no embedded 
`",rE:!0,sL:"css"}},{cN:"tag",b:"|$)",e:">",k:{title:"script"},c:[c],starts:{e:"",rE:!0,sL:""}},s,{cN:"pi",b:/<\?\w+/,e:/\?>/,r:10},{cN:"tag",b:"",c:[{cN:"title",b:/[^ \/><\n\t]+/,r:0},c]}]}});hljs.registerLanguage("autohotkey",function(e){var r={cN:"escape",b:"`[\\s\\S]"},c=e.C(";","$",{r:0}),n=[{cN:"built_in",b:"A_[a-zA-Z0-9]+"},{cN:"built_in",bK:"ComSpec Clipboard ClipboardAll ErrorLevel"}];return{cI:!0,k:{keyword:"Break Continue Else Gosub If Loop Return While",literal:"A true false NOT AND OR"},c:n.concat([r,e.inherit(e.QSM,{c:[r]}),c,{cN:"number",b:e.NR,r:0},{cN:"var_expand",b:"%",e:"%",i:"\\n",c:[r]},{cN:"label",c:[r],v:[{b:'^[^\\n";]+::(?!=)'},{b:'^[^\\n";]+:(?!=)',r:0}]},{b:",\\s*,",r:10}])}});hljs.registerLanguage("r",function(e){var r="([a-zA-Z]|\\.[a-zA-Z.])[a-zA-Z0-9._]*";return{c:[e.HCM,{b:r,l:r,k:{keyword:"function if in break next repeat else for return switch while try tryCatch stop warning require library attach detach source setMethod setGeneric setGroupGeneric setClass ...",literal:"NULL NA TRUE FALSE T F Inf NaN NA_integer_|10 NA_real_|10 NA_character_|10 NA_complex_|10"},r:0},{cN:"number",b:"0[xX][0-9a-fA-F]+[Li]?\\b",r:0},{cN:"number",b:"\\d+(?:[eE][+\\-]?\\d*)?L\\b",r:0},{cN:"number",b:"\\d+\\.(?!\\d)(?:i\\b)?",r:0},{cN:"number",b:"\\d+(?:\\.\\d*)?(?:[eE][+\\-]?\\d*)?i?\\b",r:0},{cN:"number",b:"\\.\\d+(?:[eE][+\\-]?\\d*)?i?\\b",r:0},{b:"`",e:"`",r:0},{cN:"string",c:[e.BE],v:[{b:'"',e:'"'},{b:"'",e:"'"}]}]}});hljs.registerLanguage("cs",function(e){var r="abstract as base bool break byte case catch char checked const continue decimal dynamic default delegate do double else enum event explicit extern false finally fixed float for foreach goto if implicit in int interface internal is lock long null when object operator out override params private protected public readonly ref sbyte sealed short sizeof stackalloc static string struct switch this true try typeof uint ulong unchecked unsafe ushort using virtual volatile void while async protected 
public private internal ascending descending from get group into join let orderby partial select set value var where yield",t=e.IR+"(<"+e.IR+">)?";return{aliases:["csharp"],k:r,i:/::/,c:[e.C("///","$",{rB:!0,c:[{cN:"xmlDocTag",v:[{b:"///",r:0},{b:""},{b:""}]}]}),e.CLCM,e.CBCM,{cN:"preprocessor",b:"#",e:"$",k:"if else elif endif define undef warning error line region endregion pragma checksum"},{cN:"string",b:'@"',e:'"',c:[{b:'""'}]},e.ASM,e.QSM,e.CNM,{bK:"class namespace interface",e:/[{;=]/,i:/[^\s:]/,c:[e.TM,e.CLCM,e.CBCM]},{bK:"new return throw await",r:0},{cN:"function",b:"("+t+"\\s+)+"+e.IR+"\\s*\\(",rB:!0,e:/[{;=]/,eE:!0,k:r,c:[{b:e.IR+"\\s*\\(",rB:!0,c:[e.TM],r:0},{cN:"params",b:/\(/,e:/\)/,k:r,r:0,c:[e.ASM,e.QSM,e.CNM,e.CBCM]},e.CLCM,e.CBCM]}]}});hljs.registerLanguage("nsis",function(e){var t={cN:"symbol",b:"\\$(ADMINTOOLS|APPDATA|CDBURN_AREA|CMDLINE|COMMONFILES32|COMMONFILES64|COMMONFILES|COOKIES|DESKTOP|DOCUMENTS|EXEDIR|EXEFILE|EXEPATH|FAVORITES|FONTS|HISTORY|HWNDPARENT|INSTDIR|INTERNET_CACHE|LANGUAGE|LOCALAPPDATA|MUSIC|NETHOOD|OUTDIR|PICTURES|PLUGINSDIR|PRINTHOOD|PROFILE|PROGRAMFILES32|PROGRAMFILES64|PROGRAMFILES|QUICKLAUNCH|RECENT|RESOURCES_LOCALIZED|RESOURCES|SENDTO|SMPROGRAMS|SMSTARTUP|STARTMENU|SYSDIR|TEMP|TEMPLATES|VIDEOS|WINDIR)"},n={cN:"constant",b:"\\$+{[a-zA-Z0-9_]+}"},i={cN:"variable",b:"\\$+[a-zA-Z0-9_]+",i:"\\(\\){}"},r={cN:"constant",b:"\\$+\\([a-zA-Z0-9_]+\\)"},o={cN:"params",b:"(ARCHIVE|FILE_ATTRIBUTE_ARCHIVE|FILE_ATTRIBUTE_NORMAL|FILE_ATTRIBUTE_OFFLINE|FILE_ATTRIBUTE_READONLY|FILE_ATTRIBUTE_SYSTEM|FILE_ATTRIBUTE_TEMPORARY|HKCR|HKCU|HKDD|HKEY_CLASSES_ROOT|HKEY_CURRENT_CONFIG|HKEY_CURRENT_USER|HKEY_DYN_DATA|HKEY_LOCAL_MACHINE|HKEY_PERFORMANCE_DATA|HKEY_USERS|HKLM|HKPD|HKU|IDABORT|IDCANCEL|IDIGNORE|IDNO|IDOK|IDRETRY|IDYES|MB_ABORTRETRYIGNORE|MB_DEFBUTTON1|MB_DEFBUTTON2|MB_DEFBUTTON3|MB_DEFBUTTON4|MB_ICONEXCLAMATION|MB_ICONINFORMATION|MB_ICONQUESTION|MB_ICONSTOP|MB_OK|MB_OKCANCEL|MB_RETRYCANCEL|MB_RIGHT|MB_RTLREADING|MB_SETFOREGROUND|MB_TOPMOS
T|MB_USERICON|MB_YESNO|NORMAL|OFFLINE|READONLY|SHCTX|SHELL_CONTEXT|SYSTEM|TEMPORARY)"},l={cN:"constant",b:"\\!(addincludedir|addplugindir|appendfile|cd|define|delfile|echo|else|endif|error|execute|finalize|getdllversionsystem|ifdef|ifmacrodef|ifmacrondef|ifndef|if|include|insertmacro|macroend|macro|makensis|packhdr|searchparse|searchreplace|tempfile|undef|verbose|warning)"};return{cI:!1,k:{keyword:"Abort AddBrandingImage AddSize AllowRootDirInstall AllowSkipFiles AutoCloseWindow BGFont BGGradient BrandingText BringToFront Call CallInstDLL Caption ChangeUI CheckBitmap ClearErrors CompletedText ComponentText CopyFiles CRCCheck CreateDirectory CreateFont CreateShortCut Delete DeleteINISec DeleteINIStr DeleteRegKey DeleteRegValue DetailPrint DetailsButtonText DirText DirVar DirVerify EnableWindow EnumRegKey EnumRegValue Exch Exec ExecShell ExecWait ExpandEnvStrings File FileBufSize FileClose FileErrorText FileOpen FileRead FileReadByte FileReadUTF16LE FileReadWord FileSeek FileWrite FileWriteByte FileWriteUTF16LE FileWriteWord FindClose FindFirst FindNext FindWindow FlushINI FunctionEnd GetCurInstType GetCurrentAddress GetDlgItem GetDLLVersion GetDLLVersionLocal GetErrorLevel GetFileTime GetFileTimeLocal GetFullPathName GetFunctionAddress GetInstDirError GetLabelAddress GetTempFileName Goto HideWindow Icon IfAbort IfErrors IfFileExists IfRebootFlag IfSilent InitPluginsDir InstallButtonText InstallColors InstallDir InstallDirRegKey InstProgressFlags InstType InstTypeGetText InstTypeSetText IntCmp IntCmpU IntFmt IntOp IsWindow LangString LicenseBkColor LicenseData LicenseForceSelection LicenseLangString LicenseText LoadLanguageFile LockWindow LogSet LogText ManifestDPIAware ManifestSupportedOS MessageBox MiscButtonText Name Nop OutFile Page PageCallbacks PageExEnd Pop Push Quit ReadEnvStr ReadINIStr ReadRegDWORD ReadRegStr Reboot RegDLL Rename RequestExecutionLevel ReserveFile Return RMDir SearchPath SectionEnd SectionGetFlags SectionGetInstTypes SectionGetSize 
SectionGetText SectionGroupEnd SectionIn SectionSetFlags SectionSetInstTypes SectionSetSize SectionSetText SendMessage SetAutoClose SetBrandingImage SetCompress SetCompressor SetCompressorDictSize SetCtlColors SetCurInstType SetDatablockOptimize SetDateSave SetDetailsPrint SetDetailsView SetErrorLevel SetErrors SetFileAttributes SetFont SetOutPath SetOverwrite SetPluginUnload SetRebootFlag SetRegView SetShellVarContext SetSilent ShowInstDetails ShowUninstDetails ShowWindow SilentInstall SilentUnInstall Sleep SpaceTexts StrCmp StrCmpS StrCpy StrLen SubCaption SubSectionEnd Unicode UninstallButtonText UninstallCaption UninstallIcon UninstallSubCaption UninstallText UninstPage UnRegDLL Var VIAddVersionKey VIFileVersion VIProductVersion WindowIcon WriteINIStr WriteRegBin WriteRegDWORD WriteRegExpandStr WriteRegStr WriteUninstaller XPStyle",literal:"admin all auto both colored current false force hide highest lastused leave listonly none normal notset off on open print show silent silentlog smooth textonly true user "},c:[e.HCM,e.CBCM,{cN:"string",b:'"',e:'"',i:"\\n",c:[{cN:"symbol",b:"\\$(\\\\(n|r|t)|\\$)"},t,n,i,r]},e.C(";","$",{r:0}),{cN:"function",bK:"Function PageEx Section SectionGroup SubSection",e:"$"},l,n,i,r,o,e.NM,{cN:"literal",b:e.IR+"::"+e.IR}]}});hljs.registerLanguage("less",function(e){var r="[\\w-]+",t="("+r+"|@{"+r+"})",a=[],c=[],n=function(e){return{cN:"string",b:"~?"+e+".*?"+e}},i=function(e,r,t){return{cN:e,b:r,r:t}},s=function(r,t,a){return e.inherit({cN:r,b:t+"\\(",e:"\\(",rB:!0,eE:!0,r:0},a)},b={b:"\\(",e:"\\)",c:c,r:0};c.push(e.CLCM,e.CBCM,n("'"),n('"'),e.CSSNM,i("hexcolor","#[0-9A-Fa-f]+\\b"),s("function","(url|data-uri)",{starts:{cN:"string",e:"[\\)\\n]",eE:!0}}),s("function",r),b,i("variable","@@?"+r,10),i("variable","@{"+r+"}"),i("built_in","~?`[^`]*?`"),{cN:"attribute",b:r+"\\s*:",e:":",rB:!0,eE:!0});var o=c.concat({b:"{",e:"}",c:a}),u={bK:"when",eW:!0,c:[{bK:"and 
not"}].concat(c)},C={cN:"attribute",b:t,e:":",eE:!0,c:[e.CLCM,e.CBCM],i:/\S/,starts:{e:"[;}]",rE:!0,c:c,i:"[<=$]"}},l={cN:"at_rule",b:"@(import|media|charset|font-face|(-[a-z]+-)?keyframes|supports|document|namespace|page|viewport|host)\\b",starts:{e:"[;{}]",rE:!0,c:c,r:0}},d={cN:"variable",v:[{b:"@"+r+"\\s*:",r:15},{b:"@"+r}],starts:{e:"[;}]",rE:!0,c:o}},p={v:[{b:"[\\.#:&\\[]",e:"[;{}]"},{b:t+"[^;]*{",e:"{"}],rB:!0,rE:!0,i:"[<='$\"]",c:[e.CLCM,e.CBCM,u,i("keyword","all\\b"),i("variable","@{"+r+"}"),i("tag",t+"%?",0),i("id","#"+t),i("class","\\."+t,0),i("keyword","&",0),s("pseudo",":not"),s("keyword",":extend"),i("pseudo","::?"+t),{cN:"attr_selector",b:"\\[",e:"\\]"},{b:"\\(",e:"\\)",c:o},{b:"!important"}]};return a.push(e.CLCM,e.CBCM,l,d,p,C),{cI:!0,i:"[=>'/<($\"]",c:a}});hljs.registerLanguage("pf",function(t){var o={cN:"variable",b:/\$[\w\d#@][\w\d_]*/},e={cN:"variable",b://};return{aliases:["pf.conf"],l:/[a-z0-9_<>-]+/,k:{built_in:"block match pass load anchor|5 antispoof|10 set table",keyword:"in out log quick on rdomain inet inet6 proto from port os to routeallow-opts divert-packet divert-reply divert-to flags group icmp-typeicmp6-type label once probability recieved-on rtable prio queuetos tag tagged user keep fragment for os dropaf-to|10 binat-to|10 nat-to|10 rdr-to|10 bitmask least-stats random round-robinsource-hash static-portdup-to reply-to route-toparent bandwidth default min max qlimitblock-policy debug fingerprints hostid limit loginterface optimizationreassemble ruleset-optimization basic none profile skip state-defaultsstate-policy timeoutconst counters persistno modulate synproxy state|5 floating if-bound no-sync pflow|10 sloppysource-track global rule max-src-nodes max-src-states max-src-connmax-src-conn-rate overload flushscrub|5 max-mss min-ttl no-df|10 random-id",literal:"all any no-route self urpf-failed egress|5 unknown"},c:[t.HCM,t.NM,t.QSM,o,e]}});hljs.registerLanguage("lasso",function(e){var 
r="[a-zA-Z_][a-zA-Z0-9_.]*",a="<\\?(lasso(script)?|=)",t="\\]|\\?>",s={literal:"true false none minimal full all void and or not bw nbw ew new cn ncn lt lte gt gte eq neq rx nrx ft",built_in:"array date decimal duration integer map pair string tag xml null boolean bytes keyword list locale queue set stack staticarray local var variable global data self inherited",keyword:"error_code error_msg error_pop error_push error_reset cache database_names database_schemanames database_tablenames define_tag define_type email_batch encode_set html_comment handle handle_error header if inline iterate ljax_target link link_currentaction link_currentgroup link_currentrecord link_detail link_firstgroup link_firstrecord link_lastgroup link_lastrecord link_nextgroup link_nextrecord link_prevgroup link_prevrecord log loop namespace_using output_none portal private protect records referer referrer repeating resultset rows search_args search_arguments select sort_args sort_arguments thread_atomic value_list while abort case else if_empty if_false if_null if_true loop_abort loop_continue loop_count params params_up return return_value run_children soap_definetag soap_lastrequest soap_lastresponse tag_name ascending average by define descending do equals frozen group handle_failure import in into join let match max min on order parent protected provide public require returnhome skip split_thread sum take thread to trait type where with yield 
yieldhome"},n=e.C("",{r:0}),o={cN:"preprocessor",b:"\\[noprocess\\]",starts:{cN:"markup",e:"\\[/noprocess\\]",rE:!0,c:[n]}},i={cN:"preprocessor",b:"\\[/noprocess|"+a},l={cN:"variable",b:"'"+r+"'"},c=[e.CLCM,{cN:"javadoc",b:"/\\*\\*!",e:"\\*/",c:[e.PWM]},e.CBCM,e.inherit(e.CNM,{b:e.CNR+"|(-?infinity|nan)\\b"}),e.inherit(e.ASM,{i:null}),e.inherit(e.QSM,{i:null}),{cN:"string",b:"`",e:"`"},{cN:"variable",v:[{b:"[#$]"+r},{b:"#",e:"\\d+",i:"\\W"}]},{cN:"tag",b:"::\\s*",e:r,i:"\\W"},{cN:"attribute",v:[{b:"-"+e.UIR,r:0},{b:"(\\.\\.\\.)"}]},{cN:"subst",v:[{b:"->\\s*",c:[l]},{b:":=|/(?!\\w)=?|[-+*%=<>&|!?\\\\]+",r:0}]},{cN:"built_in",b:"\\.\\.?\\s*",r:0,c:[l]},{cN:"class",bK:"define",rE:!0,e:"\\(|=>",c:[e.inherit(e.TM,{b:e.UIR+"(=(?!>))?"})]}];return{aliases:["ls","lassoscript"],cI:!0,l:r+"|&[lg]t;",k:s,c:[{cN:"preprocessor",b:t,r:0,starts:{cN:"markup",e:"\\[|"+a,rE:!0,r:0,c:[n]}},o,i,{cN:"preprocessor",b:"\\[no_square_brackets",starts:{e:"\\[/no_square_brackets\\]",l:r+"|&[lg]t;",k:s,c:[{cN:"preprocessor",b:t,r:0,starts:{cN:"markup",e:"\\[noprocess\\]|"+a,rE:!0,c:[n]}},o,i].concat(c)}},{cN:"preprocessor",b:"\\[",r:0},{cN:"shebang",b:"^#!.+lasso9\\b",r:10}].concat(c)}});hljs.registerLanguage("prolog",function(c){var r={cN:"atom",b:/[a-z][A-Za-z0-9_]*/,r:0},b={cN:"name",v:[{b:/[A-Z][a-zA-Z0-9_]*/},{b:/_[A-Za-z0-9_]*/}],r:0},a={b:/\(/,e:/\)/,r:0},e={b:/\[/,e:/\]/},n={cN:"comment",b:/%/,e:/$/,c:[c.PWM]},t={cN:"string",b:/`/,e:/`/,c:[c.BE]},g={cN:"string",b:/0\'(\\\'|.)/},N={cN:"string",b:/0\'\\s/},o={b:/:-/},s=[r,b,a,o,e,n,c.CBCM,c.QSM,c.ASM,t,g,N,c.CNM];return a.c=s,e.c=s,{c:s.concat([{b:/\.$/}])}});hljs.registerLanguage("oxygene",function(e){var r="abstract add and array as asc aspect assembly async begin break block by case class concat const copy constructor continue create default delegate desc distinct div do downto dynamic each else empty end ensure enum equals event except exit extension external false final finalize finalizer finally flags for forward from function 
future global group has if implementation implements implies in index inherited inline interface into invariants is iterator join locked locking loop matching method mod module namespace nested new nil not notify nullable of old on operator or order out override parallel params partial pinned private procedure property protected public queryable raise read readonly record reintroduce remove repeat require result reverse sealed select self sequence set shl shr skip static step soft take then to true try tuple type union unit unsafe until uses using var virtual raises volatile where while with write xor yield await mapped deprecated stdcall cdecl pascal register safecall overload library platform reference packed strict published autoreleasepool selector strong weak unretained",t=e.C("{","}",{r:0}),a=e.C("\\(\\*","\\*\\)",{r:10}),n={cN:"string",b:"'",e:"'",c:[{b:"''"}]},o={cN:"string",b:"(#\\d+)+"},i={cN:"function",bK:"function constructor destructor procedure method",e:"[:;]",k:"function constructor|10 destructor|10 procedure|10 method|10",c:[e.TM,{cN:"params",b:"\\(",e:"\\)",k:r,c:[n,o]},t,a]};return{cI:!0,k:r,i:'("|\\$[G-Zg-z]|\\/\\*||->)',c:[t,a,e.CLCM,n,o,e.NM,i,{cN:"class",b:"=\\bclass\\b",e:"end;",k:r,c:[n,o,t,a,e.CLCM,i]}]}});hljs.registerLanguage("applescript",function(e){var t=e.inherit(e.QSM,{i:""}),r={cN:"params",b:"\\(",e:"\\)",c:["self",e.CNM,t]},o=e.C("--","$"),n=e.C("\\(\\*","\\*\\)",{c:["self",o]}),a=[o,n,e.HCM];return{aliases:["osascript"],k:{keyword:"about above after against and around as at back before beginning behind below beneath beside between but by considering contain contains continue copy div does eighth else end equal equals error every exit fifth first for fourth from front get given global if ignoring in into is it its last local me middle mod my ninth not of on onto or over prop property put ref reference repeat returning script second set seventh since sixth some tell tenth that the|0 then third through thru timeout times to 
transaction try until where while whose with without",constant:"AppleScript false linefeed return pi quote result space tab true",type:"alias application boolean class constant date file integer list number real record string text",command:"activate beep count delay launch log offset read round run say summarize write",property:"character characters contents day frontmost id item length month name paragraph paragraphs rest reverse running time version weekday word words year"},c:[t,e.CNM,{cN:"type",b:"\\bPOSIX file\\b"},{cN:"command",b:"\\b(clipboard info|the clipboard|info for|list (disks|folder)|mount volume|path to|(close|open for) access|(get|set) eof|current date|do shell script|get volume settings|random number|set volume|system attribute|system info|time to GMT|(load|run|store) script|scripting components|ASCII (character|number)|localized string|choose (application|color|file|file name|folder|from list|remote application|URL)|display (alert|dialog))\\b|^\\s*return\\b"},{cN:"constant",b:"\\b(text item delimiters|current application|missing value)\\b"},{cN:"keyword",b:"\\b(apart from|aside from|instead of|out of|greater than|isn't|(doesn't|does not) (equal|come before|come after|contain)|(greater|less) than( or equal)?|(starts?|ends|begins?) 
with|contained by|comes (before|after)|a (ref|reference))\\b"},{cN:"property",b:"\\b(POSIX path|(date|time) string|quoted form)\\b"},{cN:"function_start",bK:"on",i:"[${=;\\n]",c:[e.UTM,r]}].concat(a),i:"//|->|=>"}});hljs.registerLanguage("makefile",function(e){var a={cN:"variable",b:/\$\(/,e:/\)/,c:[e.BE]};return{aliases:["mk","mak"],c:[e.HCM,{b:/^\w+\s*\W*=/,rB:!0,r:0,starts:{cN:"constant",e:/\s*\W*=/,eE:!0,starts:{e:/$/,r:0,c:[a]}}},{cN:"title",b:/^[\w]+:\s*$/},{cN:"phony",b:/^\.PHONY:/,e:/$/,k:".PHONY",l:/[\.\w]+/},{b:/^\t+/,e:/$/,r:0,c:[e.QSM,a]}]}});hljs.registerLanguage("dust",function(e){var a="if eq ne lt lte gt gte select default math sep";return{aliases:["dst"],cI:!0,sL:"xml",subLanguageMode:"continuous",c:[{cN:"expression",b:"{",e:"}",r:0,c:[{cN:"begin-block",b:"#[a-zA-Z- .]+",k:a},{cN:"string",b:'"',e:'"'},{cN:"end-block",b:"\\/[a-zA-Z- .]+",k:a},{cN:"variable",b:"[a-zA-Z-.]+",k:a,r:0}]}]}});hljs.registerLanguage("clojure-repl",function(e){return{c:[{cN:"prompt",b:/^([\w.-]+|\s*#_)=>/,starts:{e:/$/,sL:"clojure",subLanguageMode:"continuous"}}]}});hljs.registerLanguage("dart",function(e){var t={cN:"subst",b:"\\$\\{",e:"}",k:"true false null this is new super"},r={cN:"string",v:[{b:"r'''",e:"'''"},{b:'r"""',e:'"""'},{b:"r'",e:"'",i:"\\n"},{b:'r"',e:'"',i:"\\n"},{b:"'''",e:"'''",c:[e.BE,t]},{b:'"""',e:'"""',c:[e.BE,t]},{b:"'",e:"'",i:"\\n",c:[e.BE,t]},{b:'"',e:'"',i:"\\n",c:[e.BE,t]}]};t.c=[e.CNM,r];var n={keyword:"assert break case catch class const continue default do else enum extends false final finally for if in is new null rethrow return super switch this throw true try var void while with",literal:"abstract as dynamic export external factory get implements import library operator part set static typedef",built_in:"print Comparable DateTime Duration Function Iterable Iterator List Map Match Null Object Pattern RegExp Set Stopwatch String StringBuffer StringSink Symbol Type Uri bool double int num document window querySelector querySelectorAll Element 
ElementList"};return{k:n,c:[r,{cN:"dartdoc",b:"/\\*\\*",e:"\\*/",sL:"markdown",subLanguageMode:"continuous"},{cN:"dartdoc",b:"///",e:"$",sL:"markdown",subLanguageMode:"continuous"},e.CLCM,e.CBCM,{cN:"class",bK:"class interface",e:"{",eE:!0,c:[{bK:"extends implements"},e.UTM]},e.CNM,{cN:"annotation",b:"@[A-Za-z]+"},{b:"=>"}]}}); \ No newline at end of file diff --git a/mnemosyne/staticfiles/rest_framework/docs/js/jquery.json-view.min.js b/mnemosyne/staticfiles/rest_framework/docs/js/jquery.json-view.min.js new file mode 100644 index 0000000..ce3a604 --- /dev/null +++ b/mnemosyne/staticfiles/rest_framework/docs/js/jquery.json-view.min.js @@ -0,0 +1,7 @@ +/** + * jquery.json-view - jQuery collapsible JSON plugin + * @version v1.0.0 + * @link http://github.com/bazh/jquery.json-view + * @license MIT + */ +!function(e){"use strict";var n=function(n){var a=e("",{"class":"collapser",on:{click:function(){var n=e(this);n.toggleClass("collapsed");var a=n.parent().children(".block"),p=a.children("ul");n.hasClass("collapsed")?(p.hide(),a.children(".dots, .comments").show()):(p.show(),a.children(".dots, .comments").hide())}}});return n&&a.addClass("collapsed"),a},a=function(a,p){var t=e.extend({},{nl2br:!0},p),r=function(e){return e.toString()?e.toString().replace(/&/g,"&").replace(/"/g,""").replace(//g,">"):""},s=function(n,a){return e("",{"class":a,html:r(n)})},l=function(a,p){switch(e.type(a)){case"object":p||(p=0);var c=e("",{"class":"block"}),d=Object.keys(a).length;if(!d)return c.append(s("{","b")).append(" ").append(s("}","b"));c.append(s("{","b"));var i=e("