<!-- mnemosyne/docs/synesis_api_usage_guide.html -->

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Synesis — API Usage Guide</title>
    <!-- Bootstrap CSS -->
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <!-- Mermaid -->
    <script src="https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"></script>
</head>
<body>
    <div class="container-fluid">

        <!-- Navigation -->
        <nav class="navbar navbar-dark bg-dark rounded mb-4" aria-label="Page sections">
            <div class="container-fluid">
                <a class="navbar-brand" href="api_usage_guide.html">Synesis API Guide</a>
                <div class="navbar-nav d-flex flex-row">
                    <a class="nav-link me-3" href="#overview">Overview</a>
                    <a class="nav-link me-3" href="#architecture">Architecture</a>
                    <a class="nav-link me-3" href="#embeddings">Embeddings</a>
                    <a class="nav-link me-3" href="#reranking">Reranking</a>
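                    <a class="nav-link me-3" href="#dimensions">Dimensions</a>
                    <a class="nav-link me-3" href="#integration">Integration</a>
                    <a class="nav-link me-3" href="#similarity">Similarity</a>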
                    <a class="nav-link" href="#operations">Operations</a>
                </div>
            </div>
        </nav>

        <nav aria-label="breadcrumb">
            <ol class="breadcrumb">
                <li class="breadcrumb-item"><a href="api_usage_guide.html">Synesis</a></li>
                <li class="breadcrumb-item active" aria-current="page">API Usage Guide</li>
            </ol>
        </nav>

        <!-- Title -->
        <div class="row mb-4">
            <div class="col-12">
                <h1 class="display-4 mb-2">Synesis — API Usage Guide</h1>
                <p class="lead">Multimodal embedding and reranking service powered by Qwen3-VL-2B. Supports text, image, and mixed-modal inputs over a simple REST API.</p>
            </div>
        </div>

        <!-- ============================================================ -->
        <!-- OVERVIEW -->
        <!-- ============================================================ -->
        <section id="overview" class="mb-5">
            <h2 class="h2 mb-4">Overview</h2>

            <div class="row g-4 mb-4">
                <div class="col-lg-4">
                    <div class="card h-100">
                        <div class="card-body">
                            <h3 class="card-title text-primary">Embeddings</h3>
                            <p>Generate dense vector representations for text, images, or both. Vectors are suitable for semantic search, retrieval, clustering, and classification.</p>
                            <code>POST /v1/embeddings</code>
                        </div>
                    </div>
                </div>
                <div class="col-lg-4">
                    <div class="card h-100">
                        <div class="card-body">
                            <h3 class="card-title text-primary">Reranking</h3>
                            <p>Given a query and a list of candidate documents, score and sort them by relevance. Use after an initial retrieval step to improve precision.</p>
                            <code>POST /v1/rerank</code>
                        </div>
                    </div>
                </div>
                <div class="col-lg-4">
                    <div class="card h-100">
                        <div class="card-body">
                            <h3 class="card-title text-primary">Similarity</h3>
                            <p>Convenience endpoint to compute cosine similarity between two inputs without managing vectors yourself.</p>
                            <code>POST /v1/similarity</code>
                        </div>
                    </div>
                </div>
            </div>

            <div class="alert alert-info border-start border-4 border-info">
                <h3>Interactive API Explorer</h3>
                <p class="mb-0">Full request/response schemas, try-it-out functionality, and auto-generated curl examples are available at <strong><code>http://&lt;host&gt;:8400/docs</code></strong> (Swagger UI). Use it to experiment with every endpoint interactively.</p>
            </div>

            <div class="alert alert-secondary border-start border-4 border-secondary">
                <h3>Base URL</h3>
                <p>All endpoints are served from a single base URL. Configure this in your consuming application:</p>
                <pre class="mb-0">http://&lt;synesis-host&gt;:8400</pre>
                <p class="mt-2 mb-0">Default port is <code>8400</code>. The API itself performs no authentication, so restrict access with a network policy or firewall.</p>
            </div>
        </section>

        <!-- ============================================================ -->
        <!-- ARCHITECTURE -->
        <!-- ============================================================ -->
        <section id="architecture" class="mb-5">
            <h2 class="h2 mb-4">Architecture</h2>

            <div class="alert alert-info border-start border-4 border-info">
                <h3>Service Architecture</h3>
                <p>Synesis loads two Qwen3-VL-2B models into GPU memory at startup: one for embeddings and one for reranking. Both share the same NVIDIA 3090 (24 GB VRAM).</p>
            </div>

            <div class="card my-4">
                <div class="card-body">
                    <h3 class="card-title text-primary">Request Flow</h3>
                    <div class="mermaid">
graph LR
    Client["Client Application"] -->|HTTP POST| FastAPI["FastAPI<br/>:8400"]
    FastAPI -->|/v1/embeddings| Embedder["Qwen3-VL<br/>Embedder 2B"]
    FastAPI -->|/v1/rerank| Reranker["Qwen3-VL<br/>Reranker 2B"]
    FastAPI -->|/v1/similarity| Embedder
    Embedder --> GPU["NVIDIA 3090<br/>24 GB VRAM"]
    Reranker --> GPU
    FastAPI -->|/metrics| Prometheus["Prometheus"]
                    </div>
                </div>
            </div>

            <div class="card my-4">
                <div class="card-body">
                    <h3 class="card-title text-primary">Typical RAG Integration</h3>
                    <div class="mermaid">
sequenceDiagram
    participant App as Your Application
    participant Synesis as Synesis API
    participant VDB as Vector Database

    Note over App: Indexing Phase
    App->>Synesis: POST /v1/embeddings (documents)
    Synesis-->>App: embedding vectors
    App->>VDB: Store vectors + metadata

    Note over App: Query Phase
    App->>Synesis: POST /v1/embeddings (query)
    Synesis-->>App: query vector
    App->>VDB: ANN search (top 50)
    VDB-->>App: candidate documents
    App->>Synesis: POST /v1/rerank (query + candidates)
    Synesis-->>App: ranked results with scores
    App->>App: Use top 5-10 results
                    </div>
                </div>
            </div>
        </section>

        <!-- ============================================================ -->
        <!-- EMBEDDINGS -->
        <!-- ============================================================ -->
        <section id="embeddings" class="mb-5">
            <h2 class="h2 mb-4">Embeddings API</h2>

            <div class="alert alert-primary border-start border-4 border-primary">
                <h3>POST /v1/embeddings</h3>
                <p class="mb-0">Generate dense vector embeddings for one or more inputs. Each input can be text, an image, or both (multimodal).</p>
            </div>

            <!-- Request Schema -->
            <h3 class="mt-4">Request Body</h3>
            <table class="table table-bordered">
                <thead class="table-dark">
                    <tr>
                        <th>Field</th>
                        <th>Type</th>
                        <th>Required</th>
                        <th>Description</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td><code>inputs</code></td>
                        <td>array</td>
                        <td>Yes</td>
                        <td>List of items to embed (1 to <code>max_batch_size</code>).</td>
                    </tr>
                    <tr>
                        <td><code>inputs[].text</code></td>
                        <td>string</td>
                        <td>*</td>
                        <td>Text content. At least one of <code>text</code> or <code>image</code> is required.</td>
                    </tr>
                    <tr>
                        <td><code>inputs[].image</code></td>
                        <td>string</td>
                        <td>*</td>
                        <td>Image file path or URL. At least one of <code>text</code> or <code>image</code> is required.</td>
                    </tr>
                    <tr>
                        <td><code>inputs[].instruction</code></td>
                        <td>string</td>
                        <td>No</td>
                        <td>Optional task instruction to guide embedding (e.g. "Represent this document for retrieval").</td>
                    </tr>
                    <tr>
                        <td><code>dimension</code></td>
                        <td>int</td>
                        <td>No</td>
                        <td>Output vector dimension (64–2048). Default: 2048. See <a href="#dimensions">Dimensions</a>.</td>
                    </tr>
                    <tr>
                        <td><code>normalize</code></td>
                        <td>bool</td>
                        <td>No</td>
                        <td>L2-normalize output vectors. Default: <code>true</code>.</td>
                    </tr>
                </tbody>
            </table>

            <!-- Response Schema -->
            <h3 class="mt-4">Response Body</h3>
            <table class="table table-bordered">
                <thead class="table-dark">
                    <tr>
                        <th>Field</th>
                        <th>Type</th>
                        <th>Description</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td><code>embeddings[]</code></td>
                        <td>array</td>
                        <td>One embedding per input, in order.</td>
                    </tr>
                    <tr>
                        <td><code>embeddings[].index</code></td>
                        <td>int</td>
                        <td>Position in the input array.</td>
                    </tr>
                    <tr>
                        <td><code>embeddings[].embedding</code></td>
                        <td>float[]</td>
                        <td>The dense vector (length = <code>dimension</code>).</td>
                    </tr>
                    <tr>
                        <td><code>usage.input_count</code></td>
                        <td>int</td>
                        <td>Number of inputs processed.</td>
                    </tr>
                    <tr>
                        <td><code>usage.dimension</code></td>
                        <td>int</td>
                        <td>Dimension of returned vectors.</td>
                    </tr>
                    <tr>
                        <td><code>usage.elapsed_ms</code></td>
                        <td>float</td>
                        <td>Server-side processing time in milliseconds.</td>
                    </tr>
                </tbody>
            </table>

            <!-- Input Types -->
            <h3 class="mt-4">Input Modalities</h3>
            <div class="row g-4">
                <div class="col-lg-4">
                    <div class="card h-100">
                        <div class="card-body">
                            <h4 class="card-title text-primary">Text Only</h4>
                            <pre class="mb-0">{
  "inputs": [
    {"text": "quantum computing basics"},
    {"text": "machine learning tutorial"}
  ]
}</pre>
                        </div>
                    </div>
                </div>
                <div class="col-lg-4">
                    <div class="card h-100">
                        <div class="card-body">
                            <h4 class="card-title text-primary">Image Only</h4>
                            <pre class="mb-0">{
  "inputs": [
    {"image": "/data/photos/cat.jpg"},
    {"image": "https://example.com/dog.png"}
  ]
}</pre>
                        </div>
                    </div>
                </div>
                <div class="col-lg-4">
                    <div class="card h-100">
                        <div class="card-body">
                            <h4 class="card-title text-primary">Multimodal</h4>
                            <pre class="mb-0">{
  "inputs": [
    {
      "text": "product photo",
      "image": "/data/products/shoe.jpg"
    }
  ]
}</pre>
                        </div>
                    </div>
                </div>
            </div>
        </section>

        <!-- ============================================================ -->
        <!-- RERANKING -->
        <!-- ============================================================ -->
        <section id="reranking" class="mb-5">
            <h2 class="h2 mb-4">Reranking API</h2>

            <div class="alert alert-primary border-start border-4 border-primary">
                <h3>POST /v1/rerank</h3>
                <p class="mb-0">Score and rank a list of candidate documents against a query. Returns documents sorted by relevance (highest score first).</p>
            </div>

            <!-- Request Schema -->
            <h3 class="mt-4">Request Body</h3>
            <table class="table table-bordered">
                <thead class="table-dark">
                    <tr>
                        <th>Field</th>
                        <th>Type</th>
                        <th>Required</th>
                        <th>Description</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td><code>query</code></td>
                        <td>object</td>
                        <td>Yes</td>
                        <td>The query to rank against. Must contain <code>text</code>, <code>image</code>, or both.</td>
                    </tr>
                    <tr>
                        <td><code>query.text</code></td>
                        <td>string</td>
                        <td>*</td>
                        <td>Query text. At least one of <code>text</code> or <code>image</code> required.</td>
                    </tr>
                    <tr>
                        <td><code>query.image</code></td>
                        <td>string</td>
                        <td>*</td>
                        <td>Query image path or URL.</td>
                    </tr>
                    <tr>
                        <td><code>documents</code></td>
                        <td>array</td>
                        <td>Yes</td>
                        <td>Candidate documents to rerank (1 to <code>max_batch_size</code>).</td>
                    </tr>
                    <tr>
                        <td><code>documents[].text</code></td>
                        <td>string</td>
                        <td>*</td>
                        <td>Document text. At least one of <code>text</code> or <code>image</code> required per document.</td>
                    </tr>
                    <tr>
                        <td><code>documents[].image</code></td>
                        <td>string</td>
                        <td>*</td>
                        <td>Document image path or URL.</td>
                    </tr>
                    <tr>
                        <td><code>instruction</code></td>
                        <td>string</td>
                        <td>No</td>
                        <td>Task instruction (e.g. "Retrieve images relevant to the query.").</td>
                    </tr>
                    <tr>
                        <td><code>top_n</code></td>
                        <td>int</td>
                        <td>No</td>
                        <td>Return only the top N results. Default: return all.</td>
                    </tr>
                </tbody>
            </table>

            <!-- Response Schema -->
            <h3 class="mt-4">Response Body</h3>
            <table class="table table-bordered">
                <thead class="table-dark">
                    <tr>
                        <th>Field</th>
                        <th>Type</th>
                        <th>Description</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td><code>results[]</code></td>
                        <td>array</td>
                        <td>Documents sorted by relevance score (descending).</td>
                    </tr>
                    <tr>
                        <td><code>results[].index</code></td>
                        <td>int</td>
                        <td>Original position of this document in the input array.</td>
                    </tr>
                    <tr>
                        <td><code>results[].score</code></td>
                        <td>float</td>
                        <td>Relevance score (higher = more relevant).</td>
                    </tr>
                    <tr>
                        <td><code>results[].document</code></td>
                        <td>object</td>
                        <td>The document that was ranked (echoed back).</td>
                    </tr>
                    <tr>
                        <td><code>usage.query_count</code></td>
                        <td>int</td>
                        <td>Always 1.</td>
                    </tr>
                    <tr>
                        <td><code>usage.document_count</code></td>
                        <td>int</td>
                        <td>Total documents scored.</td>
                    </tr>
                    <tr>
                        <td><code>usage.returned_count</code></td>
                        <td>int</td>
                        <td>Number of results returned (respects <code>top_n</code>).</td>
                    </tr>
                    <tr>
                        <td><code>usage.elapsed_ms</code></td>
                        <td>float</td>
                        <td>Server-side processing time in milliseconds.</td>
                    </tr>
                </tbody>
            </table>

            <!-- Rerank Examples -->
            <h3 class="mt-4">Example: Text Query → Text Documents</h3>
            <div class="card my-3">
                <div class="card-body">
                    <pre class="mb-0">{
  "query": {"text": "How do neural networks learn?"},
  "documents": [
    {"text": "Neural networks adjust weights through backpropagation..."},
    {"text": "The stock market experienced a downturn in Q3..."},
    {"text": "Deep learning uses gradient descent to minimize loss..."},
    {"text": "Photosynthesis converts sunlight into chemical energy..."}
  ],
  "top_n": 2
}</pre>
                </div>
            </div>

            <h3 class="mt-4">Example: Text Query → Image Documents</h3>
            <div class="card my-3">
                <div class="card-body">
                    <pre class="mb-0">{
  "query": {"text": "melancholy album artwork"},
  "documents": [
    {"image": "/data/covers/cover1.jpg"},
    {"image": "/data/covers/cover2.jpg"},
    {"text": "dark moody painting", "image": "/data/covers/cover3.jpg"}
  ],
  "instruction": "Retrieve images relevant to the query.",
  "top_n": 2
}</pre>
                </div>
            </div>
        </section>

        <!-- ============================================================ -->
        <!-- DIMENSIONS, BATCHES, PERFORMANCE -->
        <!-- ============================================================ -->
        <section id="dimensions" class="mb-5">
            <h2 class="h2 mb-4">Dimensions, Batches &amp; Performance</h2>

            <div class="alert alert-danger border-start border-4 border-danger">
                <h3>Matryoshka Dimension Truncation</h3>
                <p>Synesis uses <strong>Matryoshka Representation Learning (MRL)</strong>. The model always computes full 2048-dimensional vectors internally, then truncates to your requested dimension. This means you can choose a dimension that balances <strong>quality vs. storage/speed</strong>.</p>
                <table class="table table-bordered mt-3 mb-0">
                    <thead class="table-dark">
                        <tr>
                            <th>Dimension</th>
                            <th>Vector Size</th>
                            <th>Quality</th>
                            <th>Use Case</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td><code>2048</code> (default)</td>
                            <td>8 KB / vector (float32)</td>
                            <td>Maximum</td>
                            <td>Highest accuracy retrieval, small collections</td>
                        </tr>
                        <tr>
                            <td><code>1024</code></td>
                            <td>4 KB / vector</td>
                            <td>Very high</td>
                            <td>Good balance for most production systems</td>
                        </tr>
                        <tr>
                            <td><code>512</code></td>
                            <td>2 KB / vector</td>
                            <td>High</td>
                            <td>Large-scale search with reasonable quality</td>
                        </tr>
                        <tr>
                            <td><code>256</code></td>
                            <td>1 KB / vector</td>
                            <td>Good</td>
                            <td>Very large collections, cost-sensitive</td>
                        </tr>
                        <tr>
                            <td><code>128</code></td>
                            <td>512 B / vector</td>
                            <td>Moderate</td>
                            <td>Rough filtering, pre-screening</td>
                        </tr>
                        <tr>
                            <td><code>64</code></td>
                            <td>256 B / vector</td>
                            <td>Basic</td>
                            <td>Coarse clustering, topic grouping</td>
                        </tr>
                    </tbody>
                </table>
            </div>

            <div class="alert alert-warning border-start border-4 border-warning">
                <h3>Important: Consistency</h3>
                <p class="mb-0">All vectors in the same index/collection <strong>must use the same dimension</strong>. Choose a dimension at index creation time and use it consistently for both indexing and querying. You cannot mix 512-d and 1024-d vectors in the same vector database index.</p>
            </div>

            <div class="alert alert-info border-start border-4 border-info">
                <h3>Batch Size &amp; Microbatching</h3>
                <p>The <code>max_batch_size</code> setting (default: <strong>32</strong>) controls the maximum number of inputs per API call. This is tuned for the 3090's 24 GB VRAM.</p>
                <ul>
                    <li><strong>Text-only inputs:</strong> Batch sizes up to 32 are safe.</li>
                    <li><strong>Image inputs:</strong> Images consume significantly more VRAM. Reduce batch sizes to 8–16 when embedding images, depending on resolution.</li>
                    <li><strong>Mixed-modal inputs:</strong> Treat as image batches for sizing purposes.</li>
                </ul>
                <h4>Microbatching Strategy</h4>
                <p>When processing large datasets (thousands of documents), <strong>do not send all items in a single request</strong>. Instead, implement client-side microbatching:</p>
                <ol class="mb-0">
                    <li>Split your dataset into chunks of 16–32 items.</li>
                    <li>Send each chunk as a separate <code>/v1/embeddings</code> request.</li>
                    <li>Collect and concatenate the resulting vectors.</li>
                    <li>For images, use smaller chunk sizes (8–16) to avoid OOM errors.</li>
                    <li>Add a small delay between requests if processing thousands of items to avoid GPU thermal throttling.</li>
                </ol>
            </div>

            <div class="alert alert-secondary border-start border-4 border-secondary">
                <h3>Reranking Batch Limits</h3>
                <p class="mb-0">The reranker also applies <code>max_batch_size</code> (default: 32) to the number of candidate documents per request. If you have more candidates than that, either pre-filter with embeddings first (recommended) or split them across multiple rerank calls and merge the results.</p>
            </div>
        </section>

        <!-- ============================================================ -->
        <!-- INTEGRATION GUIDE -->
        <!-- ============================================================ -->
        <section id="integration" class="mb-5">
            <h2 class="h2 mb-4">Integration Guide</h2>

            <div class="alert alert-primary border-start border-4 border-primary">
                <h3>Configuring a Consuming Application</h3>
                <p>To integrate Synesis into another system, configure these settings:</p>
                <table class="table table-bordered mt-3 mb-0">
                    <thead class="table-dark">
                        <tr>
                            <th>Setting</th>
                            <th>Value</th>
                            <th>Notes</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td>Embedding API URL</td>
                            <td><code>http://&lt;host&gt;:8400/v1/embeddings</code></td>
                            <td>POST, JSON body</td>
                        </tr>
                        <tr>
                            <td>Rerank API URL</td>
                            <td><code>http://&lt;host&gt;:8400/v1/rerank</code></td>
                            <td>POST, JSON body</td>
                        </tr>
                        <tr>
                            <td>Health check URL</td>
                            <td><code>http://&lt;host&gt;:8400/ready/</code></td>
                            <td>GET, 200 = ready</td>
                        </tr>
                        <tr>
                            <td>Embedding dimension</td>
                            <td><code>2048</code> (or your chosen value)</td>
                            <td>Must match vector DB index config</td>
                        </tr>
                        <tr>
                            <td>Authentication</td>
                            <td>None</td>
                            <td>Secure via network policy</td>
                        </tr>
                        <tr>
                            <td>Content-Type</td>
                            <td><code>application/json</code></td>
                            <td>All endpoints</td>
                        </tr>
                        <tr>
                            <td>Timeout</td>
                            <td>30–60 seconds</td>
                            <td>Image inputs take longer; adjust for batch size</td>
                        </tr>
                    </tbody>
                </table>
            </div>

            <h3 class="mt-4">Python Integration Example</h3>
            <div class="card my-3">
                <div class="card-body">
                    <pre class="mb-0">import requests

SYNESIS_URL = "http://synesis-host:8400"

# --- Generate embeddings ---
resp = requests.post(f"{SYNESIS_URL}/v1/embeddings", json={
    "inputs": [
        {"text": "How to train a neural network"},
        {"text": "Best practices for deep learning"},
    ],
    "dimension": 1024,
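}, timeout=60)  # 30-60 s recommended; image batches take longer
resp.raise_for_status()  # surface HTTP errors instead of parsing an error body
data = resp.json()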
vectors = [e["embedding"] for e in data["embeddings"]]
# vectors[0] is a list of 1024 floats

# --- Rerank candidates ---
resp = requests.post(f"{SYNESIS_URL}/v1/rerank", json={
    "query": {"text": "neural network training"},
    "documents": [
        {"text": "Backpropagation adjusts weights using gradients..."},
        {"text": "The weather forecast for tomorrow is sunny..."},
        {"text": "Stochastic gradient descent is an optimization method..."},
    ],
    "top_n": 2,
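}, timeout=60)  # 30-60 s recommended; image batches take longer
resp.raise_for_status()  # surface HTTP errors instead of parsing an error body
ranked = resp.json()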
for result in ranked["results"]:
    print(f"  #{result['index']} score={result['score']:.4f}")
    print(f"    {result['document']['text'][:80]}")</pre>
                </div>
            </div>

            <h3 class="mt-4">Typical Two-Stage Retrieval Pipeline</h3>
            <div class="alert alert-info border-start border-4 border-info">
                <ol class="mb-0">
                    <li><strong>Index time:</strong> Embed all documents via <code>/v1/embeddings</code> and store vectors in your vector database (e.g. pgvector, Qdrant, Milvus, Weaviate).</li>
                    <li><strong>Query time — Stage 1 (Recall):</strong> Embed the query via <code>/v1/embeddings</code>, perform approximate nearest neighbour (ANN) search in the vector DB to retrieve top 20–50 candidates.</li>
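                    <li><strong>Query time — Stage 2 (Precision):</strong> Pass the query and candidates to <code>/v1/rerank</code> to get precise relevance scores, sending at most <code>max_batch_size</code> documents (default: 32) per call and splitting larger candidate sets across several calls. Return the top 5–10 to the user or LLM context.</li>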
                </ol>
            </div>
        </section>

        <!-- ============================================================ -->
        <!-- SIMILARITY -->
        <!-- ============================================================ -->
        <section id="similarity" class="mb-5">
            <h2 class="h2 mb-4">Similarity API</h2>

            <div class="alert alert-primary border-start border-4 border-primary">
                <h3>POST /v1/similarity</h3>
                <p class="mb-0">Compute cosine similarity between exactly two inputs. A convenience wrapper — embeds both, normalizes, and returns the dot product.</p>
            </div>

            <h3 class="mt-4">Request Body</h3>
            <table class="table table-bordered">
                <thead class="table-dark">
                    <tr>
                        <th>Field</th>
                        <th>Type</th>
                        <th>Required</th>
                        <th>Description</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td><code>a</code></td>
                        <td>object</td>
                        <td>Yes</td>
                        <td>First input (<code>text</code>, <code>image</code>, or both).</td>
                    </tr>
                    <tr>
                        <td><code>b</code></td>
                        <td>object</td>
                        <td>Yes</td>
                        <td>Second input (<code>text</code>, <code>image</code>, or both).</td>
                    </tr>
                    <tr>
                        <td><code>dimension</code></td>
                        <td>int</td>
                        <td>No</td>
                        <td>Embedding dimension for comparison (64–2048). Default: 2048.</td>
                    </tr>
                </tbody>
            </table>

            <h3 class="mt-4">Response Body</h3>
            <table class="table table-bordered">
                <thead class="table-dark">
                    <tr>
                        <th>Field</th>
                        <th>Type</th>
                        <th>Description</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td><code>score</code></td>
                        <td>float</td>
                        <td>Cosine similarity (−1.0 to 1.0). Higher = more similar.</td>
                    </tr>
                    <tr>
                        <td><code>dimension</code></td>
                        <td>int</td>
                        <td>Dimension used for the comparison.</td>
                    </tr>
                </tbody>
            </table>
        </section>

        <!-- ============================================================ -->
        <!-- OPERATIONS -->
        <!-- ============================================================ -->
        <section id="operations" class="mb-5">
            <h2 class="h2 mb-4">Operations &amp; Monitoring</h2>

            <div class="alert alert-info border-start border-4 border-info">
                <h3>Health &amp; Readiness Endpoints</h3>
                <table class="table table-bordered mt-3 mb-0">
                    <thead class="table-dark">
                        <tr>
                            <th>Endpoint</th>
                            <th>Method</th>
                            <th>Purpose</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td><code>/ready/</code></td>
                            <td>GET</td>
                            <td>Readiness probe. Returns 200 when both models are loaded and the GPU is available; returns 503 otherwise. Use for load balancer health checks.</td>
                        </tr>
                        <tr>
                            <td><code>/live/</code></td>
                            <td>GET</td>
                            <td>Liveness probe. Returns 200 if the process is alive. Use for container restart decisions.</td>
                        </tr>
                        <tr>
                            <td><code>/health</code></td>
                            <td>GET</td>
                            <td>Detailed status: model paths, loaded state, GPU device name, VRAM usage.</td>
                        </tr>
                        <tr>
                            <td><code>/models</code><br><code>/v1/models</code></td>
                            <td>GET</td>
                            <td>List available models (OpenAI-compatible). Returns model IDs, capabilities, and metadata. Used by OpenAI SDK clients for model discovery.</td>
                        </tr>
                        <tr>
                            <td><code>/metrics</code></td>
                            <td>GET</td>
                            <td>Prometheus metrics (request counts, latency histograms, GPU memory, model status).</td>
                        </tr>
                    </tbody>
                </table>
            </div>

            <div class="alert alert-warning border-start border-4 border-warning">
                <h3>Prometheus Metrics</h3>
                <p>Key custom metrics exposed:</p>
                <ul class="mb-0">
                    <li><code>embedding_model_loaded</code> — Gauge (1 = loaded)</li>
                    <li><code>reranker_model_loaded</code> — Gauge (1 = loaded)</li>
                    <li><code>embedding_gpu_memory_bytes</code> — Gauge (current GPU allocation)</li>
                    <li><code>embedding_inference_requests_total{endpoint}</code> — Counter per endpoint (embeddings, similarity, rerank)</li>
                    <li><code>embedding_inference_duration_seconds{endpoint}</code> — Histogram of inference latency</li>
                    <li>Plus standard HTTP metrics from <code>prometheus-fastapi-instrumentator</code></li>
                </ul>
            </div>

            <div class="alert alert-secondary border-start border-4 border-secondary">
                <h3>Environment Configuration</h3>
                <p>All settings use the <code>EMBEDDING_</code> prefix and can be overridden via environment variables or <code>/etc/default/synesis</code>:</p>
                <table class="table table-bordered mt-3 mb-0">
                    <thead class="table-dark">
                        <tr>
                            <th>Variable</th>
                            <th>Default</th>
                            <th>Description</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td><code>EMBEDDING_MODEL_PATH</code></td>
                            <td><code>./models/Qwen3-VL-Embedding-2B</code></td>
                            <td>Path to embedding model weights</td>
                        </tr>
                        <tr>
                            <td><code>EMBEDDING_RERANKER_MODEL_PATH</code></td>
                            <td><code>./models/Qwen3-VL-Reranker-2B</code></td>
                            <td>Path to reranker model weights</td>
                        </tr>
                        <tr>
                            <td><code>EMBEDDING_TORCH_DTYPE</code></td>
                            <td><code>float16</code></td>
                            <td>Model precision (<code>float16</code> or <code>bfloat16</code>)</td>
                        </tr>
                        <tr>
                            <td><code>EMBEDDING_USE_FLASH_ATTENTION</code></td>
                            <td><code>true</code></td>
                            <td>Enable Flash Attention 2</td>
                        </tr>
                        <tr>
                            <td><code>EMBEDDING_DEFAULT_DIMENSION</code></td>
                            <td><code>2048</code></td>
                            <td>Default embedding dimension when not specified per request</td>
                        </tr>
                        <tr>
                            <td><code>EMBEDDING_MAX_BATCH_SIZE</code></td>
                            <td><code>32</code></td>
                            <td>Maximum inputs per request (both embeddings and rerank)</td>
                        </tr>
                        <tr>
                            <td><code>EMBEDDING_HOST</code></td>
                            <td><code>0.0.0.0</code></td>
                            <td>Bind address</td>
                        </tr>
                        <tr>
                            <td><code>EMBEDDING_PORT</code></td>
                            <td><code>8400</code></td>
                            <td>Listen port</td>
                        </tr>
                    </tbody>
                </table>
            </div>
        </section>

        <!-- ============================================================ -->
        <!-- ERROR HANDLING -->
        <!-- ============================================================ -->
        <section id="errors" class="mb-5">
            <h2 class="h2 mb-4">Error Handling</h2>

            <div class="alert alert-danger border-start border-4 border-danger">
                <h3>HTTP Status Codes</h3>
                <table class="table table-bordered mt-3 mb-0">
                    <thead class="table-dark">
                        <tr>
                            <th>Code</th>
                            <th>Meaning</th>
                            <th>Action</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td><code>200</code></td>
                            <td>Success</td>
                            <td>Process the response.</td>
                        </tr>
                        <tr>
                            <td><code>422</code></td>
                            <td>Validation error</td>
                            <td>Check your request body: the batch size may exceed <code>max_batch_size</code>, or required fields may be missing.</td>
                        </tr>
                        <tr>
                            <td><code>500</code></td>
                            <td>Inference error</td>
                            <td>The model failed during processing. Check the server logs; this may indicate an out-of-memory (OOM) condition with large image batches.</td>
                        </tr>
                        <tr>
                            <td><code>503</code></td>
                            <td>Model not loaded</td>
                            <td>The service is still starting up or a model failed to load. Poll <code>/ready/</code> and retry once it returns 200.</td>
                        </tr>
                    </tbody>
                </table>
            </div>
        </section>

        <!-- Footer -->
        <div class="alert alert-secondary border-start border-4 border-secondary">
            <p class="mb-0"><strong>Synesis v0.2.0</strong> — Qwen3-VL Embedding &amp; Reranking Service. For interactive API exploration, visit <code>/docs</code> on the running service.</p>
        </div>

    </div>

    <!-- Bootstrap JS -->
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>

    <!-- Mermaid init -->
    <script>
        // Choose the Mermaid theme from the OS-level colour-scheme preference
        // ('dark' when dark mode is preferred, 'default' otherwise) and hand it
        // to Mermaid together with startOnLoad. The IIFE keeps the helper
        // variables out of the global scope shared by the page's other scripts.
        (function () {
            var darkPreferred = window.matchMedia('(prefers-color-scheme: dark)').matches;
            var diagramTheme = 'default';
            if (darkPreferred) {
                diagramTheme = 'dark';
            }
            mermaid.initialize({ startOnLoad: true, theme: diagramTheme });
        })();
    </script>

    <!-- Dark mode support -->
    <script>
        // Mirror the OS-level colour-scheme preference onto the data-bs-theme
        // attribute of <html>, and keep it in sync when the preference changes.
        // Wrapped in an IIFE so the helpers do not become globals.
        (function () {
            // Evaluate the media query once and reuse the same MediaQueryList
            // for both the initial check and the change subscription.
            var darkQuery = window.matchMedia('(prefers-color-scheme: dark)');

            // Initial state: only force the attribute when dark is preferred;
            // otherwise the page keeps its default (light) appearance.
            if (darkQuery.matches) {
                document.documentElement.setAttribute('data-bs-theme', 'dark');
            }

            // Called with an object exposing `.matches` whenever the preference flips.
            function syncTheme(e) {
                document.documentElement.setAttribute('data-bs-theme', e.matches ? 'dark' : 'light');
            }

            if (typeof darkQuery.addEventListener === 'function') {
                darkQuery.addEventListener('change', syncTheme);
            } else if (typeof darkQuery.addListener === 'function') {
                // Older engines (e.g. Safari before 14) expose only the deprecated
                // addListener(); without this fallback the call above would throw.
                darkQuery.addListener(syncTheme);
            }
        })();
    </script>
</body>
</html>