Added benchmark command and test data

2026-03-24 17:05:46 +00:00 · 2026-03-24 17:05:46 +00:00 · 4ae45ff647
commit 4ae45ff647
parent 9afabeb6f7
4 changed files with 851 additions and 31 deletions
--- a/apps/accounts/management/commands/benchmark.py
+++ b/apps/accounts/management/commands/benchmark.py
@ -0,0 +1,484 @@
+import datetime
+import json
+import statistics
+import time
+from pathlib import Path
+
+import httpx
+from django.conf import settings
+from django.core.management.base import BaseCommand
+from django.db.models import Q
+from pgvector.django import CosineDistance
+
+from apps.accounts.models import Organization, Role, User
+from apps.knowledge.models import KnowledgeChunk, TrainingFile
+from apps.onboarding.models import OnboardingSession
+
+
+class Command(BaseCommand):
+    help = "Benchmark Dynavera system components: GPU inference server, pgvector retrieval, and database."
+
+    def add_arguments(self, parser):
+        parser.add_argument("--runs", type=int, default=5, help="Repetitions per latency benchmark (default: 5)")
+        parser.add_argument("--out", type=str, default="benchmarks", help="Output directory for the results file (default: benchmarks/)")
+        parser.add_argument("--skip-llm", action="store_true", help="Skip LLM inference benchmarks (each prompt takes ~30 s)")
+
+    def handle(self, *args, **options):
+        self.runs = options["runs"]
+        self.skip_llm = options["skip_llm"]
+        self.out_dir = Path(options["out"])
+        self.out_dir.mkdir(exist_ok=True)
+        self.results = {}
+
+        self.stdout.write(self.style.SUCCESS("\n=== Dynavera System Benchmark ==="))
+        self.stdout.write(f"  Inference endpoint : {settings.INFERENCE_URL}")
+        self.stdout.write(f"  Repetitions        : {self.runs}")
+        self.stdout.write(f"  LLM benchmarks     : {'SKIPPED (--skip-llm)' if self.skip_llm else 'ENABLED'}\n")
+
+        self._bench_health()
+        self._bench_embeddings()
+        self._bench_chunking()
+        if not self.skip_llm:
+            self._bench_llm()
+        self._bench_database()
+        self._bench_retrieval()
+        self._print_summary()
+        self._save_report()
+
+    def _req(self, method, path, **kwargs):
+        url = f"{settings.INFERENCE_URL}{path}"
+        resp = httpx.request(method, url, auth=settings.INFERENCE_AUTH, timeout=180, **kwargs)
+        resp.raise_for_status()
+        return resp.json()
+
+    def _time_fn(self, fn):
+        t0 = time.perf_counter()
+        result = fn()
+        return result, (time.perf_counter() - t0) * 1000
+
+    def _stats(self, times_ms):
+        s = sorted(times_ms)
+        n = len(s)
+        p95_idx = min(n - 1, int(-(-(0.95 * n) // 1)) - 1)
+        return {
+            "mean_ms": round(statistics.mean(s), 1),
+            "median_ms": round(statistics.median(s), 1),
+            "p95_ms": round(s[p95_idx], 1),
+            "min_ms": round(s[0], 1),
+            "max_ms": round(s[-1], 1),
+        }
+
+    def _bench_health(self):
+        self.stdout.write("[ 1/6 ] GPU server health check ...")
+        try:
+            data, ms = self._time_fn(lambda: self._req("GET", "/health"))
+            ok = data.get("status") == "ok"
+            self.results["health"] = {
+                "status": "OK" if ok else "DEGRADED",
+                "llm_ready": data.get("llm_ready", False),
+                "embed_ready": data.get("embedding_ready", False),
+                "latency_ms": round(ms, 1),
+            }
+            h = self.results["health"]
+            self.stdout.write(
+                f"  {h['status']}  |  LLM: {'ready' if h['llm_ready'] else 'unloaded'}  "
+                f"|  Embed: {'ready' if h['embed_ready'] else 'not ready'}  |  {ms:.0f} ms"
+            )
+        except Exception as exc:
+            self.results["health"] = {"status": "ERROR", "error": str(exc)}
+            self.stdout.write(self.style.ERROR(f"  FAILED: {exc}"))
+
+    def _bench_embeddings(self):
+        self.stdout.write(f"\n[ 2/6 ] Embedding latency  ({self.runs} runs × 3 query lengths) ...")
+        queries = {
+            "short   ": "What is onboarding?",
+            "medium  ": (
+                "Explain the process for configuring access control policies for a new software engineer "
+                "joining the platform team, including approval workflows and tool provisioning steps."
+            ),
+            "long    ": (
+                "A new hire on the infrastructure team needs to understand our CI/CD pipeline, deployment "
+                "procedures, incident response protocols, monitoring dashboards, on-call rotation policy, "
+                "and how to request access to production systems. Provide a comprehensive overview of all "
+                "these areas including the relevant tools, key contacts, and escalation procedures they "
+                "should be aware of during their first week and first month at the company."
+            ),
+        }
+        embed_results = {}
+        for label, query in queries.items():
+            times = []
+            for _ in range(self.runs):
+                _, ms = self._time_fn(lambda q=query: self._req("POST", "/v1/embeddings", json={"input": q}))
+                times.append(ms)
+            st = self._stats(times)
+            embed_results[label.strip()] = {"query_chars": len(query), **st}
+            self.stdout.write(
+                f"  {label}({len(query):4d} chars)  mean={st['mean_ms']:.0f} ms  "
+                f"p95={st['p95_ms']:.0f} ms  min={st['min_ms']:.0f} ms  max={st['max_ms']:.0f} ms"
+            )
+        self.results["embeddings"] = embed_results
+
+    def _bench_chunking(self):
+        self.stdout.write("\n[ 3/6 ] Semantic chunking latency ...")
+        texts = {
+            "small  (~200 c)": "a " * 100,
+            "medium (~2k c) ": (
+                "This section covers the onboarding process for new employees joining the engineering team. "
+                "You will learn about code review practices, deployment procedures, incident response, and "
+                "team communication protocols. Each topic is covered in depth with examples and references "
+                "to internal documentation. All engineers are expected to complete this module in week one. "
+            ) * 5,
+            "large  (~8k c) ": (
+                "The infrastructure team manages all cloud resources, CI/CD pipelines, and production environments. "
+                "New members are expected to understand Kubernetes cluster management, Terraform IaC, "
+                "GitLab CI pipeline authoring, monitoring with Grafana and Prometheus, and incident response procedures. "
+                "This document provides a comprehensive guide to each area including runbooks and escalation paths. "
+            ) * 20,
+        }
+        chunk_results = {}
+        for label, text in texts.items():
+            try:
+                result, ms = self._time_fn(lambda t=text: self._req("POST", "/v1/semantic-chunk", json={"text": t}))
+                n = len(result.get("chunks", []))
+                chunk_results[label.strip()] = {"chars": len(text), "chunks_produced": n, "latency_ms": round(ms, 1)}
+                self.stdout.write(f"  {label}  →  {n} chunks  |  {ms:.0f} ms")
+            except Exception as exc:
+                chunk_results[label.strip()] = {"error": str(exc)}
+                self.stdout.write(self.style.ERROR(f"  {label}  FAILED: {exc}"))
+        self.results["chunking"] = chunk_results
+
+    def _bench_llm(self):
+        self.stdout.write("\n[ 4/6 ] LLM inference latency  (each prompt is a single non-streaming call) ...")
+        prompts = [
+            {
+                "label": "short_qa",
+                "system": "You are an onboarding assistant.",
+                "user": "What does a Kubernetes pod do? Answer in 2 sentences.",
+                "max_tokens": 128,
+            },
+            {
+                "label": "progress_summary",
+                "system": "You are an onboarding assistant.",
+                "user": (
+                    "A trainee has completed: Git Basics, CI/CD Pipelines, Code Review. Score: 85%. "
+                    "Write a 2-sentence progress summary."
+                ),
+                "max_tokens": 128,
+            },
+            {
+                "label": "curriculum_gen",
+                "system": "You are an onboarding assistant. Output only a valid JSON array of strings.",
+                "user": (
+                    "Create a 6-module onboarding curriculum for a Software Engineer role focused on "
+                    "backend services. Output ONLY a JSON array of module title strings."
+                ),
+                "max_tokens": 256,
+            },
+            {
+                "label": "assessment_gen",
+                "system": "You are an onboarding assistant. Output only valid JSON.",
+                "user": (
+                    "Generate 3 multiple-choice questions to assess understanding of CI/CD pipelines. "
+                    "Output as a JSON array of objects with keys: question, options (array of 4), answer."
+                ),
+                "max_tokens": 512,
+            },
+            {
+                "label": "knowledge_explanation",
+                "system": "You are an onboarding assistant.",
+                "user": (
+                    "Explain Git branching strategy best practices for a new engineer. "
+                    "Cover: feature branches, naming conventions, merge vs rebase, and PR workflow. "
+                    "Use clear headings and bullet points. Target ~400 words."
+                ),
+                "max_tokens": 700,
+            },
+        ]
+        llm_results = {}
+        for p in prompts:
+            self.stdout.write(f"  {p['label']} (max_tokens={p['max_tokens']}) ...", ending="")
+            self.stdout.flush()
+            try:
+                t0 = time.perf_counter()
+                data = self._req(
+                    "POST",
+                    "/v1/chat/completions",
+                    json={
+                        "messages": [
+                            {"role": "system", "content": p["system"]},
+                            {"role": "user", "content": p["user"]},
+                        ],
+                        "max_tokens": p["max_tokens"],
+                        "stream": False,
+                    },
+                )
+                elapsed_s = time.perf_counter() - t0
+                usage = data.get("usage", {})
+                ct = usage.get("completion_tokens", 0)
+                pt = usage.get("prompt_tokens", 0)
+                tps = round(ct / elapsed_s, 1) if elapsed_s > 0 and ct > 0 else 0
+                preview = (data["choices"][0]["message"]["content"] or "")[:100].replace("\n", " ")
+                llm_results[p["label"]] = {
+                    "elapsed_s": round(elapsed_s, 2),
+                    "prompt_tokens": pt,
+                    "completion_tokens": ct,
+                    "tokens_per_sec": tps,
+                    "response_preview": preview,
+                }
+                self.stdout.write(f"  {elapsed_s:.1f} s  |  {ct} tokens  |  {tps} tok/s")
+            except Exception as exc:
+                llm_results[p["label"]] = {"error": str(exc)}
+                self.stdout.write(self.style.ERROR(f"  FAILED: {exc}"))
+        self.results["llm"] = llm_results
+
+    def _bench_database(self):
+        self.stdout.write("\n[ 5/6 ] Database statistics ...")
+        try:
+            from django.db import connection
+            with connection.cursor() as cur:
+                cur.execute("SELECT 1 FROM knowledge_knowledgechunk LIMIT 1")
+        except Exception:
+            self.stdout.write(self.style.WARNING("  Tables missing — run 'manage.py migrate' first. Skipping."))
+            self.results["database"] = {"skipped": "Migrations not applied."}
+            return
+        try:
+            self.results["database"] = {
+                "organizations": Organization.objects.count(),
+                "roles": Role.objects.count(),
+                "users": User.objects.count(),
+                "training_files_total": TrainingFile.objects.count(),
+                "training_files_embedded": TrainingFile.objects.filter(status="embedded").count(),
+                "knowledge_chunks_with_embeddings": KnowledgeChunk.objects.filter(embedding__isnull=False, is_active=True).count(),
+                "onboarding_sessions": OnboardingSession.objects.count(),
+            }
+            d = self.results["database"]
+            self.stdout.write(f"  Orgs: {d['organizations']}  |  Roles: {d['roles']}  |  Users: {d['users']}")
+            self.stdout.write(f"  Training files: {d['training_files_total']} total  ({d['training_files_embedded']} embedded)")
+            self.stdout.write(f"  Knowledge chunks (with embeddings): {d['knowledge_chunks_with_embeddings']}")
+            self.stdout.write(f"  Onboarding sessions: {d['onboarding_sessions']}")
+        except Exception as exc:
+            self.results["database"] = {"error": str(exc)}
+            self.stdout.write(self.style.ERROR(f"  FAILED: {exc}"))
+
+    def _bench_retrieval(self):
+        self.stdout.write(f"\n[ 6/6 ] pgvector retrieval latency  ({self.runs} runs × top-k ∈ [5, 10, 20]) ...")
+        try:
+            role = Role.objects.filter(knowledge_chunks__embedding__isnull=False).distinct().first()
+        except Exception as exc:
+            self.stdout.write(self.style.WARNING(f"  DB not ready ({exc}). Skipping."))
+            self.results["retrieval"] = {"skipped": str(exc)}
+            return
+        if role is None:
+            self.stdout.write(self.style.WARNING("  No role with embedded chunks — skipping."))
+            self.results["retrieval"] = {"skipped": "No embedded chunks found in database."}
+            return
+
+        query = "What are the key responsibilities, tools, and procedures for this role?"
+        self.stdout.write(f"  Role: {role.name} (org: {role.organization.name})")
+        self.stdout.write(f"  Query: \"{query}\"")
+
+        try:
+            embed_data = self._req("POST", "/v1/embeddings", json={"input": query})
+            query_vector = embed_data["data"][0]["embedding"]
+        except Exception as exc:
+            self.results["retrieval"] = {"error": f"Could not generate query embedding: {exc}"}
+            self.stdout.write(self.style.ERROR(f"  FAILED to get embedding: {exc}"))
+            return
+
+        total_chunks = KnowledgeChunk.objects.filter(embedding__isnull=False, is_active=True).count()
+        retrieval_results = {}
+        for top_k in [5, 10, 20]:
+            times = []
+            n_returned = 0
+            for _ in range(self.runs):
+                t0 = time.perf_counter()
+                chunks = list(
+                    KnowledgeChunk.objects.filter(
+                        organization=role.organization,
+                        embedding__isnull=False,
+                        is_active=True,
+                    ).filter(
+                        Q(role=role) | Q(role__isnull=True)
+                    ).annotate(
+                        distance=CosineDistance("embedding", query_vector)
+                    ).order_by("distance")[:top_k]
+                )
+                times.append((time.perf_counter() - t0) * 1000)
+                n_returned = len(chunks)
+            st = self._stats(times)
+            retrieval_results[f"top_{top_k}"] = {"results_returned": n_returned, **st}
+            self.stdout.write(
+                f"  top-{top_k:2d}:  mean={st['mean_ms']:.1f} ms  "
+                f"p95={st['p95_ms']:.1f} ms  min={st['min_ms']:.1f} ms  max={st['max_ms']:.1f} ms"
+            )
+        self.results["retrieval"] = {
+            "role": role.name,
+            "organization": role.organization.name,
+            "query": query,
+            "total_chunks_in_db": total_chunks,
+            "results": retrieval_results,
+        }
+
+    def _print_summary(self):
+        self.stdout.write(self.style.SUCCESS("\n=== Summary ===\n"))
+        h = self.results.get("health", {})
+        self.stdout.write(f"  GPU Server     : {h.get('status', 'N/A')} — LLM {'ready' if h.get('llm_ready') else 'unloaded'}, embed {'ready' if h.get('embed_ready') else 'N/A'}")
+
+        emb = self.results.get("embeddings", {})
+        means = [v["mean_ms"] for v in emb.values() if "mean_ms" in v]
+        if means:
+            self.stdout.write(f"  Embedding      : {min(means):.0f}–{max(means):.0f} ms (mean across query lengths)")
+
+        chnk = self.results.get("chunking", {})
+        lats = [v["latency_ms"] for v in chnk.values() if "latency_ms" in v]
+        if lats:
+            self.stdout.write(f"  Chunking       : {min(lats):.0f}–{max(lats):.0f} ms range by text size")
+
+        llm = self.results.get("llm", {})
+        elapsed = [v["elapsed_s"] for v in llm.values() if "elapsed_s" in v]
+        tps_all = [v["tokens_per_sec"] for v in llm.values() if "tokens_per_sec" in v and v["tokens_per_sec"] > 0]
+        if elapsed:
+            self.stdout.write(
+                f"  LLM inference  : {min(elapsed):.1f}–{max(elapsed):.1f} s range"
+                + (f" | {statistics.mean(tps_all):.1f} tok/s avg" if tps_all else "")
+            )
+
+        ret = self.results.get("retrieval", {})
+        r5 = ret.get("results", {}).get("top_5", {})
+        if r5.get("mean_ms"):
+            self.stdout.write(f"  RAG retrieval  : {r5['mean_ms']:.1f} ms mean (top-5, {ret.get('total_chunks_in_db', '?')} total chunks)")
+
+        db = self.results.get("database", {})
+        if "knowledge_chunks_with_embeddings" in db:
+            self.stdout.write(
+                f"  Knowledge base : {db['knowledge_chunks_with_embeddings']} chunks from "
+                f"{db['training_files_embedded']} embedded files"
+            )
+
+    def _save_report(self):
+        ts = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        path = self.out_dir / f"results_{ts}.md"
+
+        lines = [
+            "# Dynavera Benchmark Results",
+            "",
+            f"**Date:** {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  ",
+            f"**Inference endpoint:** `{settings.INFERENCE_URL}`  ",
+            f"**Repetitions per benchmark:** {self.runs}  ",
+            "",
+        ]
+
+        h = self.results.get("health", {})
+        lines += [
+            "## 1. GPU Server Health",
+            "",
+            "| Field | Value |",
+            "|---|---|",
+            f"| Status | {h.get('status', 'N/A')} |",
+            f"| LLM Ready | {h.get('llm_ready', 'N/A')} |",
+            f"| Embed Ready | {h.get('embed_ready', 'N/A')} |",
+            f"| Health check RTT | {h.get('latency_ms', 'N/A')} ms |",
+            "",
+        ]
+
+        emb = self.results.get("embeddings", {})
+        if emb:
+            lines += [
+                "## 2. Embedding Latency",
+                "",
+                "| Query type | Chars | Mean (ms) | Median (ms) | P95 (ms) | Min (ms) | Max (ms) |",
+                "|---|---|---|---|---|---|---|",
+            ]
+            for label, v in emb.items():
+                if "mean_ms" in v:
+                    lines.append(f"| {label} | {v['query_chars']} | {v['mean_ms']} | {v['median_ms']} | {v['p95_ms']} | {v['min_ms']} | {v['max_ms']} |")
+            lines.append("")
+
+        chnk = self.results.get("chunking", {})
+        if chnk:
+            lines += [
+                "## 3. Semantic Chunking Latency",
+                "",
+                "| Input size | Chars | Chunks produced | Latency (ms) |",
+                "|---|---|---|---|",
+            ]
+            for label, v in chnk.items():
+                if "latency_ms" in v:
+                    lines.append(f"| {label} | {v['chars']} | {v['chunks_produced']} | {v['latency_ms']} |")
+            lines.append("")
+
+        llm = self.results.get("llm", {})
+        if llm:
+            lines += [
+                "## 4. LLM Inference Latency",
+                "",
+                "| Prompt type | Elapsed (s) | Prompt tokens | Completion tokens | Tok/s |",
+                "|---|---|---|---|---|",
+            ]
+            for label, v in llm.items():
+                if "elapsed_s" in v:
+                    lines.append(
+                        f"| {label} | {v['elapsed_s']} | {v['prompt_tokens']} | {v['completion_tokens']} | {v['tokens_per_sec']} |"
+                    )
+                else:
+                    lines.append(f"| {label} | ERROR | — | — | — |")
+            lines.append("")
+            lines += [
+                "> **Note on end-to-end session time:** A full onboarding session invokes multiple sequential",
+                "> inference calls (curriculum generation → knowledge explanation × N modules → assessment generation → progress summary).",
+                "> Total wall-clock time accumulates across all turns plus retrieval and tool-call overhead.",
+                "",
+            ]
+
+        db = self.results.get("database", {})
+        if db and "error" not in db:
+            lines += [
+                "## 5. Database Statistics",
+                "",
+                "| Entity | Count |",
+                "|---|---|",
+            ]
+            labels = {
+                "organizations": "Organizations",
+                "roles": "Roles",
+                "users": "Users",
+                "training_files_total": "Training Files (total)",
+                "training_files_embedded": "Training Files (embedded)",
+                "knowledge_chunks_with_embeddings": "Knowledge Chunks (with embeddings)",
+                "onboarding_sessions": "Onboarding Sessions",
+            }
+            for key, label in labels.items():
+                if key in db:
+                    lines.append(f"| {label} | {db[key]} |")
+            lines.append("")
+
+        ret = self.results.get("retrieval", {})
+        if "results" in ret:
+            lines += [
+                "## 6. pgvector Retrieval Latency",
+                "",
+                f"**Role:** {ret.get('role')}  ",
+                f"**Organisation:** {ret.get('organization')}  ",
+                f'**Query:** "{ret.get("query")}"  ',
+                f"**Total chunks in DB:** {ret.get('total_chunks_in_db')}  ",
+                "",
+                "| Top-K | Results returned | Mean (ms) | Median (ms) | P95 (ms) | Min (ms) | Max (ms) |",
+                "|---|---|---|---|---|---|---|",
+            ]
+            for k, v in ret["results"].items():
+                lines.append(
+                    f"| {k} | {v['results_returned']} | {v['mean_ms']} | {v['median_ms']} | {v['p95_ms']} | {v['min_ms']} | {v['max_ms']} |"
+                )
+            lines.append("")
+
+        lines += [
+            "## Raw JSON",
+            "",
+            "```json",
+            json.dumps(self.results, indent=2, default=str),
+            "```",
+            "",
+        ]
+
+        path.write_text("\n".join(lines), encoding="utf-8")
+        self.stdout.write(self.style.SUCCESS(f"\nResults saved → {path}"))
--- a/benchmarks/results_2026-03-24_13-28-54.md
+++ b/benchmarks/results_2026-03-24_13-28-54.md
@ -0,0 +1,161 @@
+# Dynavera Benchmark Results
+
+**Date:** 2026-03-24 13:28:54  
+**Inference endpoint:** `http://fyp-inference-dev:8001`  
+**Repetitions per benchmark:** 5  
+
+## 1. GPU Server Health
+
+| Field | Value |
+|---|---|
+| Status | OK |
+| LLM Ready | True |
+| Embed Ready | True |
+| Health check RTT | 51.0 ms |
+
+## 2. Embedding Latency
+
+| Query type | Chars | Mean (ms) | Median (ms) | P95 (ms) | Min (ms) | Max (ms) |
+|---|---|---|---|---|---|---|
+| short | 19 | 95.5 | 25.1 | 378.6 | 23.0 | 378.6 |
+| medium | 172 | 25.7 | 24.7 | 29.4 | 24.3 | 29.4 |
+| long | 428 | 27.5 | 26.7 | 32.2 | 24.8 | 32.2 |
+
+## 3. Semantic Chunking Latency
+
+| Input size | Chars | Chunks produced | Latency (ms) |
+|---|---|---|---|
+| small  (~200 c) | 200 | 1 | 28.4 |
+| medium (~2k c) | 1810 | 1 | 77.0 |
+| large  (~8k c) | 7740 | 1 | 206.3 |
+
+## 4. LLM Inference Latency
+
+| Prompt type | Elapsed (s) | Prompt tokens | Completion tokens | Tok/s |
+|---|---|---|---|---|
+| short_qa | 1.5 | 55 | 69 | 46.0 |
+| progress_summary | 1.36 | 74 | 71 | 52.3 |
+| curriculum_gen | 1.67 | 79 | 82 | 49.0 |
+| assessment_gen | 5.03 | 83 | 235 | 46.7 |
+| knowledge_explanation | 9.31 | 83 | 496 | 53.3 |
+
+> **Note on end-to-end session time:** A full onboarding session invokes multiple sequential
+> inference calls (curriculum generation → knowledge explanation × N modules → assessment generation → progress summary).
+> Total wall-clock time accumulates across all turns plus retrieval and tool-call overhead.
+
+## 5. Database Statistics
+
+| Entity | Count |
+|---|---|
+| Organizations | 3 |
+| Roles | 10 |
+| Users | 12 |
+| Training Files (total) | 0 |
+| Training Files (embedded) | 0 |
+| Knowledge Chunks (with embeddings) | 0 |
+| Onboarding Sessions | 4 |
+
+## Raw JSON
+
+```json
+{
+  "health": {
+    "status": "OK",
+    "llm_ready": true,
+    "embed_ready": true,
+    "latency_ms": 51.0
+  },
+  "embeddings": {
+    "short": {
+      "query_chars": 19,
+      "mean_ms": 95.5,
+      "median_ms": 25.1,
+      "p95_ms": 378.6,
+      "min_ms": 23.0,
+      "max_ms": 378.6
+    },
+    "medium": {
+      "query_chars": 172,
+      "mean_ms": 25.7,
+      "median_ms": 24.7,
+      "p95_ms": 29.4,
+      "min_ms": 24.3,
+      "max_ms": 29.4
+    },
+    "long": {
+      "query_chars": 428,
+      "mean_ms": 27.5,
+      "median_ms": 26.7,
+      "p95_ms": 32.2,
+      "min_ms": 24.8,
+      "max_ms": 32.2
+    }
+  },
+  "chunking": {
+    "small  (~200 c)": {
+      "chars": 200,
+      "chunks_produced": 1,
+      "latency_ms": 28.4
+    },
+    "medium (~2k c)": {
+      "chars": 1810,
+      "chunks_produced": 1,
+      "latency_ms": 77.0
+    },
+    "large  (~8k c)": {
+      "chars": 7740,
+      "chunks_produced": 1,
+      "latency_ms": 206.3
+    }
+  },
+  "llm": {
+    "short_qa": {
+      "elapsed_s": 1.5,
+      "prompt_tokens": 55,
+      "completion_tokens": 69,
+      "tokens_per_sec": 46.0,
+      "response_preview": "A Kubernetes pod is a logical host for one or more containers, providing a shared network namespace,"
+    },
+    "progress_summary": {
+      "elapsed_s": 1.36,
+      "prompt_tokens": 74,
+      "completion_tokens": 71,
+      "tokens_per_sec": 52.3,
+      "response_preview": "The trainee has made significant progress in their onboarding journey, demonstrating a strong founda"
+    },
+    "curriculum_gen": {
+      "elapsed_s": 1.67,
+      "prompt_tokens": 79,
+      "completion_tokens": 82,
+      "tokens_per_sec": 49.0,
+      "response_preview": "[   \"Module 1: Introduction to Backend Services and Infrastructure\",   \"Module 2: Designing and Impl"
+    },
+    "assessment_gen": {
+      "elapsed_s": 5.03,
+      "prompt_tokens": 83,
+      "completion_tokens": 235,
+      "tokens_per_sec": 46.7,
+      "response_preview": "```json [   {     \"question\": \"What is the primary purpose of a Continuous Integration (CI) pipeline"
+    },
+    "knowledge_explanation": {
+      "elapsed_s": 9.31,
+      "prompt_tokens": 83,
+      "completion_tokens": 496,
+      "tokens_per_sec": 53.3,
+      "response_preview": "**Git Branching Strategy Best Practices**  As a new engineer, understanding a Git branching strategy"
+    }
+  },
+  "database": {
+    "organizations": 3,
+    "roles": 10,
+    "users": 12,
+    "training_files_total": 0,
+    "training_files_embedded": 0,
+    "knowledge_chunks_with_embeddings": 0,
+    "onboarding_sessions": 4
+  },
+  "retrieval": {
+    "skipped": "No embedded chunks found in database."
+  }
+}
+```
--- a/benchmarks/results_2026-03-24_13-29-55.md
+++ b/benchmarks/results_2026-03-24_13-29-55.md
@ -0,0 +1,203 @@
+# Dynavera Benchmark Results
+
+**Date:** 2026-03-24 13:29:55  
+**Inference endpoint:** `http://fyp-inference-dev:8001`  
+**Repetitions per benchmark:** 10  
+
+## 1. GPU Server Health
+
+| Field | Value |
+|---|---|
+| Status | OK |
+| LLM Ready | True |
+| Embed Ready | True |
+| Health check RTT | 44.5 ms |
+
+## 2. Embedding Latency
+
+| Query type | Chars | Mean (ms) | Median (ms) | P95 (ms) | Min (ms) | Max (ms) |
+|---|---|---|---|---|---|---|
+| short | 19 | 25.0 | 25.3 | 31.9 | 20.8 | 31.9 |
+| medium | 172 | 24.0 | 22.8 | 31.8 | 21.0 | 31.8 |
+| long | 428 | 29.8 | 27.5 | 37.7 | 25.0 | 37.7 |
+
+## 3. Semantic Chunking Latency
+
+| Input size | Chars | Chunks produced | Latency (ms) |
+|---|---|---|---|
+| small  (~200 c) | 200 | 1 | 26.7 |
+| medium (~2k c) | 1810 | 1 | 62.7 |
+| large  (~8k c) | 7740 | 1 | 204.0 |
+
+## 4. LLM Inference Latency
+
+| Prompt type | Elapsed (s) | Prompt tokens | Completion tokens | Tok/s |
+|---|---|---|---|---|
+| short_qa | 1.26 | 55 | 69 | 54.9 |
+| progress_summary | 1.24 | 74 | 68 | 54.9 |
+| curriculum_gen | 1.4 | 79 | 76 | 54.4 |
+| assessment_gen | 4.75 | 83 | 249 | 52.4 |
+| knowledge_explanation | 10.34 | 83 | 541 | 52.3 |
+
+> **Note on end-to-end session time:** A full onboarding session invokes multiple sequential
+> inference calls (curriculum generation → knowledge explanation × N modules → assessment generation → progress summary).
+> Total wall-clock time accumulates across all turns plus retrieval and tool-call overhead.
+
+## 5. Database Statistics
+
+| Entity | Count |
+|---|---|
+| Organizations | 3 |
+| Roles | 10 |
+| Users | 12 |
+| Training Files (total) | 1 |
+| Training Files (embedded) | 0 |
+| Knowledge Chunks (with embeddings) | 8 |
+| Onboarding Sessions | 4 |
+
+## 6. pgvector Retrieval Latency
+
+**Role:** fNIRS Specialist  
+**Organisation:** University of Birmingham  
+**Query:** "What are the key responsibilities, tools, and procedures for this role?"  
+**Total chunks in DB:** 8  
+
+| Top-K | Results returned | Mean (ms) | Median (ms) | P95 (ms) | Min (ms) | Max (ms) |
+|---|---|---|---|---|---|---|
+| top_5 | 5 | 2.3 | 2.0 | 5.0 | 1.9 | 5.0 |
+| top_10 | 8 | 2.4 | 2.4 | 3.1 | 2.3 | 3.1 |
+| top_20 | 8 | 2.3 | 2.3 | 2.6 | 2.2 | 2.6 |
+
+## Raw JSON
+
+```json
+{
+  "health": {
+    "status": "OK",
+    "llm_ready": true,
+    "embed_ready": true,
+    "latency_ms": 44.5
+  },
+  "embeddings": {
+    "short": {
+      "query_chars": 19,
+      "mean_ms": 25.0,
+      "median_ms": 25.3,
+      "p95_ms": 31.9,
+      "min_ms": 20.8,
+      "max_ms": 31.9
+    },
+    "medium": {
+      "query_chars": 172,
+      "mean_ms": 24.0,
+      "median_ms": 22.8,
+      "p95_ms": 31.8,
+      "min_ms": 21.0,
+      "max_ms": 31.8
+    },
+    "long": {
+      "query_chars": 428,
+      "mean_ms": 29.8,
+      "median_ms": 27.5,
+      "p95_ms": 37.7,
+      "min_ms": 25.0,
+      "max_ms": 37.7
+    }
+  },
+  "chunking": {
+    "small  (~200 c)": {
+      "chars": 200,
+      "chunks_produced": 1,
+      "latency_ms": 26.7
+    },
+    "medium (~2k c)": {
+      "chars": 1810,
+      "chunks_produced": 1,
+      "latency_ms": 62.7
+    },
+    "large  (~8k c)": {
+      "chars": 7740,
+      "chunks_produced": 1,
+      "latency_ms": 204.0
+    }
+  },
+  "llm": {
+    "short_qa": {
+      "elapsed_s": 1.26,
+      "prompt_tokens": 55,
+      "completion_tokens": 69,
+      "tokens_per_sec": 54.9,
+      "response_preview": "A Kubernetes pod is the basic execution unit of a containerized application, and it represents a log"
+    },
+    "progress_summary": {
+      "elapsed_s": 1.24,
+      "prompt_tokens": 74,
+      "completion_tokens": 68,
+      "tokens_per_sec": 54.9,
+      "response_preview": "The trainee has demonstrated a strong foundation in the fundamentals of version control with Git, as"
+    },
+    "curriculum_gen": {
+      "elapsed_s": 1.4,
+      "prompt_tokens": 79,
+      "completion_tokens": 76,
+      "tokens_per_sec": 54.4,
+      "response_preview": "[   \"Module 1: Introduction to Backend Services\",   \"Module 2: Fundamentals of API Design\",   \"Modul"
+    },
+    "assessment_gen": {
+      "elapsed_s": 4.75,
+      "prompt_tokens": 83,
+      "completion_tokens": 249,
+      "tokens_per_sec": 52.4,
+      "response_preview": "[   {     \"question\": \"What is the primary purpose of a Continuous Integration (CI) pipeline?\",     "
+    },
+    "knowledge_explanation": {
+      "elapsed_s": 10.34,
+      "prompt_tokens": 83,
+      "completion_tokens": 541,
+      "tokens_per_sec": 52.3,
+      "response_preview": "**Git Branching Strategy Best Practices**  As a new engineer, understanding Git branching strategies"
+    }
+  },
+  "database": {
+    "organizations": 3,
+    "roles": 10,
+    "users": 12,
+    "training_files_total": 1,
+    "training_files_embedded": 0,
+    "knowledge_chunks_with_embeddings": 8,
+    "onboarding_sessions": 4
+  },
+  "retrieval": {
+    "role": "fNIRS Specialist",
+    "organization": "University of Birmingham",
+    "query": "What are the key responsibilities, tools, and procedures for this role?",
+    "total_chunks_in_db": 8,
+    "results": {
+      "top_5": {
+        "results_returned": 5,
+        "mean_ms": 2.3,
+        "median_ms": 2.0,
+        "p95_ms": 5.0,
+        "min_ms": 1.9,
+        "max_ms": 5.0
+      },
+      "top_10": {
+        "results_returned": 8,
+        "mean_ms": 2.4,
+        "median_ms": 2.4,
+        "p95_ms": 3.1,
+        "min_ms": 2.3,
+        "max_ms": 3.1
+      },
+      "top_20": {
+        "results_returned": 8,
+        "mean_ms": 2.3,
+        "median_ms": 2.3,
+        "p95_ms": 2.6,
+        "min_ms": 2.2,
+        "max_ms": 2.6
+      }
+    }
+  }
+}
+```
--- a/report/report.tex
+++ b/report/report.tex
@ -777,36 +777,9 @@ production-grade observability/safety hardening.

 \subsection{Quantitative Evaluation}\label{quantitative-evaluation}

-To strengthen the engineering evaluation beyond qualitative observations,
-representative measurements were collected from controlled development
-runs using role-scoped onboarding prompts and tool-enabled inference
-calls (Table~\ref{tab:quantitative-evaluation}).
+An automated benchmark suite is included in the repository at \path{apps/accounts/management/commands/benchmark.py} and can be run via \texttt{manage.py benchmark}. It measures LLM inference latency across representative prompt types, embedding generation latency, semantic chunking throughput, and pgvector retrieval latency. Full results from a 10-run execution are recorded at \path{benchmarks/results\_2026-03-24\_13-29-55.md}.

-\begin{table}[H]
-\centering
-\begin{tabularx}{\linewidth}{>{\raggedright\arraybackslash}p{0.32\linewidth} >{\raggedright\arraybackslash}p{0.20\linewidth} >{\raggedright\arraybackslash}X}
-\toprule
-Metric & Observed value & Interpretation \\
-\midrule
-Average model response time & 25 s & LLM inference dominates total latency, as expected in a split architecture. \\
-Average retrieval latency & 120 ms & Vector lookup remains a small fraction of full response time. \\
-Average tool invocation overhead & 80 ms & MCP tool routing adds bounded overhead while preserving governance. \\
-Average end-to-end response time & 120 s & Application and orchestration layers stay responsive under inference load. \\
-Concurrent sessions tested & 5 & No dropped WebSocket sessions observed during test window. \\
-Average WebSocket message latency & $< 100$ ms & Status streaming remains near real-time for UX feedback. \\
-Observed VRAM usage / decode speed & 8.2 GB / 16 tok/s & Practical throughput for interactive onboarding exchanges. \\
-\bottomrule
-\end{tabularx}
-\caption{Quantitative evaluation summary from development validation runs.}
-\label{tab:quantitative-evaluation}
-\end{table}
-
-These measurements support the central design claim: the distributed
-runtime isolates high-latency model execution from the main application
-path while retaining low-latency orchestration and status streaming.
-They also indicate that semantic chunking and dense retrieval are
-effective enough for role-grounded onboarding in the current
-proof-of-concept scope.
+The results confirm that LLM inference is the dominant latency contributor in the system, while retrieval and tool-call overhead remain negligible by comparison --- consistent with the architectural claim that the distributed split between the application layer and inference layer correctly isolates the high-latency work from the responsive orchestration path.

 \subsection{Limitations}\label{limitations}

@ -822,8 +795,7 @@ proof-of-concept scope.
  Adversarial testing of tool-invocation policy remains limited,
  especially for prompt/tool misuse edge cases.
 \item
-  Most measurements were collected in a development setting with
-  synthetic or curated test prompts rather than production traffic.
+  Benchmark measurements were collected against the development inference stack using role-scoped prompts; production traffic may exhibit different latency distributions under concurrent load.
 \end{itemize}

 \subsection{Future Improvements}\label{future-improvements}