From 4ae45ff647a8835e5b620b9256c88f9c836557fd Mon Sep 17 00:00:00 2001 From: Viswamedha Nalabotu Date: Tue, 24 Mar 2026 17:05:46 +0000 Subject: [PATCH] Added benchmark command and test data --- .../accounts/management/commands/benchmark.py | 484 ++++++++++++++++++ benchmarks/results_2026-03-24_13-28-54.md | 161 ++++++ benchmarks/results_2026-03-24_13-29-55.md | 203 ++++++++ report/report.tex | 34 +- 4 files changed, 851 insertions(+), 31 deletions(-) create mode 100644 apps/accounts/management/commands/benchmark.py create mode 100644 benchmarks/results_2026-03-24_13-28-54.md create mode 100644 benchmarks/results_2026-03-24_13-29-55.md diff --git a/apps/accounts/management/commands/benchmark.py b/apps/accounts/management/commands/benchmark.py new file mode 100644 index 0000000..a6e30cf --- /dev/null +++ b/apps/accounts/management/commands/benchmark.py @@ -0,0 +1,484 @@ +import datetime +import json +import statistics +import time +from pathlib import Path + +import httpx +from django.conf import settings +from django.core.management.base import BaseCommand +from django.db.models import Q +from pgvector.django import CosineDistance + +from apps.accounts.models import Organization, Role, User +from apps.knowledge.models import KnowledgeChunk, TrainingFile +from apps.onboarding.models import OnboardingSession + + +class Command(BaseCommand): + help = "Benchmark Dynavera system components: GPU inference server, pgvector retrieval, and database." 
+ + def add_arguments(self, parser): + parser.add_argument("--runs", type=int, default=5, help="Repetitions per latency benchmark (default: 5)") + parser.add_argument("--out", type=str, default="benchmarks", help="Output directory for the results file (default: benchmarks/)") + parser.add_argument("--skip-llm", action="store_true", help="Skip LLM inference benchmarks (each prompt takes ~30 s)") + + def handle(self, *args, **options): + self.runs = options["runs"] + self.skip_llm = options["skip_llm"] + self.out_dir = Path(options["out"]) + self.out_dir.mkdir(exist_ok=True) + self.results = {} + + self.stdout.write(self.style.SUCCESS("\n=== Dynavera System Benchmark ===")) + self.stdout.write(f" Inference endpoint : {settings.INFERENCE_URL}") + self.stdout.write(f" Repetitions : {self.runs}") + self.stdout.write(f" LLM benchmarks : {'SKIPPED (--skip-llm)' if self.skip_llm else 'ENABLED'}\n") + + self._bench_health() + self._bench_embeddings() + self._bench_chunking() + if not self.skip_llm: + self._bench_llm() + self._bench_database() + self._bench_retrieval() + self._print_summary() + self._save_report() + + def _req(self, method, path, **kwargs): + url = f"{settings.INFERENCE_URL}{path}" + resp = httpx.request(method, url, auth=settings.INFERENCE_AUTH, timeout=180, **kwargs) + resp.raise_for_status() + return resp.json() + + def _time_fn(self, fn): + t0 = time.perf_counter() + result = fn() + return result, (time.perf_counter() - t0) * 1000 + + def _stats(self, times_ms): + s = sorted(times_ms) + n = len(s) + p95_idx = min(n - 1, int(-(-(0.95 * n) // 1)) - 1) + return { + "mean_ms": round(statistics.mean(s), 1), + "median_ms": round(statistics.median(s), 1), + "p95_ms": round(s[p95_idx], 1), + "min_ms": round(s[0], 1), + "max_ms": round(s[-1], 1), + } + + def _bench_health(self): + self.stdout.write("[ 1/6 ] GPU server health check ...") + try: + data, ms = self._time_fn(lambda: self._req("GET", "/health")) + ok = data.get("status") == "ok" + 
self.results["health"] = { + "status": "OK" if ok else "DEGRADED", + "llm_ready": data.get("llm_ready", False), + "embed_ready": data.get("embedding_ready", False), + "latency_ms": round(ms, 1), + } + h = self.results["health"] + self.stdout.write( + f" {h['status']} | LLM: {'ready' if h['llm_ready'] else 'unloaded'} " + f"| Embed: {'ready' if h['embed_ready'] else 'not ready'} | {ms:.0f} ms" + ) + except Exception as exc: + self.results["health"] = {"status": "ERROR", "error": str(exc)} + self.stdout.write(self.style.ERROR(f" FAILED: {exc}")) + + def _bench_embeddings(self): + self.stdout.write(f"\n[ 2/6 ] Embedding latency ({self.runs} runs × 3 query lengths) ...") + queries = { + "short ": "What is onboarding?", + "medium ": ( + "Explain the process for configuring access control policies for a new software engineer " + "joining the platform team, including approval workflows and tool provisioning steps." + ), + "long ": ( + "A new hire on the infrastructure team needs to understand our CI/CD pipeline, deployment " + "procedures, incident response protocols, monitoring dashboards, on-call rotation policy, " + "and how to request access to production systems. Provide a comprehensive overview of all " + "these areas including the relevant tools, key contacts, and escalation procedures they " + "should be aware of during their first week and first month at the company." 
+ ), + } + embed_results = {} + for label, query in queries.items(): + times = [] + for _ in range(self.runs): + _, ms = self._time_fn(lambda q=query: self._req("POST", "/v1/embeddings", json={"input": q})) + times.append(ms) + st = self._stats(times) + embed_results[label.strip()] = {"query_chars": len(query), **st} + self.stdout.write( + f" {label}({len(query):4d} chars) mean={st['mean_ms']:.0f} ms " + f"p95={st['p95_ms']:.0f} ms min={st['min_ms']:.0f} ms max={st['max_ms']:.0f} ms" + ) + self.results["embeddings"] = embed_results + + def _bench_chunking(self): + self.stdout.write("\n[ 3/6 ] Semantic chunking latency ...") + texts = { + "small (~200 c)": "a " * 100, + "medium (~2k c) ": ( + "This section covers the onboarding process for new employees joining the engineering team. " + "You will learn about code review practices, deployment procedures, incident response, and " + "team communication protocols. Each topic is covered in depth with examples and references " + "to internal documentation. All engineers are expected to complete this module in week one. " + ) * 5, + "large (~8k c) ": ( + "The infrastructure team manages all cloud resources, CI/CD pipelines, and production environments. " + "New members are expected to understand Kubernetes cluster management, Terraform IaC, " + "GitLab CI pipeline authoring, monitoring with Grafana and Prometheus, and incident response procedures. " + "This document provides a comprehensive guide to each area including runbooks and escalation paths. 
" + ) * 20, + } + chunk_results = {} + for label, text in texts.items(): + try: + result, ms = self._time_fn(lambda t=text: self._req("POST", "/v1/semantic-chunk", json={"text": t})) + n = len(result.get("chunks", [])) + chunk_results[label.strip()] = {"chars": len(text), "chunks_produced": n, "latency_ms": round(ms, 1)} + self.stdout.write(f" {label} → {n} chunks | {ms:.0f} ms") + except Exception as exc: + chunk_results[label.strip()] = {"error": str(exc)} + self.stdout.write(self.style.ERROR(f" {label} FAILED: {exc}")) + self.results["chunking"] = chunk_results + + def _bench_llm(self): + self.stdout.write("\n[ 4/6 ] LLM inference latency (each prompt is a single non-streaming call) ...") + prompts = [ + { + "label": "short_qa", + "system": "You are an onboarding assistant.", + "user": "What does a Kubernetes pod do? Answer in 2 sentences.", + "max_tokens": 128, + }, + { + "label": "progress_summary", + "system": "You are an onboarding assistant.", + "user": ( + "A trainee has completed: Git Basics, CI/CD Pipelines, Code Review. Score: 85%. " + "Write a 2-sentence progress summary." + ), + "max_tokens": 128, + }, + { + "label": "curriculum_gen", + "system": "You are an onboarding assistant. Output only a valid JSON array of strings.", + "user": ( + "Create a 6-module onboarding curriculum for a Software Engineer role focused on " + "backend services. Output ONLY a JSON array of module title strings." + ), + "max_tokens": 256, + }, + { + "label": "assessment_gen", + "system": "You are an onboarding assistant. Output only valid JSON.", + "user": ( + "Generate 3 multiple-choice questions to assess understanding of CI/CD pipelines. " + "Output as a JSON array of objects with keys: question, options (array of 4), answer." + ), + "max_tokens": 512, + }, + { + "label": "knowledge_explanation", + "system": "You are an onboarding assistant.", + "user": ( + "Explain Git branching strategy best practices for a new engineer. 
" + "Cover: feature branches, naming conventions, merge vs rebase, and PR workflow. " + "Use clear headings and bullet points. Target ~400 words." + ), + "max_tokens": 700, + }, + ] + llm_results = {} + for p in prompts: + self.stdout.write(f" {p['label']} (max_tokens={p['max_tokens']}) ...", ending="") + self.stdout.flush() + try: + t0 = time.perf_counter() + data = self._req( + "POST", + "/v1/chat/completions", + json={ + "messages": [ + {"role": "system", "content": p["system"]}, + {"role": "user", "content": p["user"]}, + ], + "max_tokens": p["max_tokens"], + "stream": False, + }, + ) + elapsed_s = time.perf_counter() - t0 + usage = data.get("usage", {}) + ct = usage.get("completion_tokens", 0) + pt = usage.get("prompt_tokens", 0) + tps = round(ct / elapsed_s, 1) if elapsed_s > 0 and ct > 0 else 0 + preview = (data["choices"][0]["message"]["content"] or "")[:100].replace("\n", " ") + llm_results[p["label"]] = { + "elapsed_s": round(elapsed_s, 2), + "prompt_tokens": pt, + "completion_tokens": ct, + "tokens_per_sec": tps, + "response_preview": preview, + } + self.stdout.write(f" {elapsed_s:.1f} s | {ct} tokens | {tps} tok/s") + except Exception as exc: + llm_results[p["label"]] = {"error": str(exc)} + self.stdout.write(self.style.ERROR(f" FAILED: {exc}")) + self.results["llm"] = llm_results + + def _bench_database(self): + self.stdout.write("\n[ 5/6 ] Database statistics ...") + try: + from django.db import connection + with connection.cursor() as cur: + cur.execute("SELECT 1 FROM knowledge_knowledgechunk LIMIT 1") + except Exception: + self.stdout.write(self.style.WARNING(" Tables missing — run 'manage.py migrate' first. 
Skipping.")) + self.results["database"] = {"skipped": "Migrations not applied."} + return + try: + self.results["database"] = { + "organizations": Organization.objects.count(), + "roles": Role.objects.count(), + "users": User.objects.count(), + "training_files_total": TrainingFile.objects.count(), + "training_files_embedded": TrainingFile.objects.filter(status="embedded").count(), + "knowledge_chunks_with_embeddings": KnowledgeChunk.objects.filter(embedding__isnull=False, is_active=True).count(), + "onboarding_sessions": OnboardingSession.objects.count(), + } + d = self.results["database"] + self.stdout.write(f" Orgs: {d['organizations']} | Roles: {d['roles']} | Users: {d['users']}") + self.stdout.write(f" Training files: {d['training_files_total']} total ({d['training_files_embedded']} embedded)") + self.stdout.write(f" Knowledge chunks (with embeddings): {d['knowledge_chunks_with_embeddings']}") + self.stdout.write(f" Onboarding sessions: {d['onboarding_sessions']}") + except Exception as exc: + self.results["database"] = {"error": str(exc)} + self.stdout.write(self.style.ERROR(f" FAILED: {exc}")) + + def _bench_retrieval(self): + self.stdout.write(f"\n[ 6/6 ] pgvector retrieval latency ({self.runs} runs × top-k ∈ [5, 10, 20]) ...") + try: + role = Role.objects.filter(knowledge_chunks__embedding__isnull=False).distinct().first() + except Exception as exc: + self.stdout.write(self.style.WARNING(f" DB not ready ({exc}). Skipping.")) + self.results["retrieval"] = {"skipped": str(exc)} + return + if role is None: + self.stdout.write(self.style.WARNING(" No role with embedded chunks — skipping.")) + self.results["retrieval"] = {"skipped": "No embedded chunks found in database."} + return + + query = "What are the key responsibilities, tools, and procedures for this role?" 
+ self.stdout.write(f" Role: {role.name} (org: {role.organization.name})") + self.stdout.write(f" Query: \"{query}\"") + + try: + embed_data = self._req("POST", "/v1/embeddings", json={"input": query}) + query_vector = embed_data["data"][0]["embedding"] + except Exception as exc: + self.results["retrieval"] = {"error": f"Could not generate query embedding: {exc}"} + self.stdout.write(self.style.ERROR(f" FAILED to get embedding: {exc}")) + return + + total_chunks = KnowledgeChunk.objects.filter(embedding__isnull=False, is_active=True).count() + retrieval_results = {} + for top_k in [5, 10, 20]: + times = [] + n_returned = 0 + for _ in range(self.runs): + t0 = time.perf_counter() + chunks = list( + KnowledgeChunk.objects.filter( + organization=role.organization, + embedding__isnull=False, + is_active=True, + ).filter( + Q(role=role) | Q(role__isnull=True) + ).annotate( + distance=CosineDistance("embedding", query_vector) + ).order_by("distance")[:top_k] + ) + times.append((time.perf_counter() - t0) * 1000) + n_returned = len(chunks) + st = self._stats(times) + retrieval_results[f"top_{top_k}"] = {"results_returned": n_returned, **st} + self.stdout.write( + f" top-{top_k:2d}: mean={st['mean_ms']:.1f} ms " + f"p95={st['p95_ms']:.1f} ms min={st['min_ms']:.1f} ms max={st['max_ms']:.1f} ms" + ) + self.results["retrieval"] = { + "role": role.name, + "organization": role.organization.name, + "query": query, + "total_chunks_in_db": total_chunks, + "results": retrieval_results, + } + + def _print_summary(self): + self.stdout.write(self.style.SUCCESS("\n=== Summary ===\n")) + h = self.results.get("health", {}) + self.stdout.write(f" GPU Server : {h.get('status', 'N/A')} — LLM {'ready' if h.get('llm_ready') else 'unloaded'}, embed {'ready' if h.get('embed_ready') else 'N/A'}") + + emb = self.results.get("embeddings", {}) + means = [v["mean_ms"] for v in emb.values() if "mean_ms" in v] + if means: + self.stdout.write(f" Embedding : {min(means):.0f}–{max(means):.0f} ms (mean 
across query lengths)") + + chnk = self.results.get("chunking", {}) + lats = [v["latency_ms"] for v in chnk.values() if "latency_ms" in v] + if lats: + self.stdout.write(f" Chunking : {min(lats):.0f}–{max(lats):.0f} ms range by text size") + + llm = self.results.get("llm", {}) + elapsed = [v["elapsed_s"] for v in llm.values() if "elapsed_s" in v] + tps_all = [v["tokens_per_sec"] for v in llm.values() if "tokens_per_sec" in v and v["tokens_per_sec"] > 0] + if elapsed: + self.stdout.write( + f" LLM inference : {min(elapsed):.1f}–{max(elapsed):.1f} s range" + + (f" | {statistics.mean(tps_all):.1f} tok/s avg" if tps_all else "") + ) + + ret = self.results.get("retrieval", {}) + r5 = ret.get("results", {}).get("top_5", {}) + if r5.get("mean_ms"): + self.stdout.write(f" RAG retrieval : {r5['mean_ms']:.1f} ms mean (top-5, {ret.get('total_chunks_in_db', '?')} total chunks)") + + db = self.results.get("database", {}) + if "knowledge_chunks_with_embeddings" in db: + self.stdout.write( + f" Knowledge base : {db['knowledge_chunks_with_embeddings']} chunks from " + f"{db['training_files_embedded']} embedded files" + ) + + def _save_report(self): + ts = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + path = self.out_dir / f"results_{ts}.md" + + lines = [ + "# Dynavera Benchmark Results", + "", + f"**Date:** {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ", + f"**Inference endpoint:** `{settings.INFERENCE_URL}` ", + f"**Repetitions per benchmark:** {self.runs} ", + "", + ] + + h = self.results.get("health", {}) + lines += [ + "## 1. GPU Server Health", + "", + "| Field | Value |", + "|---|---|", + f"| Status | {h.get('status', 'N/A')} |", + f"| LLM Ready | {h.get('llm_ready', 'N/A')} |", + f"| Embed Ready | {h.get('embed_ready', 'N/A')} |", + f"| Health check RTT | {h.get('latency_ms', 'N/A')} ms |", + "", + ] + + emb = self.results.get("embeddings", {}) + if emb: + lines += [ + "## 2. 
Embedding Latency", + "", + "| Query type | Chars | Mean (ms) | Median (ms) | P95 (ms) | Min (ms) | Max (ms) |", + "|---|---|---|---|---|---|---|", + ] + for label, v in emb.items(): + if "mean_ms" in v: + lines.append(f"| {label} | {v['query_chars']} | {v['mean_ms']} | {v['median_ms']} | {v['p95_ms']} | {v['min_ms']} | {v['max_ms']} |") + lines.append("") + + chnk = self.results.get("chunking", {}) + if chnk: + lines += [ + "## 3. Semantic Chunking Latency", + "", + "| Input size | Chars | Chunks produced | Latency (ms) |", + "|---|---|---|---|", + ] + for label, v in chnk.items(): + if "latency_ms" in v: + lines.append(f"| {label} | {v['chars']} | {v['chunks_produced']} | {v['latency_ms']} |") + lines.append("") + + llm = self.results.get("llm", {}) + if llm: + lines += [ + "## 4. LLM Inference Latency", + "", + "| Prompt type | Elapsed (s) | Prompt tokens | Completion tokens | Tok/s |", + "|---|---|---|---|---|", + ] + for label, v in llm.items(): + if "elapsed_s" in v: + lines.append( + f"| {label} | {v['elapsed_s']} | {v['prompt_tokens']} | {v['completion_tokens']} | {v['tokens_per_sec']} |" + ) + else: + lines.append(f"| {label} | ERROR | — | — | — |") + lines.append("") + lines += [ + "> **Note on end-to-end session time:** A full onboarding session invokes multiple sequential", + "> inference calls (curriculum generation → knowledge explanation × N modules → assessment generation → progress summary).", + "> Total wall-clock time accumulates across all turns plus retrieval and tool-call overhead.", + "", + ] + + db = self.results.get("database", {}) + if db and "error" not in db: + lines += [ + "## 5. 
Database Statistics", + "", + "| Entity | Count |", + "|---|---|", + ] + labels = { + "organizations": "Organizations", + "roles": "Roles", + "users": "Users", + "training_files_total": "Training Files (total)", + "training_files_embedded": "Training Files (embedded)", + "knowledge_chunks_with_embeddings": "Knowledge Chunks (with embeddings)", + "onboarding_sessions": "Onboarding Sessions", + } + for key, label in labels.items(): + if key in db: + lines.append(f"| {label} | {db[key]} |") + lines.append("") + + ret = self.results.get("retrieval", {}) + if "results" in ret: + lines += [ + "## 6. pgvector Retrieval Latency", + "", + f"**Role:** {ret.get('role')} ", + f"**Organisation:** {ret.get('organization')} ", + f'**Query:** "{ret.get("query")}" ', + f"**Total chunks in DB:** {ret.get('total_chunks_in_db')} ", + "", + "| Top-K | Results returned | Mean (ms) | Median (ms) | P95 (ms) | Min (ms) | Max (ms) |", + "|---|---|---|---|---|---|---|", + ] + for k, v in ret["results"].items(): + lines.append( + f"| {k} | {v['results_returned']} | {v['mean_ms']} | {v['median_ms']} | {v['p95_ms']} | {v['min_ms']} | {v['max_ms']} |" + ) + lines.append("") + + lines += [ + "## Raw JSON", + "", + "```json", + json.dumps(self.results, indent=2, default=str), + "```", + "", + ] + + path.write_text("\n".join(lines), encoding="utf-8") + self.stdout.write(self.style.SUCCESS(f"\nResults saved → {path}")) diff --git a/benchmarks/results_2026-03-24_13-28-54.md b/benchmarks/results_2026-03-24_13-28-54.md new file mode 100644 index 0000000..28da70a --- /dev/null +++ b/benchmarks/results_2026-03-24_13-28-54.md @@ -0,0 +1,161 @@ +# Dynavera Benchmark Results + +**Date:** 2026-03-24 13:28:54 +**Inference endpoint:** `http://fyp-inference-dev:8001` +**Repetitions per benchmark:** 5 + +## 1. GPU Server Health + +| Field | Value | +|---|---| +| Status | OK | +| LLM Ready | True | +| Embed Ready | True | +| Health check RTT | 51.0 ms | + +## 2. 
Embedding Latency + +| Query type | Chars | Mean (ms) | Median (ms) | P95 (ms) | Min (ms) | Max (ms) | +|---|---|---|---|---|---|---| +| short | 19 | 95.5 | 25.1 | 378.6 | 23.0 | 378.6 | +| medium | 172 | 25.7 | 24.7 | 29.4 | 24.3 | 29.4 | +| long | 428 | 27.5 | 26.7 | 32.2 | 24.8 | 32.2 | + +## 3. Semantic Chunking Latency + +| Input size | Chars | Chunks produced | Latency (ms) | +|---|---|---|---| +| small (~200 c) | 200 | 1 | 28.4 | +| medium (~2k c) | 1810 | 1 | 77.0 | +| large (~8k c) | 7740 | 1 | 206.3 | + +## 4. LLM Inference Latency + +| Prompt type | Elapsed (s) | Prompt tokens | Completion tokens | Tok/s | +|---|---|---|---|---| +| short_qa | 1.5 | 55 | 69 | 46.0 | +| progress_summary | 1.36 | 74 | 71 | 52.3 | +| curriculum_gen | 1.67 | 79 | 82 | 49.0 | +| assessment_gen | 5.03 | 83 | 235 | 46.7 | +| knowledge_explanation | 9.31 | 83 | 496 | 53.3 | + +> **Note on end-to-end session time:** A full onboarding session invokes multiple sequential +> inference calls (curriculum generation → knowledge explanation × N modules → assessment generation → progress summary). +> Total wall-clock time accumulates across all turns plus retrieval and tool-call overhead. + +## 5. 
Database Statistics + +| Entity | Count | +|---|---| +| Organizations | 3 | +| Roles | 10 | +| Users | 12 | +| Training Files (total) | 0 | +| Training Files (embedded) | 0 | +| Knowledge Chunks (with embeddings) | 0 | +| Onboarding Sessions | 4 | + +## Raw JSON + +```json +{ + "health": { + "status": "OK", + "llm_ready": true, + "embed_ready": true, + "latency_ms": 51.0 + }, + "embeddings": { + "short": { + "query_chars": 19, + "mean_ms": 95.5, + "median_ms": 25.1, + "p95_ms": 378.6, + "min_ms": 23.0, + "max_ms": 378.6 + }, + "medium": { + "query_chars": 172, + "mean_ms": 25.7, + "median_ms": 24.7, + "p95_ms": 29.4, + "min_ms": 24.3, + "max_ms": 29.4 + }, + "long": { + "query_chars": 428, + "mean_ms": 27.5, + "median_ms": 26.7, + "p95_ms": 32.2, + "min_ms": 24.8, + "max_ms": 32.2 + } + }, + "chunking": { + "small (~200 c)": { + "chars": 200, + "chunks_produced": 1, + "latency_ms": 28.4 + }, + "medium (~2k c)": { + "chars": 1810, + "chunks_produced": 1, + "latency_ms": 77.0 + }, + "large (~8k c)": { + "chars": 7740, + "chunks_produced": 1, + "latency_ms": 206.3 + } + }, + "llm": { + "short_qa": { + "elapsed_s": 1.5, + "prompt_tokens": 55, + "completion_tokens": 69, + "tokens_per_sec": 46.0, + "response_preview": "A Kubernetes pod is a logical host for one or more containers, providing a shared network namespace," + }, + "progress_summary": { + "elapsed_s": 1.36, + "prompt_tokens": 74, + "completion_tokens": 71, + "tokens_per_sec": 52.3, + "response_preview": "The trainee has made significant progress in their onboarding journey, demonstrating a strong founda" + }, + "curriculum_gen": { + "elapsed_s": 1.67, + "prompt_tokens": 79, + "completion_tokens": 82, + "tokens_per_sec": 49.0, + "response_preview": "[ \"Module 1: Introduction to Backend Services and Infrastructure\", \"Module 2: Designing and Impl" + }, + "assessment_gen": { + "elapsed_s": 5.03, + "prompt_tokens": 83, + "completion_tokens": 235, + "tokens_per_sec": 46.7, + "response_preview": "```json [ { 
\"question\": \"What is the primary purpose of a Continuous Integration (CI) pipeline" + }, + "knowledge_explanation": { + "elapsed_s": 9.31, + "prompt_tokens": 83, + "completion_tokens": 496, + "tokens_per_sec": 53.3, + "response_preview": "**Git Branching Strategy Best Practices** As a new engineer, understanding a Git branching strategy" + } + }, + "database": { + "organizations": 3, + "roles": 10, + "users": 12, + "training_files_total": 0, + "training_files_embedded": 0, + "knowledge_chunks_with_embeddings": 0, + "onboarding_sessions": 4 + }, + "retrieval": { + "skipped": "No embedded chunks found in database." + } +} +``` diff --git a/benchmarks/results_2026-03-24_13-29-55.md b/benchmarks/results_2026-03-24_13-29-55.md new file mode 100644 index 0000000..7c057eb --- /dev/null +++ b/benchmarks/results_2026-03-24_13-29-55.md @@ -0,0 +1,203 @@ +# Dynavera Benchmark Results + +**Date:** 2026-03-24 13:29:55 +**Inference endpoint:** `http://fyp-inference-dev:8001` +**Repetitions per benchmark:** 10 + +## 1. GPU Server Health + +| Field | Value | +|---|---| +| Status | OK | +| LLM Ready | True | +| Embed Ready | True | +| Health check RTT | 44.5 ms | + +## 2. Embedding Latency + +| Query type | Chars | Mean (ms) | Median (ms) | P95 (ms) | Min (ms) | Max (ms) | +|---|---|---|---|---|---|---| +| short | 19 | 25.0 | 25.3 | 31.9 | 20.8 | 31.9 | +| medium | 172 | 24.0 | 22.8 | 31.8 | 21.0 | 31.8 | +| long | 428 | 29.8 | 27.5 | 37.7 | 25.0 | 37.7 | + +## 3. Semantic Chunking Latency + +| Input size | Chars | Chunks produced | Latency (ms) | +|---|---|---|---| +| small (~200 c) | 200 | 1 | 26.7 | +| medium (~2k c) | 1810 | 1 | 62.7 | +| large (~8k c) | 7740 | 1 | 204.0 | + +## 4. 
LLM Inference Latency + +| Prompt type | Elapsed (s) | Prompt tokens | Completion tokens | Tok/s | +|---|---|---|---|---| +| short_qa | 1.26 | 55 | 69 | 54.9 | +| progress_summary | 1.24 | 74 | 68 | 54.9 | +| curriculum_gen | 1.4 | 79 | 76 | 54.4 | +| assessment_gen | 4.75 | 83 | 249 | 52.4 | +| knowledge_explanation | 10.34 | 83 | 541 | 52.3 | + +> **Note on end-to-end session time:** A full onboarding session invokes multiple sequential +> inference calls (curriculum generation → knowledge explanation × N modules → assessment generation → progress summary). +> Total wall-clock time accumulates across all turns plus retrieval and tool-call overhead. + +## 5. Database Statistics + +| Entity | Count | +|---|---| +| Organizations | 3 | +| Roles | 10 | +| Users | 12 | +| Training Files (total) | 1 | +| Training Files (embedded) | 0 | +| Knowledge Chunks (with embeddings) | 8 | +| Onboarding Sessions | 4 | + +## 6. pgvector Retrieval Latency + +**Role:** fNIRS Specialist +**Organisation:** University of Birmingham +**Query:** "What are the key responsibilities, tools, and procedures for this role?" 
+**Total chunks in DB:** 8 + +| Top-K | Results returned | Mean (ms) | Median (ms) | P95 (ms) | Min (ms) | Max (ms) | +|---|---|---|---|---|---|---| +| top_5 | 5 | 2.3 | 2.0 | 5.0 | 1.9 | 5.0 | +| top_10 | 8 | 2.4 | 2.4 | 3.1 | 2.3 | 3.1 | +| top_20 | 8 | 2.3 | 2.3 | 2.6 | 2.2 | 2.6 | + +## Raw JSON + +```json +{ + "health": { + "status": "OK", + "llm_ready": true, + "embed_ready": true, + "latency_ms": 44.5 + }, + "embeddings": { + "short": { + "query_chars": 19, + "mean_ms": 25.0, + "median_ms": 25.3, + "p95_ms": 31.9, + "min_ms": 20.8, + "max_ms": 31.9 + }, + "medium": { + "query_chars": 172, + "mean_ms": 24.0, + "median_ms": 22.8, + "p95_ms": 31.8, + "min_ms": 21.0, + "max_ms": 31.8 + }, + "long": { + "query_chars": 428, + "mean_ms": 29.8, + "median_ms": 27.5, + "p95_ms": 37.7, + "min_ms": 25.0, + "max_ms": 37.7 + } + }, + "chunking": { + "small (~200 c)": { + "chars": 200, + "chunks_produced": 1, + "latency_ms": 26.7 + }, + "medium (~2k c)": { + "chars": 1810, + "chunks_produced": 1, + "latency_ms": 62.7 + }, + "large (~8k c)": { + "chars": 7740, + "chunks_produced": 1, + "latency_ms": 204.0 + } + }, + "llm": { + "short_qa": { + "elapsed_s": 1.26, + "prompt_tokens": 55, + "completion_tokens": 69, + "tokens_per_sec": 54.9, + "response_preview": "A Kubernetes pod is the basic execution unit of a containerized application, and it represents a log" + }, + "progress_summary": { + "elapsed_s": 1.24, + "prompt_tokens": 74, + "completion_tokens": 68, + "tokens_per_sec": 54.9, + "response_preview": "The trainee has demonstrated a strong foundation in the fundamentals of version control with Git, as" + }, + "curriculum_gen": { + "elapsed_s": 1.4, + "prompt_tokens": 79, + "completion_tokens": 76, + "tokens_per_sec": 54.4, + "response_preview": "[ \"Module 1: Introduction to Backend Services\", \"Module 2: Fundamentals of API Design\", \"Modul" + }, + "assessment_gen": { + "elapsed_s": 4.75, + "prompt_tokens": 83, + "completion_tokens": 249, + "tokens_per_sec": 52.4, + 
"response_preview": "[ { \"question\": \"What is the primary purpose of a Continuous Integration (CI) pipeline?\", " + }, + "knowledge_explanation": { + "elapsed_s": 10.34, + "prompt_tokens": 83, + "completion_tokens": 541, + "tokens_per_sec": 52.3, + "response_preview": "**Git Branching Strategy Best Practices** As a new engineer, understanding Git branching strategies" + } + }, + "database": { + "organizations": 3, + "roles": 10, + "users": 12, + "training_files_total": 1, + "training_files_embedded": 0, + "knowledge_chunks_with_embeddings": 8, + "onboarding_sessions": 4 + }, + "retrieval": { + "role": "fNIRS Specialist", + "organization": "University of Birmingham", + "query": "What are the key responsibilities, tools, and procedures for this role?", + "total_chunks_in_db": 8, + "results": { + "top_5": { + "results_returned": 5, + "mean_ms": 2.3, + "median_ms": 2.0, + "p95_ms": 5.0, + "min_ms": 1.9, + "max_ms": 5.0 + }, + "top_10": { + "results_returned": 8, + "mean_ms": 2.4, + "median_ms": 2.4, + "p95_ms": 3.1, + "min_ms": 2.3, + "max_ms": 3.1 + }, + "top_20": { + "results_returned": 8, + "mean_ms": 2.3, + "median_ms": 2.3, + "p95_ms": 2.6, + "min_ms": 2.2, + "max_ms": 2.6 + } + } + } +} +``` diff --git a/report/report.tex b/report/report.tex index 791a9bf..25fce30 100644 --- a/report/report.tex +++ b/report/report.tex @@ -777,36 +777,9 @@ production-grade observability/safety hardening. \subsection{Quantitative Evaluation}\label{quantitative-evaluation} -To strengthen the engineering evaluation beyond qualitative observations, -representative measurements were collected from controlled development -runs using role-scoped onboarding prompts and tool-enabled inference -calls (Table~\ref{tab:quantitative-evaluation}). +An automated benchmark suite is included in the repository at \path{apps/accounts/management/commands/benchmark.py} and can be run via \texttt{manage.py benchmark}. 
It measures LLM inference latency across representative prompt types, embedding generation latency, semantic chunking throughput, and pgvector retrieval latency. Full results from a 10-run execution are recorded at \path{benchmarks/results\_2026-03-24\_13-29-55.md}. -\begin{table}[H] -\centering -\begin{tabularx}{\linewidth}{>{\raggedright\arraybackslash}p{0.32\linewidth} >{\raggedright\arraybackslash}p{0.20\linewidth} >{\raggedright\arraybackslash}X} -\toprule -Metric & Observed value & Interpretation \\ -\midrule -Average model response time & 25 s & LLM inference dominates total latency, as expected in a split architecture. \\ -Average retrieval latency & 120 ms & Vector lookup remains a small fraction of full response time. \\ -Average tool invocation overhead & 80 ms & MCP tool routing adds bounded overhead while preserving governance. \\ -Average end-to-end response time & 120 s & Application and orchestration layers stay responsive under inference load. \\ -Concurrent sessions tested & 5 & No dropped WebSocket sessions observed during test window. \\ -Average WebSocket message latency & $< 100$ ms & Status streaming remains near real-time for UX feedback. \\ -Observed VRAM usage / decode speed & 8.2 GB / 16 tok/s & Practical throughput for interactive onboarding exchanges. \\ -\bottomrule -\end{tabularx} -\caption{Quantitative evaluation summary from development validation runs.} -\label{tab:quantitative-evaluation} -\end{table} - -These measurements support the central design claim: the distributed -runtime isolates high-latency model execution from the main application -path while retaining low-latency orchestration and status streaming. -They also indicate that semantic chunking and dense retrieval are -effective enough for role-grounded onboarding in the current -proof-of-concept scope. 
+The results confirm that LLM inference is the dominant latency contributor in the system (1.2--10.3\,s per call in the recorded run), while embedding generation (24--30\,ms mean) and pgvector retrieval (under 3\,ms mean) remain negligible by comparison --- consistent with the architectural claim that the distributed split between the application layer and inference layer correctly isolates the high-latency work from the responsive orchestration path. Tool-call overhead is not covered by the benchmark suite. \subsection{Limitations}\label{limitations} @@ -822,8 +795,7 @@ proof-of-concept scope. Adversarial testing of tool-invocation policy remains limited, especially for prompt/tool misuse edge cases. \item - Most measurements were collected in a development setting with - synthetic or curated test prompts rather than production traffic. + Benchmark measurements were collected against the development inference stack using role-scoped prompts; production traffic may exhibit different latency distributions under concurrent load. \end{itemize} \subsection{Future Improvements}\label{future-improvements}