Added HTTP Basic auth to the LLM inference API and disabled its docs endpoints

This commit is contained in:
Viswamedha Nalabotu 2026-03-22 08:19:57 +00:00
parent 7becac6619
commit f9073d53b6
4 changed files with 25 additions and 7 deletions

View file

@ -32,3 +32,5 @@ POSTGRES_PORT=5432
INFERENCE_PROTOCOL=http INFERENCE_PROTOCOL=http
INFERENCE_HOST=fyp-inference-dev INFERENCE_HOST=fyp-inference-dev
INFERENCE_PORT=8001 INFERENCE_PORT=8001
INFERENCE_USERNAME=admin
INFERENCE_PASSWORD=changeme

View file

@ -34,6 +34,8 @@ POSTGRES_PORT=5432
INFERENCE_PROTOCOL=http INFERENCE_PROTOCOL=http
INFERENCE_HOST=localhost INFERENCE_HOST=localhost
INFERENCE_PORT=8001 INFERENCE_PORT=8001
INFERENCE_USERNAME=admin
INFERENCE_PASSWORD=change_this_to_a_secure_password
# Production YAML (Ignore if you're setting up locally) # Production YAML (Ignore if you're setting up locally)
FYP_DJANGO_IMAGE=dynavera-django:prod FYP_DJANGO_IMAGE=dynavera-django:prod

View file

@ -28,7 +28,9 @@ DJANGO_CELERY_BROKER_URL = os.getenv('DJANGO_CELERY_BROKER_URL', 'redis://localh
INFERENCE_HOST = os.getenv('INFERENCE_HOST', 'localhost') INFERENCE_HOST = os.getenv('INFERENCE_HOST', 'localhost')
INFERENCE_PORT = os.getenv('INFERENCE_PORT', '8001') INFERENCE_PORT = os.getenv('INFERENCE_PORT', '8001')
INFERENCE_PROTOCOL = os.getenv('INFERENCE_PROTOCOL', 'http') INFERENCE_PROTOCOL = os.getenv('INFERENCE_PROTOCOL', 'http')
INFERENCE_URL = f"{INFERENCE_PROTOCOL}://{INFERENCE_HOST}:{INFERENCE_PORT}" INFERENCE_USERNAME = os.getenv('INFERENCE_USERNAME', 'admin')
# Password sent to the inference service via HTTP Basic auth. The fallback is
# a placeholder; set INFERENCE_PASSWORD in the environment for real deployments.
INFERENCE_PASSWORD = os.getenv('INFERENCE_PASSWORD', 'changeme')
# Local import: only needed to build the URL below.
from urllib.parse import quote
# Base URL with the credentials embedded in the userinfo component. Both parts
# are percent-encoded (safe='' also escapes '/') so that reserved characters
# such as '@', ':', '/', '#' or '?' in a credential cannot corrupt the URL.
# Plain alphanumeric values, including the defaults, are left unchanged.
INFERENCE_URL = (
    f"{INFERENCE_PROTOCOL}://"
    f"{quote(INFERENCE_USERNAME, safe='')}:{quote(INFERENCE_PASSWORD, safe='')}"
    f"@{INFERENCE_HOST}:{INFERENCE_PORT}"
)
INFERENCE_SEMANTIC_CHUNK_ENDPOINT = f"{INFERENCE_URL}/v1/semantic-chunk" INFERENCE_SEMANTIC_CHUNK_ENDPOINT = f"{INFERENCE_URL}/v1/semantic-chunk"
INFERENCE_EMBEDDINGS_ENDPOINT = f"{INFERENCE_URL}/v1/embeddings" INFERENCE_EMBEDDINGS_ENDPOINT = f"{INFERENCE_URL}/v1/embeddings"
INFERENCE_CHAT_COMPLETIONS_ENDPOINT = f"{INFERENCE_URL}/v1/chat/completions" INFERENCE_CHAT_COMPLETIONS_ENDPOINT = f"{INFERENCE_URL}/v1/chat/completions"

View file

@ -8,8 +8,10 @@ from typing import Dict, Any
import numpy as np import numpy as np
from torch import cuda, no_grad, Tensor from torch import cuda, no_grad, Tensor
import torch.nn.functional as F import torch.nn.functional as F
from fastapi import FastAPI, Request, HTTPException import secrets
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBasic, HTTPBasicCredentials
from llama_cpp import Llama from llama_cpp import Llama
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
@ -65,10 +67,20 @@ async def lifespan(app: FastAPI):
if cuda.is_available(): if cuda.is_available():
cuda.empty_cache() cuda.empty_cache()
app = FastAPI(title="Agentic GPU Node", lifespan=lifespan) app = FastAPI(title="Agentic GPU Node", lifespan=lifespan, docs_url=None, redoc_url=None, openapi_url=None)
# HTTP Basic scheme object; require_auth receives the parsed credentials
# from it through Depends(_security).
_security = HTTPBasic()
# Expected credentials, read from the environment by these module-level
# assignments.
# NOTE(review): when the variables are unset the service accepts the
# placeholder pair "admin" / "changeme" — confirm deployments always set
# INFERENCE_USERNAME and INFERENCE_PASSWORD.
_API_USER = os.getenv("INFERENCE_USERNAME", "admin")
_API_PASS = os.getenv("INFERENCE_PASSWORD", "changeme")
def require_auth(credentials: HTTPBasicCredentials = Depends(_security)):
    """Admit a request only when its HTTP Basic credentials match.

    The supplied username and password are compared against ``_API_USER``
    and ``_API_PASS`` with ``secrets.compare_digest``. Returns ``None``
    when both match.

    Raises:
        HTTPException: status 401 with a ``WWW-Authenticate: Basic``
            header when either value differs.
    """
    # Both comparisons are evaluated before either result is inspected, so
    # the password is always compared regardless of the username outcome.
    checks = (
        secrets.compare_digest(credentials.username.encode(), _API_USER.encode()),
        secrets.compare_digest(credentials.password.encode(), _API_PASS.encode()),
    )
    if all(checks):
        return
    raise HTTPException(
        status_code=401,
        detail="Unauthorized",
        headers={"WWW-Authenticate": "Basic"},
    )
@app.get("/health") @app.get("/health", dependencies=[Depends(require_auth)])
async def health(): async def health():
return { return {
"status": "ok", "status": "ok",
@ -85,7 +97,7 @@ def pad_and_normalize(embeddings: Tensor, target_dimensions: int) -> Tensor:
return F.normalize(embeddings, p=2, dim=1) return F.normalize(embeddings, p=2, dim=1)
@app.post("/v1/embeddings") @app.post("/v1/embeddings", dependencies=[Depends(require_auth)])
async def embeddings(request: Request): async def embeddings(request: Request):
data = await request.json() data = await request.json()
input_data = data.get("input", "") input_data = data.get("input", "")
@ -148,7 +160,7 @@ async def embeddings(request: Request):
}, },
} }
@app.post("/v1/semantic-chunk") @app.post("/v1/semantic-chunk", dependencies=[Depends(require_auth)])
async def semantic_chunk(request: Request): async def semantic_chunk(request: Request):
data = await request.json() data = await request.json()
raw_text = data.get("text", "") raw_text = data.get("text", "")
@ -208,7 +220,7 @@ async def semantic_chunk(request: Request):
result = await loop.run_in_executor(None, _chunk_and_embed) result = await loop.run_in_executor(None, _chunk_and_embed)
return result return result
@app.post("/v1/chat/completions") @app.post("/v1/chat/completions", dependencies=[Depends(require_auth)])
async def chat_completions(request: Request): async def chat_completions(request: Request):
try: try:
data = await request.json() data = await request.json()