# Dynavera/gpu_server.py

import asyncio
import logging
import os
import json
import time
from contextlib import asynccontextmanager
from typing import Dict, Any

import numpy as np
from torch import cuda, no_grad, Tensor
import torch.nn.functional as F
import secrets
from fastapi import FastAPI, Request, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPBasic, HTTPBasicCredentials
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer

# Process-wide logging setup: INFO level, timestamped records tagged with the logger name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)
logger = logging.getLogger("gpu-node")

# Identifier of the SentenceTransformer embedding model loaded at startup.
EMBED_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
# Path of the GGUF file handed to llama.cpp; overridable via the LLM_MODEL_PATH env var.
LLM_MODEL_PATH = os.getenv("LLM_MODEL_PATH", "/app/models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf")
# Idle time, in seconds, after which the inactivity watcher unloads the LLM
# (compared against time.monotonic() deltas).
LLM_IDLE_TIMEOUT = int(os.getenv("LLM_IDLE_TIMEOUT", "1800"))  
# Width every returned embedding is padded or truncated to before normalisation.
TARGET_DIMENSIONS = 768

# Shared mutable registry. Keys written in this file: "embed_model", "llm",
# and "llm_last_used" (a time.monotonic() timestamp).
state: Dict[str, Any] = {}
# Single-permit semaphore held around model inference and LLM unloading so
# that only one of those operations runs at a time.
gpu_semaphore = asyncio.Semaphore(1)


def _load_llm() -> Llama:
    """Construct and return a ``Llama`` instance for the file at ``LLM_MODEL_PATH``.

    This call is synchronous; callers that run on the event loop are expected
    to dispatch it to an executor.
    """
    logger.info(f"Loading LLM: {LLM_MODEL_PATH}")
    llama_options = {
        "model_path": LLM_MODEL_PATH,
        "n_gpu_layers": -1,
        "n_ctx": 8192,
        "n_batch": 512,
        "verbose": False,
    }
    return Llama(**llama_options)

def _unload_llm():
    """Remove the LLM from ``state`` and ask torch to release cached CUDA memory.

    Safe to call when no LLM is registered: the ``pop`` simply finds nothing.
    """
    # The popped reference is discarded right away so this function does not
    # keep the model alive.
    state.pop("llm", None)
    cuda_present = cuda.is_available()
    if cuda_present:
        cuda.empty_cache()
    logger.info("LLM unloaded due to inactivity")

async def _inactivity_watcher():
    """Background loop that unloads the LLM once it has been idle too long.

    Every 60 seconds the watcher checks whether an LLM is loaded and whether
    more than ``LLM_IDLE_TIMEOUT`` seconds have passed since
    ``state["llm_last_used"]``. If so, it takes ``gpu_semaphore`` and calls
    ``_unload_llm``.

    The idle condition is evaluated twice: once cheaply before waiting for the
    semaphore, and again after acquiring it. Waiting for the semaphore can take
    as long as an in-flight inference, during which a new request may have
    refreshed the last-used timestamp; the second check prevents unloading a
    model that just became active again.

    Runs until cancelled. Unexpected errors are logged and the loop continues,
    so a single failure does not silently disable idle unloading.
    """

    def _llm_is_idle() -> bool:
        # True only when a model is loaded, has a timestamp, and the timestamp is stale.
        last_used = state.get("llm_last_used")
        return (
            state.get("llm") is not None
            and last_used is not None
            and time.monotonic() - last_used > LLM_IDLE_TIMEOUT
        )

    while True:
        await asyncio.sleep(60)
        try:
            if not _llm_is_idle():
                continue
            async with gpu_semaphore:
                # Re-check now that no inference is running.
                if _llm_is_idle():
                    _unload_llm()
        except Exception:
            # asyncio.CancelledError is not an Exception subclass on Python 3.8+,
            # so cancellation still propagates and stops the task.
            logger.exception("Inactivity watcher iteration failed")

def _touch_llm():
    """Stamp ``state["llm_last_used"]`` with the current monotonic clock reading."""
    now = time.monotonic()
    state["llm_last_used"] = now

async def _ensure_llm() -> Llama:
    """Return the loaded LLM, loading it in a worker thread if necessary.

    Loading is serialised through an ``asyncio.Lock`` stored in ``state`` under
    ``"llm_load_lock"``. Without it, several requests arriving while no model
    is loaded would each start their own load and hold multiple copies of the
    model in memory at once. After acquiring the lock the cache is checked
    again, so only the first waiter actually loads.

    The last-used timestamp is refreshed on every call.

    Returns:
        The ``Llama`` instance registered in ``state["llm"]``.

    Raises:
        HTTPException: 503 when the model needs loading but ``LLM_MODEL_PATH``
            does not exist.
    """
    llm = state.get("llm")
    if llm is None:
        # setdefault has no await point, so all concurrent callers on the
        # event loop end up sharing the same lock object.
        load_lock = state.setdefault("llm_load_lock", asyncio.Lock())
        async with load_lock:
            # Another request may have finished loading while we waited.
            llm = state.get("llm")
            if llm is None:
                if not os.path.exists(LLM_MODEL_PATH):
                    raise HTTPException(status_code=503, detail="LLM model file not found.")
                loop = asyncio.get_running_loop()
                llm = await loop.run_in_executor(None, _load_llm)
                state["llm"] = llm
    _touch_llm()
    return llm

@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan handler: load models on startup, release them on shutdown.

    Startup:
        * picks ``"cuda"`` when torch reports it available, otherwise ``"cpu"``;
        * loads the embedding model into ``state["embed_model"]``;
        * loads the LLM into ``state["llm"]`` if ``LLM_MODEL_PATH`` exists,
          otherwise defers loading to the first chat request;
        * starts the inactivity watcher task.

    Shutdown (always executed, even if the application body raises):
        * cancels the watcher and waits for it to finish;
        * clears ``state`` and empties the CUDA cache when CUDA is available.

    Raises:
        Exception: any error raised while loading models is logged with its
            traceback and re-raised, which aborts application startup.
    """
    device = "cuda" if cuda.is_available() else "cpu"
    logger.info(f"--- Initializing GPU Node on {device} ---")

    if device == "cpu":
        logger.warning("CUDA NOT DETECTED. Performance will be severely degraded.")

    try:
        logger.info(f"Loading Embedding Model: {EMBED_MODEL_NAME}")
        state["embed_model"] = SentenceTransformer(
            EMBED_MODEL_NAME,
            trust_remote_code=True,
            device=device
        )

        if not os.path.exists(LLM_MODEL_PATH):
            logger.warning(f"LLM file not found at {LLM_MODEL_PATH} — will load on first request")
        else:
            state["llm"] = _load_llm()
            _touch_llm()

        logger.info(f"--- GPU Node Ready (LLM idle timeout: {LLM_IDLE_TIMEOUT}s) ---")
    except Exception as e:
        logger.exception(f"Failed to load models: {e}")
        # Bare raise keeps the original traceback intact.
        raise

    watcher = asyncio.create_task(_inactivity_watcher())

    try:
        yield
    finally:
        watcher.cancel()
        # Wait for the task to actually stop so it cannot touch `state`
        # after it has been cleared.
        try:
            await watcher
        except asyncio.CancelledError:
            pass
        except Exception:
            logger.exception("Inactivity watcher terminated with an error")
        state.clear()
        if cuda.is_available():
            cuda.empty_cache()

# Application object. The interactive docs, ReDoc and the OpenAPI schema
# endpoints are all disabled by passing None for their URLs.
app = FastAPI(title="Agentic GPU Node", lifespan=lifespan, docs_url=None, redoc_url=None, openapi_url=None)

# HTTP Basic credentials checked by `require_auth`; both values come from the
# environment and fall back to built-in defaults when unset.
_security = HTTPBasic()
_API_USER = os.getenv("INFERENCE_USERNAME", "admin")
_API_PASS = os.getenv("INFERENCE_PASSWORD", "changeme")

# The fallback password is hard-coded in this file, so it offers no protection
# by itself. Make that visible in the logs instead of failing silently.
if os.getenv("INFERENCE_PASSWORD") is None:
    logger.warning(
        "INFERENCE_PASSWORD is not set; using the built-in default password. "
        "Set INFERENCE_USERNAME and INFERENCE_PASSWORD before exposing this service."
    )

def require_auth(credentials: HTTPBasicCredentials = Depends(_security)):
    """Dependency that rejects requests whose Basic credentials do not match.

    Raises:
        HTTPException: 401 with a ``WWW-Authenticate: Basic`` header when the
            username or the password differs from the configured values.
    """
    supplied = (credentials.username.encode(), credentials.password.encode())
    expected = (_API_USER.encode(), _API_PASS.encode())
    # Both fields are always compared (list, not a short-circuiting generator),
    # each with secrets.compare_digest.
    outcomes = [secrets.compare_digest(got, want) for got, want in zip(supplied, expected)]
    if all(outcomes):
        return
    raise HTTPException(status_code=401, detail="Unauthorized", headers={"WWW-Authenticate": "Basic"})


@app.get("/health", dependencies=[Depends(require_auth)])
async def health():
    """Report liveness plus whether each model is currently present in ``state``."""
    embed_loaded = state.get("embed_model") is not None
    llm_loaded = state.get("llm") is not None
    return {"status": "ok", "embedding_ready": embed_loaded, "llm_ready": llm_loaded}

def pad_and_normalize(embeddings: Tensor, target_dimensions: int) -> Tensor:
    """Resize embeddings to ``target_dimensions`` and L2-normalise them.

    The operation is applied along the last axis, so both a batch of shape
    ``(n, dim)`` and a single vector of shape ``(dim,)`` are accepted.

    Args:
        embeddings: Tensor whose last axis holds the embedding components.
        target_dimensions: Desired size of the last axis. Narrower inputs are
            right-padded with zeros; wider inputs are truncated.

    Returns:
        A tensor with the same leading shape, last axis of size
        ``target_dimensions``, normalised to unit L2 norm along that axis.
    """
    curr_dim = embeddings.shape[-1]
    if curr_dim < target_dimensions:
        # A 2-tuple pad spec pads only the last axis: (left, right).
        embeddings = F.pad(embeddings, (0, target_dimensions - curr_dim), "constant", 0)
    elif curr_dim > target_dimensions:
        embeddings = embeddings[..., :target_dimensions]
    # Normalise after resizing so truncated vectors are unit length too.
    return F.normalize(embeddings, p=2, dim=-1)


@app.post("/v1/embeddings", dependencies=[Depends(require_auth)])
async def embeddings(request: Request):
    """Embed one string or a list of strings and return an embeddings list payload.

    Request body (JSON object):
        input: a string, or a list whose items are converted with ``str``.
            List items that are blank after stripping are dropped, so the
            returned ``index`` values refer to the filtered list.

    Texts that do not already start with ``"search_"`` are prefixed with
    ``"search_query: "`` before encoding. Vectors are padded or truncated to
    ``TARGET_DIMENSIONS`` and L2-normalised.

    The ``usage`` figures are whitespace-separated word counts of the inputs,
    not tokenizer token counts.

    Raises:
        HTTPException: 400 for a body that is not valid JSON, is not a JSON
            object, or whose ``input`` is neither a string nor a list;
            503 when the embedding model is not loaded.
    """
    try:
        data = await request.json()
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        logger.warning("/v1/embeddings invalid JSON payload: %s", exc)
        raise HTTPException(status_code=400, detail="Invalid JSON payload") from exc
    if not isinstance(data, dict):
        logger.warning("/v1/embeddings payload is not a JSON object: %s", type(data).__name__)
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    input_data = data.get("input", "")
    input_kind = type(input_data).__name__
    input_count = len(input_data) if isinstance(input_data, list) else (1 if isinstance(input_data, str) else 0)
    logger.info("/v1/embeddings request received: input_kind=%s input_count=%s", input_kind, input_count)
    logger.info("/v1/embeddings using target_dimensions=%s", TARGET_DIMENSIONS)

    if isinstance(input_data, str):
        inputs = [input_data]
    elif isinstance(input_data, list):
        inputs = [str(item) for item in input_data if str(item).strip()]
    else:
        logger.warning("/v1/embeddings bad input type: %s", input_kind)
        raise HTTPException(status_code=400, detail="'input' must be a string or list of strings")

    if not inputs:
        return {
            "object": "list",
            "data": [],
            "model": EMBED_MODEL_NAME,
            "usage": {"prompt_tokens": 0, "total_tokens": 0},
        }

    model = state.get("embed_model")
    if model is None:
        raise HTTPException(status_code=503, detail="Embedding model not initialized")

    prefixed_inputs = [
        text if text.startswith("search_") else f"search_query: {text}"
        for text in inputs
    ]

    loop = asyncio.get_running_loop()

    def _encode():
        with no_grad():
            vectors = model.encode(prefixed_inputs, convert_to_tensor=True)
            vectors = pad_and_normalize(vectors, target_dimensions=TARGET_DIMENSIONS)
        # Copy to host and convert to plain lists in the worker thread so the
        # event loop is not blocked by the transfer.
        return vectors.cpu().tolist()

    async with gpu_semaphore:
        vector_list = await loop.run_in_executor(None, _encode)

    word_count = sum(len(text.split()) for text in inputs)

    return {
        "object": "list",
        "data": [
            {
                "object": "embedding",
                "index": idx,
                "embedding": embedding,
            }
            for idx, embedding in enumerate(vector_list)
        ],
        "model": EMBED_MODEL_NAME,
        "usage": {
            "prompt_tokens": word_count,
            "total_tokens": word_count,
        },
    }

@app.post("/v1/semantic-chunk", dependencies=[Depends(require_auth)])
async def semantic_chunk(request: Request):
    """Split text into semantically coherent chunks and embed each chunk.

    Request body (JSON object):
        text: the text to chunk (at most 50000 characters).
        threshold: percentile in [0, 100] (default 95). A chunk boundary is
            placed after every sentence whose cosine distance to the next
            sentence is strictly above this percentile of all adjacent
            distances.

    Sentences are obtained by replacing newlines with spaces and splitting on
    ``". "``. Text with fewer than two sentences is returned as a single chunk.
    Chunk embeddings are computed with a ``"search_document: "`` prefix, then
    padded or truncated to ``TARGET_DIMENSIONS`` and L2-normalised.

    Returns:
        ``{"chunks": [...], "embeddings": [...]}`` with one embedding per chunk;
        both lists are empty when ``text`` is missing or empty.

    Raises:
        HTTPException: 400 for invalid JSON, a non-object body, a non-string
            ``text`` or an invalid ``threshold``; 413 when ``text`` is too
            long; 503 when the embedding model is not loaded.
    """
    try:
        data = await request.json()
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        logger.warning("/v1/semantic-chunk invalid JSON payload: %s", exc)
        raise HTTPException(status_code=400, detail="Invalid JSON payload") from exc
    if not isinstance(data, dict):
        logger.warning("/v1/semantic-chunk payload is not a JSON object: %s", type(data).__name__)
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    raw_text = data.get("text", "")
    threshold_percentile = data.get("threshold", 95)
    raw_text_len = len(raw_text) if isinstance(raw_text, str) else -1
    logger.info("/v1/semantic-chunk request received: text_len=%s threshold=%s", raw_text_len, threshold_percentile,)
    logger.info("/v1/semantic-chunk using target_dimensions=%s", TARGET_DIMENSIONS)

    if not raw_text:
        logger.info("/v1/semantic-chunk empty text payload")
        return {"chunks": [], "embeddings": []}

    if not isinstance(raw_text, str):
        logger.warning("/v1/semantic-chunk bad text type: %s", type(raw_text).__name__)
        raise HTTPException(status_code=400, detail="'text' must be a string")

    if len(raw_text) > 50000:
        logger.warning("/v1/semantic-chunk payload too large: text_len=%s", len(raw_text))
        raise HTTPException(status_code=413, detail="Text block too large. Please batch on the client.")

    model = state.get("embed_model")
    if model is None:
        logger.error("/v1/semantic-chunk embedding model not initialized")
        raise HTTPException(status_code=503, detail="Embedding model not initialized")

    loop = asyncio.get_running_loop()
    sentences = [s.strip() for s in raw_text.replace('\n', ' ').split('. ') if s.strip()]

    # The threshold is only consulted when there are at least two sentences,
    # so it is only validated in that case.
    if len(sentences) >= 2:
        threshold_ok = (
            isinstance(threshold_percentile, (int, float))
            and 0 <= threshold_percentile <= 100
        )
        if not threshold_ok:
            logger.warning("/v1/semantic-chunk bad threshold: %r", threshold_percentile)
            raise HTTPException(status_code=400, detail="'threshold' must be a number between 0 and 100")

    def _join_sentences(parts):
        # Splitting on ". " leaves the final sentence with its own terminator,
        # so only append a period when the chunk does not already end with one.
        text = ". ".join(parts)
        return text if text.endswith((".", "!", "?")) else text + "."

    def _chunk_and_embed():
        if len(sentences) < 2:
            with no_grad():
                single = model.encode([f"search_document: {raw_text}"], convert_to_tensor=True)
                single = pad_and_normalize(single, target_dimensions=TARGET_DIMENSIONS)
            return {"chunks": [raw_text], "embeddings": single.cpu().tolist()}

        with no_grad():
            s_embeddings = model.encode(sentences, convert_to_tensor=True)
            # One batched call compares every sentence with its successor,
            # avoiding a device synchronisation per pair.
            similarities = F.cosine_similarity(s_embeddings[:-1], s_embeddings[1:], dim=1)
        distances = 1.0 - similarities.double().cpu().numpy()

        breakpoint_threshold = np.percentile(distances, threshold_percentile)
        indices = np.flatnonzero(distances > breakpoint_threshold).tolist()

        chunks = []
        start = 0
        for idx in indices:
            chunks.append(_join_sentences(sentences[start : idx + 1]))
            start = idx + 1
        # The largest possible break index is len(sentences) - 2, so this
        # final slice always contains at least one sentence.
        chunks.append(_join_sentences(sentences[start:]))

        with no_grad():
            final_embeddings = model.encode(
                [f"search_document: {c}" for c in chunks],
                convert_to_tensor=True
            )
            final_embeddings = pad_and_normalize(final_embeddings, target_dimensions=TARGET_DIMENSIONS)

        return {"chunks": chunks, "embeddings": final_embeddings.cpu().tolist()}

    async with gpu_semaphore:
        result = await loop.run_in_executor(None, _chunk_and_embed)
    return result

@app.post("/v1/chat/completions", dependencies=[Depends(require_auth)])
async def chat_completions(request: Request):
    """Run a chat completion on the local LLM.

    Request body (JSON object):
        messages: list of chat messages passed to ``create_chat_completion``.
        stream: when truthy, respond with ``text/event-stream``.
        temperature: sampling temperature (default 0.7).
        max_tokens: generation limit (default 1024).

    Inference runs in a worker thread while ``gpu_semaphore`` is held, and the
    LLM last-used timestamp is refreshed again once inference finishes so that
    generation time does not count as idle time.

    Note on streaming: the completion is generated in full before the first
    event is sent; the chunks are then emitted as ``data: ...`` events followed
    by ``data: [DONE]``. If inference fails on the streaming path the response
    has already started, so the failure is reported as a single
    ``data: {"error": ...}`` event and the stream ends without ``[DONE]``.

    Raises:
        HTTPException: 400 for invalid JSON, a non-object body or a
            ``messages`` value that is not a list; 503 when the model file is
            missing (raised by ``_ensure_llm``); 500 when non-streaming
            inference fails.
    """
    try:
        data = await request.json()
    except Exception as e:
        raw_body = await request.body()
        preview = raw_body[:500].decode("utf-8", errors="replace")
        logger.error(f"Invalid JSON payload for chat completions: {e}; body_preview={preview}")
        raise HTTPException(status_code=400, detail="Invalid JSON payload") from e

    if not isinstance(data, dict):
        logger.error(f"Chat completion payload is not a JSON object: {type(data).__name__}")
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    messages = data.get("messages", [])
    if not isinstance(messages, list):
        logger.error(f"Chat completion 'messages' has bad type: {type(messages).__name__}")
        raise HTTPException(status_code=400, detail="'messages' must be a list")
    stream = data.get("stream", False)

    logger.info(f"Chat completion request: {len(messages)} messages, stream={stream}")

    llm = await _ensure_llm()

    loop = asyncio.get_running_loop()
    temperature = data.get("temperature", 0.7)
    max_tokens = data.get("max_tokens", 1024)

    def _infer(as_stream: bool):
        # Shared by both response modes; only the `stream` flag differs.
        result = llm.create_chat_completion(
            messages=messages,
            stream=as_stream,
            temperature=temperature,
            max_tokens=max_tokens,
            stop=["<|eot_id|>", "<|end_of_text|>"],
        )
        # In streaming mode the call returns an iterator; drain it here so all
        # generation work happens in the worker thread.
        return list(result) if as_stream else result

    if stream:
        async def _stream_response():
            # This generator runs after the handler has returned, so it needs
            # its own error handling.
            try:
                async with gpu_semaphore:
                    chunks = await loop.run_in_executor(None, _infer, True)
                    _touch_llm()
            except Exception as e:
                logger.error(f"Inference error: {e}")
                error_event = {"error": {"message": str(e), "type": "server_error"}}
                yield f"data: {json.dumps(error_event)}\n\n"
                return
            for chunk in chunks:
                yield f"data: {json.dumps(chunk)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(_stream_response(), media_type="text/event-stream")

    try:
        async with gpu_semaphore:
            response = await loop.run_in_executor(None, _infer, False)
            _touch_llm()
    except Exception as e:
        logger.error(f"Inference error: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    return response


if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn, imported lazily so that
    # importing this module does not require it.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8001, "reload": True}
    uvicorn.run("gpu_server:app", **server_options)
Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`import asyncio`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`import logging`
			`import os`
			`import json`
Added VRAM release mechanism 2026-03-22 15:34:06 +00:00			`import time`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`from contextlib import asynccontextmanager`
			`from typing import Dict, Any`

			`import numpy as np`
Fixed formatting and output 2026-03-11 14:37:50 +00:00			`from torch import cuda, no_grad, Tensor`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`import torch.nn.functional as F`
Added llm api basic auth and disabled docs 2026-03-22 08:19:57 +00:00			`import secrets`
			`from fastapi import FastAPI, Request, HTTPException, Depends`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`from fastapi.responses import StreamingResponse`
Added llm api basic auth and disabled docs 2026-03-22 08:19:57 +00:00			`from fastapi.security import HTTPBasic, HTTPBasicCredentials`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`from llama_cpp import Llama`
			`from sentence_transformers import SentenceTransformer`

			`logging.basicConfig(`
			`level=logging.INFO,`
			`format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"`
			`)`
			`logger = logging.getLogger("gpu-node")`

			`EMBED_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"`
			`LLM_MODEL_PATH = os.getenv("LLM_MODEL_PATH", "/app/models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf")`
Added VRAM release mechanism 2026-03-22 15:34:06 +00:00			`LLM_IDLE_TIMEOUT = int(os.getenv("LLM_IDLE_TIMEOUT", "1800"))`
Hardcoded dimensions 2026-03-11 21:33:17 +00:00			`TARGET_DIMENSIONS = 768`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
			`state: Dict[str, Any] = {}`
Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`gpu_semaphore = asyncio.Semaphore(1)`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
Added VRAM release mechanism 2026-03-22 15:34:06 +00:00
			`def _load_llm() -> Llama:`
			`logger.info(f"Loading LLM: {LLM_MODEL_PATH}")`
			`return Llama(model_path=LLM_MODEL_PATH, n_gpu_layers=-1, n_ctx=8192, n_batch=512, verbose=False)`

			`def _unload_llm():`
			`llm = state.pop("llm", None)`
			`del llm`
			`if cuda.is_available():`
			`cuda.empty_cache()`
			`logger.info("LLM unloaded due to inactivity")`

			`async def _inactivity_watcher():`
			`while True:`
			`await asyncio.sleep(60)`
			`llm = state.get("llm")`
			`last_used = state.get("llm_last_used")`
			`if llm is not None and last_used is not None:`
			`if time.monotonic() - last_used > LLM_IDLE_TIMEOUT:`
			`async with gpu_semaphore:`
			`_unload_llm()`

			`def _touch_llm():`
			`state["llm_last_used"] = time.monotonic()`

			`async def _ensure_llm() -> Llama:`
			`llm = state.get("llm")`
			`if llm is None:`
			`if not os.path.exists(LLM_MODEL_PATH):`
			`raise HTTPException(status_code=503, detail="LLM model file not found.")`
			`loop = asyncio.get_event_loop()`
			`state["llm"] = await loop.run_in_executor(None, _load_llm)`
			`_touch_llm()`
			`return state["llm"]`

Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`@asynccontextmanager`
			`async def lifespan(app: FastAPI):`
Fixed formatting and output 2026-03-11 14:37:50 +00:00			`device = "cuda" if cuda.is_available() else "cpu"`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`logger.info(f"--- Initializing GPU Node on {device} ---")`

			`if device == "cpu":`
			`logger.warning("CUDA NOT DETECTED. Performance will be severely degraded.")`

			`try:`
			`logger.info(f"Loading Embedding Model: {EMBED_MODEL_NAME}")`
			`state["embed_model"] = SentenceTransformer(`
			`EMBED_MODEL_NAME,`
			`trust_remote_code=True,`
			`device=device`
			`)`

			`if not os.path.exists(LLM_MODEL_PATH):`
Added VRAM release mechanism 2026-03-22 15:34:06 +00:00			`logger.warning(f"LLM file not found at {LLM_MODEL_PATH} — will load on first request")`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`else:`
Added VRAM release mechanism 2026-03-22 15:34:06 +00:00			`state["llm"] = _load_llm()`
			`_touch_llm()`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
Added VRAM release mechanism 2026-03-22 15:34:06 +00:00			`logger.info(f"--- GPU Node Ready (LLM idle timeout: {LLM_IDLE_TIMEOUT}s) ---")`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`except Exception as e:`
			`logger.error(f"Failed to load models: {e}")`
			`raise e`

Added VRAM release mechanism 2026-03-22 15:34:06 +00:00			`watcher = asyncio.create_task(_inactivity_watcher())`

Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`yield`

Added VRAM release mechanism 2026-03-22 15:34:06 +00:00			`watcher.cancel()`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`state.clear()`
Fixed formatting and output 2026-03-11 14:37:50 +00:00			`if cuda.is_available():`
			`cuda.empty_cache()`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
Added llm api basic auth and disabled docs 2026-03-22 08:19:57 +00:00			`app = FastAPI(title="Agentic GPU Node", lifespan=lifespan, docs_url=None, redoc_url=None, openapi_url=None)`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
Added llm api basic auth and disabled docs 2026-03-22 08:19:57 +00:00			`_security = HTTPBasic()`
			`_API_USER = os.getenv("INFERENCE_USERNAME", "admin")`
			`_API_PASS = os.getenv("INFERENCE_PASSWORD", "changeme")`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
Added llm api basic auth and disabled docs 2026-03-22 08:19:57 +00:00			`def require_auth(credentials: HTTPBasicCredentials = Depends(_security)):`
			`valid_user = secrets.compare_digest(credentials.username.encode(), _API_USER.encode())`
			`valid_pass = secrets.compare_digest(credentials.password.encode(), _API_PASS.encode())`
			`if not (valid_user and valid_pass):`
			`raise HTTPException(status_code=401, detail="Unauthorized", headers={"WWW-Authenticate": "Basic"})`


			`@app.get("/health", dependencies=[Depends(require_auth)])`
Added onboarding role check, health endpoint and json check for gpu 2026-02-27 00:45:34 +00:00			`async def health():`
			`return {`
			`"status": "ok",`
			`"embedding_ready": state.get("embed_model") is not None,`
			`"llm_ready": state.get("llm") is not None,`
			`}`

Fixed formatting and output 2026-03-11 14:37:50 +00:00			`def pad_and_normalize(embeddings: Tensor, target_dimensions: int) -> Tensor:`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`curr_dim = embeddings.shape[1]`
Centralised embedding dimensions to 1 variable 2026-03-11 14:33:39 +00:00			`if curr_dim < target_dimensions:`
			`embeddings = F.pad(embeddings, (0, target_dimensions - curr_dim), "constant", 0)`
			`elif curr_dim > target_dimensions:`
			`embeddings = embeddings[:, :target_dimensions]`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`return F.normalize(embeddings, p=2, dim=1)`


Added llm api basic auth and disabled docs 2026-03-22 08:19:57 +00:00			`@app.post("/v1/embeddings", dependencies=[Depends(require_auth)])`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`async def embeddings(request: Request):`
			`data = await request.json()`
			`input_data = data.get("input", "")`
Added extra logging 2026-03-11 16:12:05 +00:00			`input_kind = type(input_data).__name__`
			`input_count = len(input_data) if isinstance(input_data, list) else (1 if isinstance(input_data, str) else 0)`
			`logger.info("/v1/embeddings request received: input_kind=%s input_count=%s", input_kind, input_count)`
Hardcoded dimensions 2026-03-11 21:33:17 +00:00			`logger.info("/v1/embeddings using target_dimensions=%s", TARGET_DIMENSIONS)`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
			`if isinstance(input_data, str):`
			`inputs = [input_data]`
			`elif isinstance(input_data, list):`
			`inputs = [str(item) for item in input_data if str(item).strip()]`
			`else:`
Added extra logging 2026-03-11 16:12:05 +00:00			`logger.warning("/v1/embeddings bad input type: %s", input_kind)`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`raise HTTPException(status_code=400, detail="'input' must be a string or list of strings")`

			`if not inputs:`
			`return {`
			`"object": "list",`
			`"data": [],`
			`"model": EMBED_MODEL_NAME,`
			`"usage": {"prompt_tokens": 0, "total_tokens": 0},`
			`}`

			`model = state.get("embed_model")`
			`if model is None:`
			`raise HTTPException(status_code=503, detail="Embedding model not initialized")`

			`prefixed_inputs = [`
			`text if text.startswith("search_") else f"search_query: {text}"`
			`for text in inputs`
			`]`

Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`loop = asyncio.get_event_loop()`

			`def _encode():`
			`with no_grad():`
			`vectors = model.encode(prefixed_inputs, convert_to_tensor=True)`
			`return pad_and_normalize(vectors, target_dimensions=TARGET_DIMENSIONS)`

			`async with gpu_semaphore:`
			`vectors = await loop.run_in_executor(None, _encode)`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
			`vector_list = vectors.cpu().tolist()`

			`return {`
			`"object": "list",`
			`"data": [`
			`{`
			`"object": "embedding",`
			`"index": idx,`
			`"embedding": embedding,`
			`}`
			`for idx, embedding in enumerate(vector_list)`
			`],`
			`"model": EMBED_MODEL_NAME,`
			`"usage": {`
			`"prompt_tokens": sum(len(text.split()) for text in inputs),`
			`"total_tokens": sum(len(text.split()) for text in inputs),`
			`},`
			`}`

Added llm api basic auth and disabled docs 2026-03-22 08:19:57 +00:00			`@app.post("/v1/semantic-chunk", dependencies=[Depends(require_auth)])`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`async def semantic_chunk(request: Request):`
			`data = await request.json()`
			`raw_text = data.get("text", "")`
			`threshold_percentile = data.get("threshold", 95)`
Added extra logging 2026-03-11 16:12:05 +00:00			`raw_text_len = len(raw_text) if isinstance(raw_text, str) else -1`
			`logger.info("/v1/semantic-chunk request received: text_len=%s threshold=%s", raw_text_len, threshold_percentile,)`
Hardcoded dimensions 2026-03-11 21:33:17 +00:00			`logger.info("/v1/semantic-chunk using target_dimensions=%s", TARGET_DIMENSIONS)`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
			`if not raw_text:`
Added extra logging 2026-03-11 16:12:05 +00:00			`logger.info("/v1/semantic-chunk empty text payload")`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`return {"chunks": [], "embeddings": []}`

			`if len(raw_text) > 50000:`
Added extra logging 2026-03-11 16:12:05 +00:00			`logger.warning("/v1/semantic-chunk payload too large: text_len=%s", len(raw_text))`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`raise HTTPException(status_code=413, detail="Text block too large. Please batch on the client.")`

			`model = state.get("embed_model")`
			`if model is None:`
Added extra logging 2026-03-11 16:12:05 +00:00			`logger.error("/v1/semantic-chunk embedding model not initialized")`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`raise HTTPException(status_code=503, detail="Embedding model not initialized")`

Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`loop = asyncio.get_event_loop()`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`sentences = [s.strip() for s in raw_text.replace('\n', ' ').split('. ') if s.strip()]`

Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`def _chunk_and_embed():`
			`if len(sentences) < 2:`
			`single = model.encode([f"search_document: {raw_text}"], convert_to_tensor=True)`
			`single = pad_and_normalize(single, target_dimensions=TARGET_DIMENSIONS)`
			`return {"chunks": [raw_text], "embeddings": single.cpu().tolist()}`

			`s_embeddings = model.encode(sentences, convert_to_tensor=True)`
			`distances = [`
			`1 - F.cosine_similarity(s_embeddings[i].unsqueeze(0), s_embeddings[i+1].unsqueeze(0)).item()`
			`for i in range(len(s_embeddings) - 1)`
			`]`

			`breakpoint_threshold = np.percentile(distances, threshold_percentile)`
			`indices = [i for i, d in enumerate(distances) if d > breakpoint_threshold]`

			`chunks = []`
			`start = 0`
			`for idx in indices:`
			`chunks.append(". ".join(sentences[start : idx + 1]) + ".")`
			`start = idx + 1`
			`chunks.append(". ".join(sentences[start:]) + ".")`

			`with no_grad():`
			`final_embeddings = model.encode(`
			`[f"search_document: {c}" for c in chunks],`
			`convert_to_tensor=True`
			`)`
			`final_embeddings = pad_and_normalize(final_embeddings, target_dimensions=TARGET_DIMENSIONS)`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`return {"chunks": chunks, "embeddings": final_embeddings.cpu().tolist()}`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`async with gpu_semaphore:`
			`result = await loop.run_in_executor(None, _chunk_and_embed)`
			`return result`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
Added llm api basic auth and disabled docs 2026-03-22 08:19:57 +00:00			`@app.post("/v1/chat/completions", dependencies=[Depends(require_auth)])`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`async def chat_completions(request: Request):`
Added onboarding role check, health endpoint and json check for gpu 2026-02-27 00:45:34 +00:00			`try:`
			`data = await request.json()`
			`except Exception as e:`
			`raw_body = await request.body()`
			`preview = raw_body[:500].decode("utf-8", errors="replace")`
			`logger.error(f"Invalid JSON payload for chat completions: {e}; body_preview={preview}")`
			`raise HTTPException(status_code=400, detail="Invalid JSON payload")`

Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`messages = data.get("messages", [])`
			`stream = data.get("stream", False)`

			`logger.info(f"Chat completion request: {len(messages)} messages, stream={stream}")`

Added VRAM release mechanism 2026-03-22 15:34:06 +00:00			`llm = await _ensure_llm()`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00
Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`loop = asyncio.get_event_loop()`
			`temperature = data.get("temperature", 0.7)`
			`max_tokens = data.get("max_tokens", 1024)`

			`def _infer():`
			`return llm.create_chat_completion(`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`messages=messages,`
Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`stream=False,`
			`temperature=temperature,`
			`max_tokens=max_tokens,`
			`stop=["<\|eot_id\|>", "<\|end_of_text\|>"],`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`)`

Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`try:`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`if stream:`
Added non-blocking for multiple operations 2026-03-20 23:45:29 +00:00			`def _infer_stream():`
			`return llm.create_chat_completion(`
			`messages=messages,`
			`stream=True,`
			`temperature=temperature,`
			`max_tokens=max_tokens,`
			`stop=["<\|eot_id\|>", "<\|end_of_text\|>"],`
			`)`

			`async def _stream_response():`
			`async with gpu_semaphore:`
			`chunks = await loop.run_in_executor(None, lambda: list(_infer_stream()))`
			`for chunk in chunks:`
			`yield f"data: {json.dumps(chunk)}\n\n"`
			`yield "data: [DONE]\n\n"`

			`return StreamingResponse(_stream_response(), media_type="text/event-stream")`

			`async with gpu_semaphore:`
			`response = await loop.run_in_executor(None, _infer)`
Revised all files to reduce bloat + optimized workflow 2026-02-26 01:32:04 +00:00			`return response`
			`except Exception as e:`
			`logger.error(f"Inference error: {e}")`
			`raise HTTPException(status_code=500, detail=str(e))`


			`if __name__ == "__main__":`
			`import uvicorn`
			`uvicorn.run("gpu_server:app", host="0.0.0.0", port=8001, reload=True)`