# Dynavera/mcp_agent/mcp_server.py
#
# 1258 lines
# 51 KiB
# Python

import asyncio
import json
import os
import sys
import logging
import traceback
from datetime import datetime
from pathlib import PureWindowsPath
from typing import Any, Dict, List, Tuple
from aiohttp import web
from mcp.server import Server
from mcp.types import Tool, TextContent
# Logger for this module.
logger = logging.getLogger(__name__)
# Parent of the directory that contains this file.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# <project_root>/model/base-model: passed as the cache directory when base
# models, tokenizers and embedding models are loaded below.
model_cache_dir = os.path.join(project_root, "model", "base-model")
def _init_runtime():
    """Configure logging and the model cache directory for this process.

    Logging is only configured when the root logger has no handlers yet, so
    a host application's logging setup is left alone. The cache directory is
    created if missing and exported through the HF_HOME environment variable.
    """
    root_logger = logging.getLogger()
    if not root_logger.handlers:
        # One handler per stream: records are written to stderr and stdout.
        stream_handlers = [
            logging.StreamHandler(stream) for stream in (sys.stderr, sys.stdout)
        ]
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=stream_handlers,
        )

    os.makedirs(model_cache_dir, exist_ok=True)
    os.environ["HF_HOME"] = model_cache_dir

    logger.info(f"Project root: {project_root}")
    logger.info(f"HuggingFace model cache directory set to: {model_cache_dir}")
# MCP server instance; tool handlers are registered through its decorators.
app = Server("mlstore-mcp-server")
# Inference models currently held in memory, keyed by resolved model path.
# Each entry stores "loaded_at", "model", "model_dir" and "model_file".
LOADED_MODELS: Dict[str, Dict[str, Any]] = {}
# Cache for the pair-extraction model; holds "tokenizer" and "model" once loaded.
PAIR_EXTRACTOR: Dict[str, Any] = {}
# Cache for the embedding model; holds "model" and the "name" it was loaded under.
EMBEDDING_MODEL: Dict[str, Any] = {}
# Alias of model_cache_dir.
BASE_MODEL_CACHE_DIR = model_cache_dir
@app.list_tools()
async def list_tools():
    """Return the Tool descriptors this server advertises.

    Each tool takes a JSON object as input; the catalog below lists, per
    tool, its name, description, properties and required property names.
    """
    logger.info("Listing available tools")

    # Small factories so every schema gets its own fresh dictionaries.
    def _string():
        return {"type": "string"}

    def _string_list():
        return {"type": "array", "items": {"type": "string"}}

    def _object():
        return {"type": "object"}

    catalog = [
        (
            "echo",
            "Echo back the provided input",
            {"message": _string()},
            ["message"],
        ),
        (
            "fine_tune",
            "Start fine-tuning a base model using training files",
            {
                "base_model": _string(),
                "training_files": _string_list(),
                "hyperparams": _object(),
                "name": _string(),
                "version": _string(),
            },
            ["base_model", "training_files", "name", "version"],
        ),
        (
            "load_model",
            "Load a fine-tuned model into memory for inference",
            {"model_path": _string()},
            ["model_path"],
        ),
        (
            "infer",
            "Run inference with a fine-tuned model",
            {
                "model_path": _string(),
                "prompt": _string(),
                "options": _object(),
            },
            ["model_path", "prompt"],
        ),
        (
            "embed",
            "Generate embeddings for a list of texts",
            {
                "texts": _string_list(),
                "model": _string(),
            },
            ["texts"],
        ),
    ]

    tools = []
    for tool_name, summary, properties, required in catalog:
        tools.append(
            Tool(
                name=tool_name,
                description=summary,
                inputSchema={
                    "type": "object",
                    "properties": properties,
                    "required": required,
                },
            )
        )

    logger.info(f"Available tools: {[t.name for t in tools]}")
    return tools
def _now() -> str:
    """Return the current UTC time as an ISO-8601 string with a "Z" suffix."""
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop the tzinfo so isoformat() keeps the original shape
    # (no "+00:00" offset before the trailing "Z").
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
def _model_root() -> str:
    """Return the directory where models are stored.

    The first non-empty value among the MCP_MODEL_DIR and DJANGO_MODEL_DIR
    environment variables wins; otherwise "<current working dir>/model".
    """
    for env_var in ("MCP_MODEL_DIR", "DJANGO_MODEL_DIR"):
        configured = os.getenv(env_var)
        if configured:
            return configured
    return os.path.join(os.getcwd(), "model")
def _safe_dir_name(name: str) -> str:
    """Reduce *name* to alphanumerics plus "-", "_" and ".".

    Every other character is dropped, then leading and trailing dots are
    stripped from the result.
    """
    kept = [ch for ch in name if ch.isalnum() or ch in "-_."]
    return "".join(kept).strip(".")
def _map_gguf_repo(model_name: str) -> tuple[str | None, str | None]:
gguf_repos = {
"Llama-3.1-8B-Instruct.gguf": ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"),
"Meta-Llama-3.1-8B-Instruct.gguf": ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"),
"Meta-Llama-3.1-8B-Instruct.Q4_0.gguf": ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q4_0.gguf"),
"Llama-3.1-8B-Instruct.Q4_0.gguf": ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q4_0.gguf"),
"Meta-Llama-3-8B-Instruct.Q4_0.gguf": ("bartowski/Meta-Llama-3-8B-Instruct-GGUF", "Meta-Llama-3-8B-Instruct-Q4_0.gguf"),
"Llama-3-8B-Instruct.Q4_0.gguf": ("bartowski/Meta-Llama-3-8B-Instruct-GGUF", "Meta-Llama-3-8B-Instruct-Q4_0.gguf"),
"mistral-7b-instruct-v0.3.Q4_0.gguf": ("bartowski/Mistral-7B-Instruct-v0.3-GGUF", "Mistral-7B-Instruct-v0.3-Q4_0.gguf"),
"Mistral-7B-Instruct-v0.3.Q4_0.gguf": ("bartowski/Mistral-7B-Instruct-v0.3-GGUF", "Mistral-7B-Instruct-v0.3-Q4_0.gguf"),
}
if model_name in gguf_repos:
return gguf_repos[model_name]
base_name = model_name.lower()
if "llama" in base_name and "3.1" in base_name and "8b" in base_name:
repo_id = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
if ".q4_0" in base_name:
return repo_id, "Meta-Llama-3.1-8B-Instruct-Q4_0.gguf"
if ".q4_k_m" in base_name:
return repo_id, "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
return repo_id, "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
if "llama" in base_name and "3" in base_name and "8b" in base_name:
repo_id = "bartowski/Meta-Llama-3-8B-Instruct-GGUF"
if ".q4_0" in base_name:
return repo_id, "Meta-Llama-3-8B-Instruct-Q4_0.gguf"
return repo_id, "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
if "mistral" in base_name and "7b" in base_name:
repo_id = "bartowski/Mistral-7B-Instruct-v0.3-GGUF"
if ".q4_0" in base_name:
return repo_id, "Mistral-7B-Instruct-v0.3-Q4_0.gguf"
return repo_id, "Mistral-7B-Instruct-v0.3-Q4_K_M.gguf"
return None, None
def _find_existing_gguf(model_name: str) -> str | None:
    """Return the path of an already-present GGUF file for *model_name*.

    The name is mapped to its GGUF file name, which is then looked for in
    the model root, the base-model cache directory, "<cwd>/model" and the
    current working directory, in that order. Returns None when the name
    has no mapping or no candidate exists.
    """
    _, gguf_name = _map_gguf_repo(model_name)
    if not gguf_name:
        return None

    search_dirs = (
        _model_root(),
        model_cache_dir,
        os.path.join(os.getcwd(), "model"),
        os.getcwd(),
    )
    locations = [os.path.join(directory, gguf_name) for directory in search_dirs]
    return next((loc for loc in locations if os.path.exists(loc)), None)
def _download_gguf_from_hf(model_name: str) -> str:
    """Download a GGUF model from the Hugging Face Hub.

    A local copy found by _find_existing_gguf() is returned without any
    download.

    Args:
        model_name: Model name to map to a Hub repository and file.

    Returns:
        Path to the local or freshly downloaded model file.

    Raises:
        ImportError: huggingface_hub is not installed.
        ValueError: The model name cannot be mapped to a known repository.
        Exception: Any download error is logged and re-raised unchanged.
    """
    logger.info(f"Attempting to download GGUF model from Hugging Face: {model_name}")
    existing_path = _find_existing_gguf(model_name)
    if existing_path:
        logger.info(f"Found existing GGUF locally: {existing_path}")
        return existing_path

    try:
        from huggingface_hub import hf_hub_download
    except ImportError as e:
        logger.error(f"huggingface_hub not available: {str(e)}")
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "huggingface_hub is required to download models. Install with: pip install huggingface_hub"
        ) from e

    try:
        # Validate the name first so an unknown model leaves no directory behind.
        repo_id, filename = _map_gguf_repo(model_name)
        if not repo_id or not filename:
            logger.error(f"Could not determine Hugging Face repo for model: {model_name}")
            raise ValueError(f"Unknown model: {model_name}. Please specify a known GGUF model.")
        logger.info(f"Found known model mapping: {repo_id}/{filename}")

        model_dir = _model_root()
        os.makedirs(model_dir, exist_ok=True)

        logger.info(f"Downloading {filename} from {repo_id}...")
        logger.info("This may take several minutes depending on model size and connection speed.")
        downloaded_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            cache_dir=model_cache_dir,
            local_dir=model_dir,
            local_dir_use_symlinks=False,
        )
        logger.info(f"Model downloaded successfully to: {downloaded_path}")
        return downloaded_path
    except Exception as e:
        logger.error(f"Failed to download model from Hugging Face: {str(e)}", exc_info=True)
        raise
def _resolve_model_path(model_path: str) -> str:
    """Resolve a possibly foreign or relative model path to a local path.

    Returns the normalized path when it is absolute and exists. Otherwise
    the first existing candidate is returned, tried in this order: the path
    relative to the current working directory, its base name inside the
    model root, the sub-path starting at a "notebooks" or "model" component
    of a Windows-style path, and the Windows base name inside the model
    root. Falls back to the normalized input when nothing exists. Empty
    input is returned unchanged.
    """
    if not model_path:
        return model_path

    norm = os.path.normpath(model_path)
    if os.path.isabs(norm) and os.path.exists(norm):
        return norm

    model_root = _model_root()
    candidates = [
        os.path.normpath(os.path.join(os.getcwd(), norm)),
        os.path.normpath(os.path.join(model_root, os.path.basename(norm))),
    ]

    if ":" in model_path or "\\" in model_path:
        win_path = PureWindowsPath(model_path)
        parts = [str(x) for x in win_path.parts]
        for anchor in ("notebooks", "model"):
            if anchor in parts:
                idx = parts.index(anchor)
                rel = os.path.join(*parts[idx:])
                candidates.append(os.path.normpath(os.path.join(os.getcwd(), rel)))
        # On POSIX, os.path.basename() does not split on backslashes, so the
        # base-name candidate above is the whole Windows path. Add the real
        # file name, parsed with Windows rules, as a last candidate.
        if win_path.name:
            candidates.append(os.path.normpath(os.path.join(model_root, win_path.name)))

    for cand in candidates:
        if os.path.exists(cand):
            return cand
    return norm
def _resolve_model_file(model_path: str) -> tuple[str, str]:
    """Split a model path into (directory, GGUF file name).

    When the resolved path is a directory, the alphabetically first entry
    ending in ".gguf" (case-insensitive) is chosen and the file name is ""
    if there is none. Otherwise the resolved path is split into its
    directory and base name.
    """
    resolved = _resolve_model_path(model_path)
    if os.path.isdir(resolved):
        # os.listdir() order is arbitrary; sort so the same file is picked
        # on every call when a directory holds several GGUF files.
        for name in sorted(os.listdir(resolved)):
            if name.lower().endswith(".gguf"):
                return resolved, name
        return resolved, ""
    return os.path.dirname(resolved), os.path.basename(resolved)
def _load_training_file(file_path: str) -> List[Dict[str, str]]:
    """Load one training file and return its records as a list.

    Supported extensions (case-insensitive):
      .json  a list is returned as-is; for a dict the first list-valued
             entry is returned, else the dict wrapped in a list; any other
             JSON value yields [].
      .csv   one dict per row, keyed by the header row.
      .txt / .md  one {"text": ...} record per blank-line separated block.
      .pdf   one {"text": ..., "page": ...} record per page that has text.
      .docx  one {"text": ...} record per non-empty paragraph.

    Raises:
        FileNotFoundError: file_path does not exist.
        ImportError: PyPDF2 / python-docx is needed but not installed.
        ValueError: The extension is not supported.
        Exception: Any parsing error is logged and re-raised.
    """
    logger.info(f"Loading training file: {file_path}")
    if not os.path.exists(file_path):
        logger.error(f"Training file not found: {file_path}")
        raise FileNotFoundError(f"Training file not found: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == '.json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, list):
                logger.info(f"JSON file contains {len(data)} items")
                return data
            if isinstance(data, dict):
                logger.info("JSON file is dict, extracting first array or values")
                for val in data.values():
                    if isinstance(val, list):
                        return val
                return [data]
            return []

        if ext == '.csv':
            import csv
            # newline='' lets the csv module handle embedded line breaks itself.
            with open(file_path, 'r', encoding='utf-8', newline='') as f:
                pairs = [dict(row) for row in csv.DictReader(f)]
            logger.info(f"CSV file contains {len(pairs)} rows")
            return pairs

        if ext in ('.txt', '.md'):
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            pairs = [{'text': para.strip()} for para in content.split('\n\n') if para.strip()]
            if ext == '.txt':
                logger.info(f"TXT file contains {len(pairs)} paragraphs")
            else:
                logger.info(f"MD file contains {len(pairs)} sections")
            return pairs

        if ext == '.pdf':
            # A missing dependency or a parse failure used to be swallowed,
            # making the function return None; report both like the DOCX branch.
            try:
                import PyPDF2
            except ImportError as e:
                logger.error("PyPDF2 not available for PDF parsing")
                raise ImportError("PyPDF2 library not available") from e
            pairs = []
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                for page_num, page in enumerate(reader.pages):
                    # Guard against extract_text() yielding None for a page.
                    text = (page.extract_text() or "").strip()
                    if text:
                        pairs.append({'text': text, 'page': page_num})
            logger.info(f"PDF file contains {len(pairs)} pages")
            return pairs

        if ext == '.docx':
            try:
                from docx import Document
            except ImportError as e:
                logger.error("python-docx not available for DOCX parsing")
                raise ImportError("python-docx library not available") from e
            doc = Document(file_path)
            pairs = [{'text': para.text} for para in doc.paragraphs if para.text.strip()]
            logger.info(f"DOCX file contains {len(pairs)} paragraphs")
            return pairs

        logger.error(f"Unsupported file type: {ext}")
        raise ValueError(f"Unsupported file type: {ext}")
    except Exception as e:
        logger.error(f"Failed to load training file: {str(e)}", exc_info=True)
        raise
def _ensure_pair_extractor_model(base_model: str):
    """Return (tokenizer, model) for prompt-based pair extraction.

    The pair is cached in PAIR_EXTRACTOR together with the name it was
    loaded under, and reused while the same base_model is requested. A
    different base_model replaces the cached pair. The model is loaded with
    4-bit quantization and float16 compute.
    """
    cached_model = PAIR_EXTRACTOR.get("model")
    # Key the cache on the model name, as _ensure_embedding_model() does, so
    # a request for a different base model never gets the stale one back.
    if cached_model is not None and PAIR_EXTRACTOR.get("name") == base_model:
        return PAIR_EXTRACTOR["tokenizer"], cached_model

    # Drop any previously cached model before loading its replacement.
    PAIR_EXTRACTOR.clear()
    del cached_model

    logger.info("Loading base model for pair extraction (prompt-based)")
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    import torch

    tokenizer = AutoTokenizer.from_pretrained(
        base_model,
        cache_dir=model_cache_dir,
        local_files_only=False,
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        cache_dir=model_cache_dir,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
        device_map="auto",
        dtype=torch.float16,
        # from_pretrained() documents this argument as offload_folder;
        # "offload_dir" is not one of its parameters.
        offload_folder='./offload_dir',
    )

    PAIR_EXTRACTOR["tokenizer"] = tokenizer
    PAIR_EXTRACTOR["model"] = model
    PAIR_EXTRACTOR["name"] = base_model
    logger.info("Pair extractor model loaded and cached")
    return tokenizer, model
def _format_training_sample(sample: Any) -> str:
    """Render one training record as a single line of text.

    Strings are stripped. Dicts become "key: value" segments joined by
    " | ", using only the non-blank string values; a dict with no such
    values is dumped as JSON. Anything else, and any record whose
    formatting raises, is rendered with str().
    """
    try:
        if isinstance(sample, str):
            return sample.strip()
        if not isinstance(sample, dict):
            return str(sample)

        segments = [
            f"{key}: {value.strip()}"
            for key, value in sample.items()
            if isinstance(value, str) and value.strip()
        ]
        if segments:
            return " | ".join(segments)
        return json.dumps(sample)
    except Exception:
        return str(sample)
def _extract_first_json_array(text: str) -> str | None:
    """Return the first balanced "[...]" span in *text*, or None.

    Brackets inside double-quoted strings are ignored and the character
    after a backslash is skipped. None is returned when there is no "[" or
    the array never closes.
    """
    start = text.find("[")
    if start == -1:
        return None

    depth = 0
    in_string = False
    skip_next = False
    for pos in range(start, len(text)):
        ch = text[pos]
        if skip_next:
            skip_next = False
            continue
        if ch == '\\':
            skip_next = True
            continue
        if ch == '"':
            in_string = not in_string
            continue
        if in_string:
            continue
        if ch == '[':
            depth += 1
        elif ch == ']':
            depth -= 1
            if depth == 0:
                return text[start:pos + 1]
    return None


def _prompt_based_pair_extraction(training_data: List[Any], base_model: str) -> List[Tuple[str, str]]:
    """Ask the base model to turn raw records into (instruction, response) pairs.

    Records are sent in batches of at most 12 items. For each batch the
    model is prompted for a JSON array of objects with "instruction" and
    "response" keys; only the newly generated text is parsed. A batch whose
    output cannot be parsed contributes no pairs.

    Args:
        training_data: Records as returned by _load_training_file().
        base_model: Model identifier passed to _ensure_pair_extractor_model().

    Returns:
        All pairs whose instruction and response are both non-empty; an
        empty list when nothing could be extracted.
    """
    import torch

    tokenizer, model = _ensure_pair_extractor_model(base_model)
    max_items = 12
    max_new_tokens = 512

    example_pairs = [
        {"instruction": "Explain what a REST API is.", "response": "A REST API is an interface that uses HTTP methods..."},
        {"instruction": "Summarize the customer complaint.", "response": "Customer reports delayed shipment and requests refund."},
    ]
    system_prompt = (
        "You are a data extractor. Given a list of items, return a JSON array of training pairs. "
        "Each pair must have 'instruction' and 'response'. Keep answers concise. "
        "If content is incomplete, still produce best-effort pairs. "
        "End your answer with a complete sentence. Do not start lists or new sections."
    )
    examples_block = json.dumps(example_pairs, ensure_ascii=False, indent=2)

    pairs: List[Tuple[str, str]] = []
    # max_items was previously defined but never applied, so the prompt grew
    # with the input. Batching keeps each prompt bounded without dropping data.
    for batch_start in range(0, len(training_data), max_items):
        subset = training_data[batch_start:batch_start + max_items]
        data_block = "\n".join(
            f"{i+1}. {_format_training_sample(item)}" for i, item in enumerate(subset)
        )
        user_prompt = (
            "Examples of desired output:\n"
            f"{examples_block}\n\n"
            "Now extract training pairs from the following items. Return ONLY a JSON array, no extra text.\n"
            f"Items:\n{data_block}\n\n"
            "Answer:"
        )
        prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            # Greedy decoding: temperature/top_p have no effect when
            # do_sample=False, so they are not passed.
            output = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id,
            )

        # The returned sequence starts with the prompt, which itself holds
        # the example JSON array. Decode only the new tokens, otherwise the
        # first "[" found belongs to the examples and they come back as the
        # "extracted" pairs.
        prompt_length = inputs["input_ids"].shape[1]
        generated = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

        if "[" not in generated:
            logger.error("LLM extraction failed to produce JSON array (no '[' found)")
            continue
        json_text = _extract_first_json_array(generated)
        if json_text is None:
            logger.error("LLM extraction failed to find valid JSON array boundary")
            continue

        logger.debug(f"Extracted JSON text (first 200 chars): {json_text[:200]}")
        try:
            parsed = json.loads(json_text)
        except ValueError as e:
            logger.error(f"Failed to parse LLM-extracted JSON: {str(e)}")
            logger.debug(f"JSON text that failed to parse: {json_text}")
            continue

        for item in parsed:
            # Skip stray non-object entries instead of discarding the batch.
            if not isinstance(item, dict):
                continue
            instr = str(item.get("instruction", "")).strip()
            resp = str(item.get("response", "")).strip()
            if instr and resp:
                pairs.append((instr, resp))

    logger.info(f"LLM extracted {len(pairs)} pairs via prompting")
    return pairs
def _extract_training_pairs(training_data: List[Any]) -> List[Tuple[str, str]]:
    """Build (instruction, response) pairs from loaded training records.

    Extraction is delegated to the prompt-based extractor. When that yields
    nothing, each record is rendered to text and split in the middle: the
    first half becomes the instruction, the second half the response (or
    the first 50 characters when the second half is blank).
    """
    logger.info(f"Extracting training pairs via LLM for {len(training_data)} items")
    if not training_data:
        return []

    base_model = "meta-llama/Llama-3.1-8B-Instruct"
    pairs = _prompt_based_pair_extraction(training_data, base_model)

    if not pairs:
        logger.warning("LLM extraction failed; falling back to minimal heuristic")
        for sample in training_data:
            rendered = _format_training_sample(sample)
            if not rendered:
                continue
            split_at = max(1, len(rendered) // 2)
            head = rendered[:split_at].strip()
            tail = rendered[split_at:].strip()
            pairs.append((head, tail if tail else rendered[:50]))

    logger.info(f"Total pairs extracted: {len(pairs)}")
    if pairs:
        logger.debug(f"Sample pair: instruction='{pairs[0][0][:80]}...', response='{pairs[0][1][:80]}...'")
    return pairs
def _ensure_embedding_model(model_name: str):
    """Return the SentenceTransformer for *model_name*, loading it on demand.

    The most recently loaded model is kept in EMBEDDING_MODEL together with
    its name and reused while the same name is requested.
    """
    cached = EMBEDDING_MODEL.get("model")
    if cached is not None and EMBEDDING_MODEL.get("name") == model_name:
        return cached

    from sentence_transformers import SentenceTransformer

    logger.info(f"Loading embedding model: {model_name}")
    embedder = SentenceTransformer(model_name, cache_folder=model_cache_dir)
    EMBEDDING_MODEL.update(model=embedder, name=model_name)
    return embedder
async def _fine_tune_model_impl(
    training_files: List[str],
    hyperparams: Dict[str, Any],
    model_name: str,
    version: str,
    output_dir: str
) -> Dict[str, Any]:
    """Run the fine-tuning pipeline and return a status dictionary.

    Steps:
      1. Load every training file and extract (instruction, response) pairs.
      2. Authenticate with Hugging Face (when a token is set) and load the
         tokenizer and the 4-bit quantized base model.
      3. Train LoRA adapters, save them, and merge them into a float16 copy
         of the base model.
      4. Convert the merged model to GGUF with llama.cpp's converter script;
         fall back to the merged folder when that is not possible.
      5. Verify the resulting artifact.
      6. Copy the GGUF file into the model root and write metadata.json.

    Args:
        training_files: Paths of the files to extract training pairs from.
        hyperparams: Optional overrides: lora_r, lora_alpha, lora_dropout,
            epochs, batch_size, gradient_accumulation_steps.
        model_name: Name stored in the metadata and used for the final file.
        version: Version stored in the metadata and used for the final file.
        output_dir: Working directory for training data, checkpoints,
            adapter, merged model and metadata.

    Returns:
        On success the metadata dict with "status": "completed". On failure
        a dict with "status": "failed" and an "error" entry. Failures are
        returned, not raised.

    NOTE(review): every step runs synchronously inside this coroutine, so
    the event loop is blocked while training runs — confirm that is intended.
    """
    base_model = "meta-llama/Llama-3.1-8B-Instruct"
    logger.info(f"Starting fine-tune process with base model: {base_model}")
    try:
        os.makedirs(output_dir, exist_ok=True)

        logger.info(f"Step 1: Loading {len(training_files)} training files")
        all_training_pairs = []
        for file_path in training_files:
            # A file that cannot be processed is logged and skipped.
            try:
                training_data = _load_training_file(file_path)
                pairs = _extract_training_pairs(training_data)
                all_training_pairs.extend(pairs)
                logger.info(f"File {os.path.basename(file_path)}: {len(pairs)} pairs extracted")
            except Exception as e:
                logger.error(f"Failed to process file {file_path}: {str(e)}")
                continue
        if not all_training_pairs:
            logger.error("No training pairs extracted from any files")
            return {
                "status": "failed",
                "error": "no_training_pairs_extracted",
                "timestamp": _now(),
            }
        logger.info(f"Step 1 Complete: Total {len(all_training_pairs)} training pairs extracted")

        logger.info(f"Step 2: Loading base model and tokenizer: {base_model}")
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
            from huggingface_hub import login
            import torch

            # Step 1 is done, so release the cached pair-extraction model;
            # otherwise two copies of the base model stay resident.
            PAIR_EXTRACTOR.clear()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            logger.info("Step 2a: Authenticating with HuggingFace...")
            hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
            if hf_token:
                logger.info("HuggingFace token found in environment, logging in...")
                try:
                    # write_permission is deprecated; False was its default.
                    login(token=hf_token)
                    logger.info("Successfully authenticated with HuggingFace")
                except Exception as e:
                    error_details = traceback.format_exc()
                    logger.error(f"Failed to authenticate with HuggingFace: {str(e)}")
                    logger.error(f"Traceback:\n{error_details}")
                    return {
                        "status": "failed",
                        "error": "huggingface_auth_failed",
                        "details": str(e),
                        "traceback": error_details,
                        "timestamp": _now(),
                    }
            else:
                logger.warning("HF_TOKEN or HUGGINGFACE_TOKEN environment variable not found. Model access may be restricted.")

            logger.info("Step 2b: Loading tokenizer...")
            try:
                logger.info(f"Tokenizer cache directory: {model_cache_dir}")
                tokenizer = AutoTokenizer.from_pretrained(
                    base_model,
                    cache_dir=model_cache_dir,
                    local_files_only=False
                )
                logger.info("Tokenizer download started...")
                if tokenizer.pad_token is None:
                    tokenizer.pad_token = tokenizer.eos_token
                logger.info("Tokenizer loaded successfully")
            except Exception as e:
                error_details = traceback.format_exc()
                logger.error(f"Failed to load tokenizer: {str(e)}")
                logger.error(f"Traceback:\n{error_details}")
                raise

            logger.info("Step 2b: Loading base model with 4-bit quantization...")
            try:
                logger.info(f"Model cache directory: {model_cache_dir}")
                logger.info(f"Starting model download for {base_model}...")
                model = AutoModelForCausalLM.from_pretrained(
                    base_model,
                    cache_dir=model_cache_dir,
                    quantization_config=BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_compute_dtype=torch.float16
                    ),
                    device_map="auto",
                    dtype=torch.float16,
                )
                logger.info("Base model loaded successfully with 4-bit quantization")
            except Exception as e:
                error_details = traceback.format_exc()
                logger.error(f"Failed to load base model: {str(e)}")
                logger.error(f"Traceback:\n{error_details}")
                raise
        except ImportError as e:
            error_details = traceback.format_exc()
            logger.error(f"Required HuggingFace transformers not available: {str(e)}")
            logger.error(f"Traceback:\n{error_details}")
            return {
                "status": "failed",
                "error": "transformers_not_available",
                "details": str(e),
                "traceback": error_details,
                "timestamp": _now(),
            }
        logger.info("Step 2 Complete: Model and tokenizer loaded")

        logger.info("Step 3: Fine-tuning with LoRA adapters")
        try:
            from peft import LoraConfig
            from trl import SFTTrainer
            from transformers import TrainingArguments

            # One JSON object per line; "text" carries the chat-formatted sample.
            training_data_file = os.path.join(output_dir, "training_data.jsonl")
            with open(training_data_file, "w", encoding="utf-8") as f:
                for prompt, response in all_training_pairs:
                    training_pair = {
                        "instruction": prompt,
                        "input": "",
                        "output": response,
                        "text": f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
                    }
                    f.write(json.dumps(training_pair, ensure_ascii=False) + "\n")
            logger.info(f"Training data file created: {training_data_file}")

            from datasets import load_dataset
            dataset = load_dataset("json", data_files=training_data_file)
            logger.info(f"Dataset loaded: {len(dataset['train'])} examples")

            lora_config = LoraConfig(
                r=int(hyperparams.get("lora_r", 64)),
                lora_alpha=int(hyperparams.get("lora_alpha", 16)),
                lora_dropout=float(hyperparams.get("lora_dropout", 0.05)),
                target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
                task_type="CAUSAL_LM"
            )
            training_args = TrainingArguments(
                output_dir=os.path.join(output_dir, "checkpoints"),
                num_train_epochs=int(hyperparams.get("epochs", 3)),
                per_device_train_batch_size=int(hyperparams.get("batch_size", 6)),
                gradient_accumulation_steps=int(hyperparams.get("gradient_accumulation_steps", 3)),
                fp16=False,
                bf16=False,
                optim="paged_adamw_8bit",
                max_grad_norm=0.0,
                logging_steps=10,
                save_strategy="epoch",
                ddp_find_unused_parameters=False,
                remove_unused_columns=False,
                dataloader_pin_memory=True,
            )
            trainer = SFTTrainer(
                model=model,
                train_dataset=dataset["train"],
                peft_config=lora_config,
                args=training_args,
            )
            trainer.accelerator.scaler = None
            logger.info("Starting LoRA fine-tuning...")
            trainer.train()
            logger.info("LoRA fine-tuning completed")

            adapter_dir = os.path.join(output_dir, "adapter")
            os.makedirs(adapter_dir, exist_ok=True)
            trainer.model.save_pretrained(adapter_dir)
            tokenizer.save_pretrained(adapter_dir)
            logger.info(f"LoRA adapter saved to {adapter_dir}")

            merge_dir = os.path.join(output_dir, "merged")
            os.makedirs(merge_dir, exist_ok=True)
            logger.info("Merging LoRA adapters...")
            from peft import PeftModel
            # The adapters are merged into a fresh float16 copy of the base
            # model rather than into the 4-bit training copy.
            base_reload = AutoModelForCausalLM.from_pretrained(
                base_model,
                cache_dir=model_cache_dir,
                dtype=torch.float16,
                device_map="auto",
            )
            merged_model = PeftModel.from_pretrained(base_reload, adapter_dir)
            merged_model = merged_model.merge_and_unload()
            merged_model.save_pretrained(merge_dir)
            tokenizer.save_pretrained(merge_dir)
            logger.info(f"Merged model saved to {merge_dir}")

            logger.info("Cleaning up GPU memory...")
            # The trainer holds its own reference to the model, so it has to
            # go too before the memory can actually be reclaimed.
            del trainer
            del model
            del merged_model
            del base_reload
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                logger.info(f"GPU memory freed: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB reserved")
            logger.info("GPU memory cleanup completed")
        except ImportError as e:
            logger.error(f"Required LoRA/SFT training libraries not available: {str(e)}")
            return {
                "status": "failed",
                "error": "training_libs_not_available",
                "details": str(e),
                "timestamp": _now(),
            }
        logger.info("Step 3 Complete: Fine-tuning and merging completed")

        logger.info("Step 4: Converting merged model to GGUF format")
        try:
            import subprocess

            merge_dir = os.path.join(output_dir, "merged")
            gguf_f16_path = os.path.join(output_dir, "model-f16.gguf")
            script_candidates = [
                os.path.join(os.getcwd(), "llama.cpp", "convert_hf_to_gguf.py"),
                os.path.join(os.getcwd(), "notebooks", "build", "llama.cpp", "convert_hf_to_gguf.py"),
                "/app/llama.cpp/convert_hf_to_gguf.py",
                "/home/llama.cpp/convert_hf_to_gguf.py",
            ]
            convert_script = next(
                (candidate for candidate in script_candidates if os.path.exists(candidate)),
                None,
            )
            if convert_script:
                logger.info(f"Converting with script: {convert_script}")
                # Use the interpreter running this server so the converter
                # sees the same environment; a bare "python" may resolve to a
                # different installation or be missing from PATH.
                result = subprocess.run([
                    sys.executable, convert_script,
                    merge_dir,
                    "--outfile", gguf_f16_path,
                    "--outtype", "f16"
                ], capture_output=True, text=True)
                if result.returncode != 0:
                    logger.error(f"Conversion script failed with return code {result.returncode}")
                    logger.error(f"STDOUT: {result.stdout}")
                    logger.error(f"STDERR: {result.stderr}")
                    raise RuntimeError(f"GGUF conversion failed: {result.stderr or 'unknown error'}")
                logger.info(f"GGUF conversion completed: {gguf_f16_path}")
            else:
                logger.warning("convert_hf_to_gguf.py not found, trying with python -m")
                logger.info("Attempting direct conversion with transformers")
                # No in-process converter exists. The merged checkpoint is
                # used as-is; reloading the whole model here did nothing.
                logger.warning("Direct GGUF conversion not available, using float16 checkpoint instead")
                gguf_f16_path = merge_dir
                logger.info(f"GGUF conversion completed: {gguf_f16_path}")
        except Exception as e:
            logger.error(f"GGUF conversion failed: {str(e)}", exc_info=True)
            gguf_f16_path = os.path.join(output_dir, "merged")
            logger.warning("Using merged model directly without GGUF conversion")
        logger.info("Step 4 Complete: GGUF conversion completed")

        logger.info("Step 5: Verifying model format")
        quantized_path = gguf_f16_path
        if os.path.isfile(quantized_path):
            logger.info(f"GGUF file verified: {quantized_path}")
            logger.info(f"GGUF model size: {os.path.getsize(quantized_path) / (1024**3):.2f} GB")
        else:
            logger.warning("No GGUF file found; using merged model folder")

        logger.info("Step 6: Creating metadata and finalizing")
        quantized_file = quantized_path
        if os.path.isdir(quantized_file):
            logger.warning(f"Quantized path is a directory ({quantized_file}); searching for .gguf inside")
            ggufs = sorted(f for f in os.listdir(quantized_file) if f.lower().endswith('.gguf'))
            if ggufs:
                quantized_file = os.path.join(quantized_file, ggufs[0])
                logger.info(f"Found GGUF inside directory: {quantized_file}")
            else:
                logger.error("No .gguf file found inside quantized directory; using merged folder as fallback")
                quantized_file = quantized_path

        # Name and version come from the tool arguments; sanitize them so
        # path separators or ".." cannot move the file outside the model root.
        safe_name = _safe_dir_name(model_name)
        safe_version = _safe_dir_name(version)
        final_model_path = os.path.join(_model_root(), f"{safe_name}-{safe_version}.gguf")
        if os.path.isfile(quantized_file) and quantized_file != final_model_path:
            import shutil
            logger.info(f"Copying quantized model to final location: {final_model_path}")
            shutil.copy2(quantized_file, final_model_path)
            logger.info(f"Final model saved to: {final_model_path}")
        else:
            final_model_path = quantized_file

        metadata = {
            "status": "completed",
            "base_model": base_model,
            "name": model_name,
            "version": version,
            "model_path": final_model_path,
            "path": final_model_path,
            "output_dir": output_dir,
            "training_files_count": len(training_files),
            "training_pairs_count": len(all_training_pairs),
            "hyperparams": hyperparams,
            "timestamp": _now(),
        }
        # A metadata write failure is logged but does not fail the run.
        try:
            with open(os.path.join(output_dir, "metadata.json"), "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2)
            logger.info(f"Metadata saved to {output_dir}/metadata.json")
        except Exception as e:
            logger.error(f"Failed to save metadata: {str(e)}", exc_info=True)

        logger.info("Performing final GPU memory cleanup...")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
            logger.info(f"Final GPU memory state: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB allocated")
        logger.info("Step 6 Complete: All steps completed successfully")
        logger.info(f"Final model available at: {final_model_path}")
        return metadata
    except Exception as e:
        error_details = traceback.format_exc()
        logger.error(f"Fine-tune process failed: {str(e)}")
        logger.error(f"Error type: {type(e).__name__}")
        logger.error(f"Full traceback:\n{error_details}")
        return {
            "status": "failed",
            "error": str(e) or "Unknown error occurred",
            "error_type": type(e).__name__,
            "traceback": error_details,
            "timestamp": _now(),
        }
async def _run_tool_http(name: str, arguments: dict) -> Dict[str, Any]:
logger.info(f"Executing tool: {name}")
logger.debug(f"Tool arguments: {arguments}")
if name == "echo":
logger.info(f"Echo tool called with message: {arguments.get('message')}")
result = {"status": "ok", "received": arguments, "timestamp": _now()}
logger.info(f"Echo tool completed successfully")
return result
if name == "fine_tune":
base_model = arguments.get("base_model")
training_files = arguments.get("training_files") or []
hyperparams = arguments.get("hyperparams") or {}
model_name = arguments.get("name") or "model"
version = arguments.get("version") or "v1"
logger.info(f"Fine-tune started: model={model_name}, version={version}")
logger.info(f"Training files count: {len(training_files)}")
logger.debug(f"Training files: {training_files}")
logger.debug(f"Hyperparameters: {hyperparams}")
model_root = _model_root()
os.makedirs(model_root, exist_ok=True)
logger.debug(f"Model root directory: {model_root}")
safe_name = _safe_dir_name(model_name)
safe_version = _safe_dir_name(version)
output_dir = os.path.join(model_root, f"{safe_name}-{safe_version}")
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Output directory created: {output_dir}")
try:
result = await _fine_tune_model_impl(training_files, hyperparams, model_name, version, output_dir)
logger.info(f"Fine-tune result: {result.get('status')}")
return result
except Exception as e:
error_details = traceback.format_exc()
logger.error(f"Fine-tune tool execution failed: {str(e)}")
logger.error(f"Full traceback:\n{error_details}")
return {
"status": "failed",
"error": str(e) or "Unknown error in fine-tune execution",
"error_type": type(e).__name__,
"traceback": error_details,
"timestamp": _now(),
}
if name == "load_model":
model_path = arguments.get("model_path")
logger.info(f"Loading model: {model_path}")
if not model_path:
logger.error("model_path_required error: no model path provided")
return {"status": "failed", "error": "model_path_required", "timestamp": _now()}
original_model_path = model_path
model_path = _resolve_model_path(model_path)
logger.debug(f"Resolved model path: {model_path}")
if not os.path.exists(model_path):
logger.warning(f"Model not found at: {model_path}")
local_mapped = _find_existing_gguf(original_model_path)
if local_mapped:
logger.info(f"Using existing mapped GGUF: {local_mapped}")
model_path = local_mapped
else:
logger.info(f"Attempting to download model from Hugging Face...")
try:
model_path = _download_gguf_from_hf(original_model_path)
logger.info(f"Model downloaded successfully: {model_path}")
except Exception as download_error:
logger.error(f"Failed to download model: {str(download_error)}")
return {
"status": "failed",
"error": "model_not_found_and_download_failed",
"model_path": original_model_path,
"download_error": str(download_error),
"timestamp": _now()
}
try:
for loaded_path in list(LOADED_MODELS.keys()):
if loaded_path != model_path:
logger.info(f"Unloading cached model: {loaded_path}")
LOADED_MODELS.pop(loaded_path, None)
if model_path in LOADED_MODELS and "model" in LOADED_MODELS[model_path]:
logger.info(f"Model already loaded: {model_path}")
return {
"status": "completed",
"model_path": model_path,
"loaded": True,
"cached": True,
"timestamp": _now(),
}
from gpt4all import GPT4All
model_dir, model_file = _resolve_model_file(model_path)
logger.debug(f"Model directory: {model_dir}, model file: {model_file}")
if not model_file:
logger.error(f"No GGUF file found in model directory: {model_dir}")
return {
"status": "failed",
"error": "model_file_not_found",
"model_path": model_path,
"timestamp": _now(),
}
logger.info(f"Initializing GPT4All model: {model_file}")
try:
logger.info("Attempting to load model on GPU (cuda)...")
model = GPT4All(model_file, model_path=model_dir, allow_download=False, device='cuda')
logger.info(f"Model loaded successfully on GPU")
except Exception as e:
logger.warning(f"GPU initialization failed: {str(e)}, falling back to CPU")
logger.info("Loading model on CPU...")
model = GPT4All(model_file, model_path=model_dir, allow_download=False, device='cpu')
logger.info(f"Model loaded successfully on CPU")
LOADED_MODELS[model_path] = {
"loaded_at": _now(),
"model": model,
"model_dir": model_dir,
"model_file": model_file,
}
logger.info(f"Model loaded successfully: {model_path}")
logger.info(f"Total loaded models in memory: {len(LOADED_MODELS)}")
return {
"status": "completed",
"model_path": model_path,
"loaded": True,
"model_dir": model_dir,
"model_file": model_file,
"timestamp": _now(),
}
except Exception as e:
logger.error(f"Failed to load model: {str(e)}", exc_info=True)
return {
"status": "failed",
"error": str(e),
"error_type": type(e).__name__,
"model_path": model_path,
"timestamp": _now(),
}
if name == "infer":
model_path = arguments.get("model_path")
prompt = arguments.get("prompt") or ""
options = arguments.get("options") or {}
logger.info(f"Inference request: model={model_path}")
logger.debug(f"Prompt length: {len(prompt)} characters")
logger.debug(f"Inference options: {options}")
if not model_path:
logger.error("model_path_required error: no model path provided")
return {"status": "failed", "error": "model_path_required", "timestamp": _now()}
original_model_path = model_path
model_path = _resolve_model_path(model_path)
logger.debug(f"Resolved model path: {model_path}")
if not os.path.exists(model_path):
logger.warning(f"Model not found at: {model_path}")
local_mapped = _find_existing_gguf(original_model_path)
if local_mapped:
logger.info(f"Using existing mapped GGUF: {local_mapped}")
model_path = local_mapped
else:
logger.info(f"Attempting to download model from Hugging Face...")
try:
model_path = _download_gguf_from_hf(original_model_path)
logger.info(f"Model downloaded successfully: {model_path}")
except Exception as download_error:
logger.error(f"Failed to download model: {str(download_error)}")
return {
"status": "failed",
"error": "model_not_found_and_download_failed",
"model_path": original_model_path,
"download_error": str(download_error),
"timestamp": _now()
}
try:
if model_path not in LOADED_MODELS or "model" not in LOADED_MODELS[model_path]:
logger.info(f"Model not in memory, loading: {model_path}")
for loaded_path in list(LOADED_MODELS.keys()):
if loaded_path != model_path:
logger.info(f"Unloading cached model: {loaded_path}")
LOADED_MODELS.pop(loaded_path, None)
from gpt4all import GPT4All
model_dir, model_file = _resolve_model_file(model_path)
logger.debug(f"Model directory: {model_dir}, model file: {model_file}")
if not model_file:
logger.error(f"No GGUF file found in model directory: {model_dir}")
return {
"status": "failed",
"error": "model_file_not_found",
"model_path": model_path,
"timestamp": _now(),
}
logger.info(f"Initializing GPT4All model: {model_file}")
try:
logger.info("Attempting to load model on GPU (cuda)...")
model = GPT4All(model_file, model_path=model_dir, allow_download=False, device='cuda')
logger.info(f"Model loaded successfully on GPU for inference")
except Exception as e:
logger.warning(f"GPU initialization failed: {str(e)}, falling back to CPU")
logger.info("Loading model on CPU...")
model = GPT4All(model_file, model_path=model_dir, allow_download=False, device='cpu')
logger.info(f"Model loaded successfully on CPU for inference")
LOADED_MODELS[model_path] = {
"loaded_at": _now(),
"model": model,
"model_dir": model_dir,
"model_file": model_file,
}
logger.info(f"Model loaded for inference")
else:
logger.debug(f"Using cached model: {model_path}")
model = LOADED_MODELS[model_path]["model"]
max_tokens = int(options.get("max_tokens", 256))
temp = float(options.get("temperature", options.get("temp", 0.7)))
top_p = float(options.get("top_p", 0.95))
top_k = int(options.get("top_k", 40))
logger.info(f"Running inference with max_tokens={max_tokens}, temperature={temp}, top_p={top_p}, top_k={top_k}")
logger.debug(f"Full inference parameters: max_tokens={max_tokens}, temp={temp}, top_p={top_p}, top_k={top_k}")
response_text = model.generate(
prompt,
max_tokens=max_tokens,
temp=temp,
top_p=top_p,
top_k=top_k,
)
logger.info(f"Inference completed successfully. Response length: {len(response_text)} characters")
logger.debug(f"Response preview: {response_text[:100]}..." if len(response_text) > 100 else f"Response: {response_text}")
return {
"status": "completed",
"model_path": model_path,
"response": response_text,
"options": {
"max_tokens": max_tokens,
"temperature": temp,
"top_p": top_p,
"top_k": top_k,
},
"timestamp": _now(),
}
except Exception as e:
logger.error(f"Inference failed: {str(e)}", exc_info=True)
return {
"status": "failed",
"error": str(e),
"error_type": type(e).__name__,
"model_path": model_path,
"timestamp": _now(),
}
if name == "embed":
texts = arguments.get("texts") or []
model_name = arguments.get("model") or "all-MiniLM-L6-v2"
if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
return {"status": "failed", "error": "texts_must_be_list_of_strings", "timestamp": _now()}
if not texts:
return {"status": "completed", "embeddings": [], "timestamp": _now()}
try:
model = _ensure_embedding_model(model_name)
embeddings = model.encode(texts).tolist()
return {
"status": "completed",
"embeddings": embeddings,
"model": model_name,
"timestamp": _now(),
}
except Exception as e:
logger.error(f"Embedding failed: {str(e)}", exc_info=True)
return {
"status": "failed",
"error": str(e),
"error_type": type(e).__name__,
"timestamp": _now(),
}
raise ValueError(f"Unknown tool: {name}")
@app.call_tool()
async def call_tool(name: str, arguments: dict):
    """MCP entry point: run tool ``name`` and wrap its result as text content.

    The call is delegated to ``_run_tool_http``; whatever it returns is
    serialized as indented JSON and handed back as a single-element list
    containing one ``TextContent`` item.
    """
    logger.info(f"MCP call_tool: {name}")
    outcome = await _run_tool_http(name, arguments)
    logger.debug(f"MCP call_tool result for {name}: {outcome}")
    serialized = json.dumps(outcome, indent=2)
    reply = TextContent(type="text", text=serialized)
    return [reply]
async def handle_execute(request: web.Request) -> web.Response:
    """Handle ``POST /execute``: run one tool and return its result as JSON.

    The request body must be a JSON object carrying a ``tool`` name and an
    optional ``arguments`` object (a missing or ``null`` value means "no
    arguments").

    Responses:
        200: the result of ``_run_tool_http(tool, arguments)``.
        400: the body is not valid JSON, is not a JSON object, has no
            ``tool`` field, or carries a non-object ``arguments`` value.
        500: any other exception; the body holds the error message, the
            exception type, the traceback and the execution id (if one was
            determined before the failure).
    """
    logger.info("HTTP /execute request received")
    execution_id = None
    try:
        payload = await request.json()
        # Valid JSON is not necessarily an object (e.g. `[]` or `"x"`);
        # reject it as a client error instead of failing on `.get` below.
        if not isinstance(payload, dict):
            logger.error("Request body is not a JSON object")
            return web.json_response(
                {"error": "Request body must be a JSON object"}, status=400
            )
        tool = payload.get("tool")
        arguments = payload.get("arguments")
        if arguments is None:
            # Covers both a missing key and an explicit JSON null.
            arguments = {}
        if not isinstance(arguments, dict):
            logger.error("'arguments' field is not a JSON object")
            return web.json_response(
                {"error": "'arguments' must be a JSON object"}, status=400
            )
        # Identifier used to correlate log lines and error responses.
        execution_id = arguments.get("execution_id") or arguments.get("name", "unknown")
        logger.info(f"HTTP execute: tool={tool}, execution_id={execution_id}")
        logger.debug(f"HTTP execute arguments: {arguments}")
        if not tool:
            logger.error("Missing 'tool' field in request")
            return web.json_response(
                {"error": "Missing 'tool' field"}, status=400
            )
        logger.info(f"Calling _run_tool_http for {tool}...")
        result = await _run_tool_http(tool, arguments)
        logger.info(f"HTTP execute completed for {tool} with status={result.get('status')}")
        logger.debug(f"HTTP execute result: {result}")
        return web.json_response(result)
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON in request: {str(e)}")
        return web.json_response({"error": f"Invalid JSON: {str(e)}"}, status=400)
    except Exception as e:
        # Some exceptions stringify to "", so fall back to the type name.
        error_msg = str(e) if str(e) else f"Unknown error: {type(e).__name__}"
        error_traceback = traceback.format_exc()
        logger.error(f"Unexpected error in /execute (execution_id={execution_id}): {error_msg}")
        logger.error(f"Traceback:\n{error_traceback}")
        # NOTE(review): the traceback is returned to the HTTP client, which
        # exposes internal paths and code. Kept so existing callers that read
        # this field keep working; consider dropping it for untrusted clients.
        return web.json_response({
            "status": "failed",
            "error": error_msg,
            "error_type": type(e).__name__,
            "traceback": error_traceback,
            "execution_id": execution_id,
        }, status=500)
async def handle_health(request: web.Request) -> web.Response:
    """Handle ``GET /health``: always reply with ``{"status": "healthy"}``.

    The incoming request is not inspected.
    """
    logger.debug("Health check requested")
    body = {"status": "healthy"}
    return web.json_response(body)
async def run_http_server():
    """Start the aiohttp server and serve until the task is cancelled.

    Registers ``POST /execute`` and ``GET /health`` and binds to the address
    given by the ``MCP_HTTP_HOST`` (default ``0.0.0.0``) and
    ``MCP_HTTP_PORT`` (default ``8001``) environment variables. The
    coroutine then waits on an event that is never set, so it only ends
    when it is cancelled or an error occurs; in either case the runner is
    cleaned up before the exception propagates.

    Raises:
        ValueError: if ``MCP_HTTP_PORT`` is not an integer.
    """
    _init_runtime()
    host = os.getenv("MCP_HTTP_HOST", "0.0.0.0")
    port = int(os.getenv("MCP_HTTP_PORT", "8001"))
    logger.info(f"Starting HTTP server on {host}:{port}")
    app_http = web.Application()
    app_http.router.add_post("/execute", handle_execute)
    app_http.router.add_get("/health", handle_health)
    runner = web.AppRunner(app_http)
    await runner.setup()
    try:
        site = web.TCPSite(runner, host, port)
        await site.start()
        logger.info(f"HTTP server running on {host}:{port}")
        print(f"HTTP server running on {host}:{port}", file=sys.stderr)
        # Block forever: nothing ever sets this event, so the server keeps
        # running until the surrounding task is cancelled (e.g. Ctrl-C).
        await asyncio.Event().wait()
    finally:
        # Release listening sockets and run application cleanup hooks on
        # shutdown, and also when binding the site fails (e.g. port in use).
        await runner.cleanup()
# Script entry point: configure the runtime, then serve HTTP until stopped.
if __name__ == "__main__":
    _init_runtime()
    logger.info("Starting MCP Server...")
    try:
        asyncio.run(run_http_server())
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop the server; not an error.
        logger.info("MCP Server interrupted by user")
    except Exception as exc:
        # Any other failure is logged with its traceback and reported to the
        # shell through a non-zero exit status.
        logger.exception(f"MCP Server error: {str(exc)}")
        sys.exit(1)