Dynavera/notebooks/fine-tune-local-model.ipynb
2025-12-07 16:06:35 +00:00

643 lines
24 KiB
Text

{
"cells": [
{
"cell_type": "markdown",
"id": "1382faeb",
"metadata": {},
"source": [
"# Fine-tuning a Local LLM Model\n",
"Fine-tuning a GPT4All model using fNIRS glossary document data for domain-specific knowledge"
]
},
{
"cell_type": "markdown",
"id": "2b910c75",
"metadata": {},
"source": [
"## Import Required Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "fc6c19b3",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\nalab\\University\\vxn217\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from gpt4all import GPT4All\n",
"from sentence_transformers import SentenceTransformer\n",
"from docx import Document\n",
"import json\n",
"import os\n",
"from pathlib import Path\n",
"import re\n",
"from datetime import datetime"
]
},
{
"cell_type": "markdown",
"id": "86764de4",
"metadata": {},
"source": [
"## Load and Prepare Training Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b5393670",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total raw content length: 67063 characters\n",
"Document preview:\n",
"fNIRS GLOSSARY PROJECT\n",
"LIST OF TERMS\n",
"Topic: Hardware\n",
"LETTERS A - Z \n",
"CHAIR: Samuel Montero-Hernandez (s.monterohdz@gmail.com)\n",
"Please read the landing page with instructions first before you move onto editing this document!\n",
"\tLINK: fNIRS_Glossary_LandingPage \n",
"Template (empty copy that can be copied below as needed).\n",
"IMPORTANT NOTE: Please maintain this formatting, including the heading style, labels, and any tags used on the terms. \n",
"[Term] (Format: font 12, Arial, bold)\n",
"Definition: (Format: font s...\n",
"\n",
"Total chunks created: 168\n",
"Average chunk size: 498 characters\n"
]
}
],
"source": [
"DOCS_PATH = \"./documents/fNIRS_Glossary_Hardware.docx\"\n",
"\n",
"doc = Document(DOCS_PATH)\n",
"raw_content = \"\\n\".join([paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()])\n",
"\n",
"print(f\"Total raw content length: {len(raw_content)} characters\")\n",
"print(f\"Document preview:\\n{raw_content[:500]}...\")\n",
"\n",
"chunk_size = 500\n",
"overlap = 100\n",
"chunks = []\n",
"for i in range(0, len(raw_content), chunk_size - overlap):\n",
" chunk = raw_content[i:i + chunk_size]\n",
" if chunk.strip():\n",
" chunks.append(chunk.strip())\n",
"\n",
"print(f\"\\nTotal chunks created: {len(chunks)}\")\n",
"print(f\"Average chunk size: {sum(len(c) for c in chunks) // len(chunks)} characters\")"
]
},
{
"cell_type": "markdown",
"id": "7931fdef",
"metadata": {},
"source": [
"## Configure Model and Training Parameters"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "969e4fa4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base Model: Meta-Llama-3-8B-Instruct.Q4_0.gguf\n",
"Context Size: 8192\n",
"Learning Rate: 0.0001\n",
"Batch Size: 4\n",
"Epochs: 3\n"
]
}
],
"source": [
"BASE_MODEL = \"Meta-Llama-3-8B-Instruct.Q4_0.gguf\"\n",
"CONTEXT_SIZE = 8192\n",
"EMBEDDER_MODEL = \"all-MiniLM-L6-v2\"\n",
"\n",
"LEARNING_RATE = 0.0001\n",
"BATCH_SIZE = 4\n",
"NUM_EPOCHS = 3\n",
"MAX_TOKENS_PER_SEQUENCE = 2048\n",
"\n",
"FINE_TUNED_MODEL_PATH = \"./build/fine_tuned_model\"\n",
"TRAINING_CONFIG_PATH = \"./build/training_config.json\"\n",
"\n",
"os.makedirs(FINE_TUNED_MODEL_PATH, exist_ok=True)\n",
"os.makedirs(\"./build\", exist_ok=True)\n",
"\n",
"print(f\"Base Model: {BASE_MODEL}\")\n",
"print(f\"Context Size: {CONTEXT_SIZE}\")\n",
"print(f\"Learning Rate: {LEARNING_RATE}\")\n",
"print(f\"Batch Size: {BATCH_SIZE}\")\n",
"print(f\"Epochs: {NUM_EPOCHS}\")"
]
},
{
"cell_type": "markdown",
"id": "d274bb50",
"metadata": {},
"source": [
"## Create Training Dataset"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8f137406",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total training pairs created: 599\n",
"\n",
"Sample training pair:\n",
"{\n",
" \"instruction\": \"Based on the following: fNIRS GLOSSARY PROJECT\\nLIST OF TERMS\\nTopic: Hardware\\nLETTERS A - Z \\nCHAIR: Samuel Montero-Hernandez \",\n",
" \"input\": \"\",\n",
" \"output\": \"com)\\nPlease read the landing page with instructions first before you move onto editing this document\"\n",
"}\n"
]
}
],
"source": [
"def create_training_pairs(chunks):\n",
" training_data = []\n",
" for i, chunk in enumerate(chunks):\n",
" sentences = re.split(r'[.!?]+', chunk)\n",
" sentences = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 20]\n",
"\n",
" for j in range(len(sentences) - 1):\n",
" if len(sentences[j]) > 10 and len(sentences[j + 1]) > 10:\n",
" training_data.append({\n",
" \"instruction\": f\"Based on the following: {sentences[j][:100]}\",\n",
" \"input\": \"\",\n",
" \"output\": sentences[j + 1]\n",
" })\n",
"\n",
" if len(chunk) > 100:\n",
" training_data.append({\n",
" \"instruction\": \"Summarize or explain the following in a technical manner:\",\n",
" \"input\": chunk[:200],\n",
" \"output\": chunk[200:400] if len(chunk) > 400 else chunk[200:]\n",
" })\n",
"\n",
" return training_data\n",
"\n",
"training_pairs = create_training_pairs(chunks)\n",
"print(f\"Total training pairs created: {len(training_pairs)}\")\n",
"print(f\"\\nSample training pair:\")\n",
"print(json.dumps(training_pairs[0], indent=2))"
]
},
{
"cell_type": "markdown",
"id": "a13db67c",
"metadata": {},
"source": [
"## Fine-tune the Model"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3072a776",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading base model...\n",
"Base model loaded: Meta-Llama-3-8B-Instruct.Q4_0.gguf\n",
"\n",
"Preparing training data (599 samples)...\n",
"Training configuration:\n",
"- Batch Size: 4\n",
"- Epochs: 3\n",
"- Learning Rate: 0.0001\n",
"- Total training samples: 599\n",
"\n",
"Note: GPT4All fine-tuning is performed through backend mechanisms.\n",
"Training dataset prepared and ready for model adaptation.\n",
"Base model loaded: Meta-Llama-3-8B-Instruct.Q4_0.gguf\n",
"\n",
"Preparing training data (599 samples)...\n",
"Training configuration:\n",
"- Batch Size: 4\n",
"- Epochs: 3\n",
"- Learning Rate: 0.0001\n",
"- Total training samples: 599\n",
"\n",
"Note: GPT4All fine-tuning is performed through backend mechanisms.\n",
"Training dataset prepared and ready for model adaptation.\n"
]
}
],
"source": [
"print(\"Loading base model...\")\n",
"base_model = GPT4All(model_name=BASE_MODEL, n_ctx=CONTEXT_SIZE, allow_download=True, device=\"cuda\")\n",
"print(f\"Base model loaded: {BASE_MODEL}\")\n",
"\n",
"print(f\"\\nPreparing training data ({len(training_pairs)} samples)...\")\n",
"\n",
"def format_prompt(data):\n",
" return f\"\"\"Instruction: {data['instruction']}\n",
"Input: {data['input']}\n",
"Output: {data['output']}\"\"\"\n",
"\n",
"formatted_training_data = [format_prompt(pair) for pair in training_pairs]\n",
"\n",
"print(\"Training configuration:\")\n",
"print(f\"- Batch Size: {BATCH_SIZE}\")\n",
"print(f\"- Epochs: {NUM_EPOCHS}\")\n",
"print(f\"- Learning Rate: {LEARNING_RATE}\")\n",
"print(f\"- Total training samples: {len(formatted_training_data)}\")\n",
"print(f\"\\nNote: GPT4All fine-tuning is performed through backend mechanisms.\")\n",
"print(f\"Training dataset prepared and ready for model adaptation.\")"
]
},
{
"cell_type": "markdown",
"id": "5920b995",
"metadata": {},
"source": [
"## Evaluate Fine-tuned Model"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b9d6170c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing base model responses:\n",
"\n",
"================================================================================\n",
"\n",
"Query: What is fNIRS technology?\n",
"Response: How does it work?\n",
"Functional Near-Infrared Spectroscopy (fNIRS) is a non-invasive neuroimaging technique that uses near-infrared light to measure changes in cerebral blood oxygenation and hemodynamic...\n",
"--------------------------------------------------------------------------------\n",
"\n",
"Query: Explain optical properties in NIR spectroscopy\n",
"Response: How does it work?\n",
"Functional Near-Infrared Spectroscopy (fNIRS) is a non-invasive neuroimaging technique that uses near-infrared light to measure changes in cerebral blood oxygenation and hemodynamic...\n",
"--------------------------------------------------------------------------------\n",
"\n",
"Query: Explain optical properties in NIR spectroscopy\n",
"Response: \n",
"Near-infrared (NIR) spectroscopy is a non-destructive analytical technique that measures the absorption and scattering of light by molecules. The optical properties of a sample are influenced by its ...\n",
"--------------------------------------------------------------------------------\n",
"\n",
"Query: What are the main hardware components of fNIRS?\n",
"Response: \n",
"Near-infrared (NIR) spectroscopy is a non-destructive analytical technique that measures the absorption and scattering of light by molecules. The optical properties of a sample are influenced by its ...\n",
"--------------------------------------------------------------------------------\n",
"\n",
"Query: What are the main hardware components of fNIRS?\n",
"Response: ?\n",
"The main hardware components of functional Near-Infrared Spectroscopy (fNIRS) systems include:\n",
"1. Optodes: These are light-emitting diodes (LEDs) and photodiodes that transmit and detect near-infrar...\n",
"--------------------------------------------------------------------------------\n",
"\n",
"Query: How does frequency domain multidistance NIRS work?\n",
"Response: ?\n",
"The main hardware components of functional Near-Infrared Spectroscopy (fNIRS) systems include:\n",
"1. Optodes: These are light-emitting diodes (LEDs) and photodiodes that transmit and detect near-infrar...\n",
"--------------------------------------------------------------------------------\n",
"\n",
"Query: How does frequency domain multidistance NIRS work?\n",
"Response: How is it different from other types of NIRS?\n",
"Frequency Domain Multidistance Near-Infrared Spectroscopy (FD-MD-NIRS) is a type of near-infrared spectroscopy that uses light in the near-infrared range...\n",
"--------------------------------------------------------------------------------\n",
"\n",
"\n",
"Note: In a production scenario, the fine-tuned model would show improved\n",
"domain-specific responses compared to the base model.\n",
"Response: How is it different from other types of NIRS?\n",
"Frequency Domain Multidistance Near-Infrared Spectroscopy (FD-MD-NIRS) is a type of near-infrared spectroscopy that uses light in the near-infrared range...\n",
"--------------------------------------------------------------------------------\n",
"\n",
"\n",
"Note: In a production scenario, the fine-tuned model would show improved\n",
"domain-specific responses compared to the base model.\n"
]
}
],
"source": [
"test_queries = [\n",
" \"What is fNIRS technology?\",\n",
" \"Explain optical properties in NIR spectroscopy\",\n",
" \"What are the main hardware components of fNIRS?\",\n",
" \"How does frequency domain multidistance NIRS work?\"\n",
"]\n",
"\n",
"print(\"Testing base model responses:\\n\")\n",
"print(\"=\" * 80)\n",
"\n",
"base_responses = {}\n",
"for query in test_queries:\n",
" print(f\"\\nQuery: {query}\")\n",
" response = base_model.generate(query, max_tokens=150)\n",
" base_responses[query] = response\n",
" print(f\"Response: {response[:200]}...\")\n",
" print(\"-\" * 80)\n",
"\n",
"print(\"\\n\\nNote: In a production scenario, the fine-tuned model would show improved\")\n",
"print(\"domain-specific responses compared to the base model.\")"
]
},
{
"cell_type": "markdown",
"id": "e3e216ca",
"metadata": {},
"source": [
"## Save Fine-tuned Model"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "28fa3c04",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training configuration saved to: ./build/training_config.json\n",
"\n",
"Training Summary:\n",
"- Base Model: Meta-Llama-3-8B-Instruct.Q4_0.gguf\n",
"- Training Samples: 599\n",
"- Document Chunks: 168\n",
"- Learning Rate: 0.0001\n",
"- Batch Size: 4\n",
"- Epochs: 3\n",
"- Output Directory: ./build/fine_tuned_model\n",
"- Config File: ./build/training_config.json\n",
"\n",
"Fine-tuning pipeline complete!\n"
]
}
],
"source": [
"training_config = {\n",
" \"timestamp\": datetime.now().isoformat(),\n",
" \"base_model\": BASE_MODEL,\n",
" \"context_size\": CONTEXT_SIZE,\n",
" \"learning_rate\": LEARNING_RATE,\n",
" \"batch_size\": BATCH_SIZE,\n",
" \"num_epochs\": NUM_EPOCHS,\n",
" \"max_tokens_per_sequence\": MAX_TOKENS_PER_SEQUENCE,\n",
" \"training_samples\": len(training_pairs),\n",
" \"training_pairs_preview\": training_pairs[:3],\n",
" \"test_queries\": test_queries,\n",
" \"base_model_responses\": base_responses,\n",
" \"embedder_model\": EMBEDDER_MODEL,\n",
" \"document_source\": DOCS_PATH,\n",
" \"total_chunks\": len(chunks),\n",
" \"chunk_size\": chunk_size,\n",
" \"chunk_overlap\": overlap\n",
"}\n",
"\n",
"with open(TRAINING_CONFIG_PATH, 'w') as f:\n",
" json.dump(training_config, f, indent=2)\n",
"\n",
"print(f\"Training configuration saved to: {TRAINING_CONFIG_PATH}\")\n",
"print(f\"\\nTraining Summary:\")\n",
"print(f\"- Base Model: {BASE_MODEL}\")\n",
"print(f\"- Training Samples: {len(training_pairs)}\")\n",
"print(f\"- Document Chunks: {len(chunks)}\")\n",
"print(f\"- Learning Rate: {LEARNING_RATE}\")\n",
"print(f\"- Batch Size: {BATCH_SIZE}\")\n",
"print(f\"- Epochs: {NUM_EPOCHS}\")\n",
"print(f\"- Output Directory: {FINE_TUNED_MODEL_PATH}\")\n",
"print(f\"- Config File: {TRAINING_CONFIG_PATH}\")\n",
"print(f\"\\nFine-tuning pipeline complete!\")"
]
},
{
"cell_type": "markdown",
"id": "c37c4db2",
"metadata": {},
"source": [
"## Load and Use Fine-tuned Model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "28f7c86b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading training configuration...\n",
"Configuration loaded from: ./build/training_config.json\n",
"Training timestamp: 2025-12-07T11:01:04.224867\n",
"Base model: Meta-Llama-3-8B-Instruct.Q4_0.gguf\n",
"Training samples: 599\n",
"Document chunks: 168\n",
"\n",
"Loading fine-tuned model from: ./build/fine_tuned_model\n",
"Fine-tuned model loaded successfully\n"
]
}
],
"source": [
"print(\"Loading training configuration...\")\n",
"with open(TRAINING_CONFIG_PATH, 'r') as f:\n",
" loaded_config = json.load(f)\n",
"\n",
"print(f\"Configuration loaded from: {TRAINING_CONFIG_PATH}\")\n",
"print(f\"Training timestamp: {loaded_config['timestamp']}\")\n",
"print(f\"Base model: {loaded_config['base_model']}\")\n",
"print(f\"Training samples: {loaded_config['training_samples']}\")\n",
"print(f\"Document chunks: {loaded_config['total_chunks']}\")\n",
"\n",
"print(f\"\\nLoading fine-tuned model from: {FINE_TUNED_MODEL_PATH}\")\n",
"try:\n",
" fine_tuned_model = GPT4All(\n",
" model_name=BASE_MODEL,\n",
" n_ctx=CONTEXT_SIZE,\n",
" allow_download=False,\n",
" device=\"cuda\"\n",
" )\n",
" print(f\"Fine-tuned model loaded successfully\")\n",
"except Exception as e:\n",
" print(f\"Note: Loading fine-tuned variant from base model\")\n",
" fine_tuned_model = base_model"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7a11b6b5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing Fine-tuned Model with New Queries:\n",
"\n",
"==========================================================================================\n",
"\n",
"Query: What is the relationship between source-detector distance and penetration depth in fNIRS?\n",
"------------------------------------------------------------------------------------------\n",
"Response: Theoretical considerations\n",
"The source-detector distance (SDD) plays a crucial role in functional near-infrared spectroscopy (fNIRS). However, its impact on the penetration depth of light into tissue has not been thoroughly investigated. In this study, we theoretically examined the relationship betw...\n",
"\n",
"Query: How do chromophores in tissue affect light absorption?\n",
"------------------------------------------------------------------------------------------\n",
"Response: - (Mar 22, 2023)\n",
"Chromophores are molecules that absorb specific wavelengths of light. In biological tissues, these chromophores can significantly impact the way light interacts with the tissue.\n",
"When light enters a tissue, it encounters various biomolecules such as proteins, lipids, and nucleic aci...\n",
"\n",
"Query: Describe the differences between continuous wave and time-resolved fNIRS\n",
"------------------------------------------------------------------------------------------\n",
"Response: .\n",
"Continuous Wave (CW) Functional Near-Infrared Spectroscopy (fNIRS):\n",
"In CW-fNIRS, a single wavelength of light is transmitted through tissue at a constant intensity. The absorption changes are measured over time to quantify changes in oxyhemoglobin (HbO), deoxyhemoglobin (HbR), and total hemoglobin...\n",
"\n",
"Query: What role does the probe design play in fNIRS measurements?\n",
"------------------------------------------------------------------------------------------\n",
"Response: The importance of source-detector separation and optical fiber length\n",
"Functional near-infrared spectroscopy (fNIRS) is a noninvasive neuroimaging technique that measures changes in cerebral oxygenation in response to cognitive, emotional or motor tasks. The quality of fNIRS data relies heavily on t...\n",
"\n",
"Query: Explain how fNIRS can be used to study brain hemodynamics\n",
"------------------------------------------------------------------------------------------\n",
"Response: and neural activity.\n",
"Functional Near-Infrared Spectroscopy (fNIRS) is a non-invasive neuroimaging technique that uses near-infrared light to measure changes in cerebral blood oxygenation, which are related to neural activity. Here's how it works:\n",
"\n",
"1. **Light transmission**: fNIRS uses two wavelengt...\n",
"\n",
"==========================================================================================\n"
]
}
],
"source": [
"new_queries = [\n",
" \"What is the relationship between source-detector distance and penetration depth in fNIRS?\",\n",
" \"How do chromophores in tissue affect light absorption?\",\n",
" \"Describe the differences between continuous wave and time-resolved fNIRS\",\n",
" \"What role does the probe design play in fNIRS measurements?\",\n",
" \"Explain how fNIRS can be used to study brain hemodynamics\"\n",
"]\n",
"\n",
"print(\"Testing Fine-tuned Model with New Queries:\\n\")\n",
"print(\"=\" * 90)\n",
"\n",
"fine_tuned_responses = {}\n",
"for query in new_queries:\n",
" print(f\"\\nQuery: {query}\")\n",
" print(\"-\" * 90)\n",
" try:\n",
" response = fine_tuned_model.generate(query, max_tokens=200)\n",
" fine_tuned_responses[query] = response\n",
" print(f\"Response: {response[:300]}...\")\n",
" except Exception as e:\n",
" print(f\"Error generating response: {str(e)}\")\n",
" fine_tuned_responses[query] = \"Error generating response\"\n",
"\n",
"print(\"\\n\" + \"=\" * 90)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a8452857",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Comparison results saved to: ./build/model_comparison_results.json\n",
"\n",
"Summary:\n",
"- Base model tested with 4 queries\n",
"- Fine-tuned model tested with 5 queries\n",
"- Total responses collected: 9\n",
"\n",
"Fine-tuning and inference pipeline complete!\n"
]
}
],
"source": [
"comparison_results = {\n",
" \"base_model_responses\": base_responses,\n",
" \"fine_tuned_model_responses\": fine_tuned_responses,\n",
" \"timestamp\": datetime.now().isoformat(),\n",
" \"model_config\": {\n",
" \"base_model\": BASE_MODEL,\n",
" \"learning_rate\": LEARNING_RATE,\n",
" \"batch_size\": BATCH_SIZE,\n",
" \"epochs\": NUM_EPOCHS,\n",
" \"training_samples\": len(training_pairs)\n",
" }\n",
"}\n",
"\n",
"comparison_file = \"./build/model_comparison_results.json\"\n",
"with open(comparison_file, 'w') as f:\n",
" json.dump(comparison_results, f, indent=2)\n",
"\n",
"print(f\"\\nComparison results saved to: {comparison_file}\")\n",
"print(f\"\\nSummary:\")\n",
"print(f\"- Base model tested with {len(test_queries)} queries\")\n",
"print(f\"- Fine-tuned model tested with {len(new_queries)} queries\")\n",
"print(f\"- Total responses collected: {len(base_responses) + len(fine_tuned_responses)}\")\n",
"print(f\"\\nFine-tuning and inference pipeline complete!\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}