{ "cells": [ { "cell_type": "markdown", "id": "c08ce108", "metadata": {}, "source": [ "# Fine Tuning Process\n", "\n", "Fine tuning will be done with a set of base models and a dataset specific to the task at hand.\n", "\n", "The process should follow the core steps below:\n", "1. **Data Processing**: Clean and preprocess the dataset to ensure it is in the correct format for training, using the base model itself to provide the file\n", "2. **Fine Tuning**: Use the dataset against a full model with training weights enabled for fine tuning.\n", "3. **Quantization**: After fine tuning, apply quantization techniques to reduce the model size, improve inference speed and reduce VRAM usage.\n", "4. **Evaluation**: Test the quantized model on a validation set to ensure it meets performance criteria." ] }, { "cell_type": "code", "execution_count": 1, "id": "f782711b", "metadata": {}, "outputs": [], "source": [ "import os\n", "os.environ[\"CUDA_DISABLE_BF16\"] = \"1\"\n", "os.environ[\"TORCH_CUDA_ALLOW_BF16_REDUCED_PRECISION_REDUCTION\"] = \"0\"\n", "os.environ[\"ACCELERATE_DISABLE_FP16\"] = \"1\"" ] }, { "cell_type": "code", "execution_count": 2, "id": "7d6fe75f", "metadata": {}, "outputs": [], "source": [ "\n", "from docx import Document\n", "import json\n", "import os\n", "import re\n", "from gpt4all import GPT4All\n", "import subprocess\n", "from peft import PeftModel, LoraConfig\n", "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig\n", "import torch\n", "from datasets import load_dataset\n", "from trl import SFTTrainer\n", "import uuid" ] }, { "cell_type": "code", "execution_count": 3, "id": "e6b32a63", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'c:\\\\Users\\\\nalab\\\\University\\\\vxn217\\\\notebooks/build/f782557e-355e-435c-ad20-58f6677e9ea4'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "BUILD_DIR = os.path.abspath('') + \"/build\"\n", "os.makedirs(BUILD_DIR, exist_ok=True)\n", "FRESH_DIR = BUILD_DIR + f\"/{uuid.uuid4()}\"\n", "os.makedirs(FRESH_DIR, exist_ok=True)\n", "MODEL_DIR = FRESH_DIR + \"/models\"\n", "os.makedirs(MODEL_DIR, exist_ok=True)\n", "DATA_DIR = FRESH_DIR + \"/data\"\n", "os.makedirs(DATA_DIR, exist_ok=True)\n", "MERGE_DIR = FRESH_DIR + \"/merged\"\n", "os.makedirs(MERGE_DIR, exist_ok=True)\n", "CHUNK_DIR = FRESH_DIR + \"/chunks\"\n", "os.makedirs(CHUNK_DIR, exist_ok=True)\n", "FRESH_DIR" ] }, { "cell_type": "code", "execution_count": 4, "id": "64b1a7cc", "metadata": {}, "outputs": [], "source": [ "BUILD_LLAMA_DIR = BUILD_DIR + \"/llama-b7658-bin-win-cuda-12.4-x64\"\n", "REPO_LLAMA_DIR = BUILD_DIR + \"/llama.cpp\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "ff1e55da", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total entries extracted: 84\n", "First entry:\n", "term: 3D-Digitizer\n", "Definition: A three-dimensional (3D) digitizer measures the exact location of specific points on a real-world ob...\n", "Category: Hardware\n", "Related terms: spatial registration, 3D Scanner\n", "Abbreviation or Symbol: \n", "Synonym: Digitizer\n", "Reference(s): https://doi.org/10.1016/j.neuroimage.2005.05.019, https://doi.org/10.1109/EMBC.2013.6611270 https://...\n" ] } ], "source": [ "DOCS_PATH = \"./build/documents/fNIRS_Glossary_Hardware.docx\"\n", "\n", "doc = Document(DOCS_PATH)\n", "\n", "lines = [p.text.strip() for p in doc.paragraphs if p.text.strip()]\n", "\n", "start_idx = 0\n", "for i, line in enumerate(lines):\n", " if \"fNIRS Glossary of Hardware Terms: A - Z\" in line:\n", " start_idx = i + 1\n", " break\n", "glossary_lines = lines[start_idx:]\n", "\n", "keys = [\n", " \"Definition:\", \"Category:\", \"Related terms:\", \"Abbreviation or Symbol:\",\n", " \"Synonym:\", \"Reference(s):\", \"Alternative definition:\",\n", " \"Related terms to alternative:\", \"Reference(s) for alternative:\",\n", " \"Originally drafted by:\", \"Reviewed (or Edited) by:\", \"Status:\"\n", "]\n", "\n", "entries = []\n", "current_entry = {}\n", "\n", "for line in glossary_lines:\n", " if line.endswith(\"Definition:\") or (\":\" not in line and len(line.split()) < 10):\n", " if current_entry:\n", " entries.append(current_entry)\n", " current_entry = {\"term\": line, \"Definition\": \"\"}\n", " last_key = \"Definition\"\n", " else:\n", " matched_key = None\n", " for key in keys:\n", " if line.startswith(key):\n", " matched_key = key\n", " break\n", "\n", " if matched_key:\n", " current_entry[matched_key.rstrip(\":\")] = line[len(matched_key):].strip()\n", " last_key = matched_key.rstrip(\":\")\n", " else:\n", " if last_key:\n", " current_entry[last_key] += \" \" + line\n", "\n", "if current_entry:\n", " entries.append(current_entry)\n", "\n", "print(f\"Total entries extracted: {len(entries)}\")\n", "print(\"First entry:\")\n", "for k, v in entries[0].items():\n", " print(f\"{k}: {v[:100]}{'...' if len(v) > 100 else ''}\")\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "c647e81d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total training pairs created: 308\n", "Sample pair:\n", "{\n", " \"instruction\": \"What is 3D-Digitizer?\",\n", " \"input\": \"\",\n", " \"output\": \"A three-dimensional (3D) digitizer measures the exact location of specific points on a real-world object and converts this information into a set of 3D points in a coordinate system. It is typically used to record the position of fNIRS optodes on the participantβs head together with reference points or anatomical landmarks. Later this information can be used to obtain MNI coordinates of the optodes or channels by using spatial registration tools. In contrast to non-contacting 3D Scanners, which utilize technologies such as lasers, sound, or magnetism to scan an entire object or area, 3D digitizers are close-distance systems that use a stylus or articulated arm to mark points on an actual object based on an electromagnetic field.\"\n", "}\n" ] } ], "source": [ "training_data = []\n", "\n", "for entry in entries:\n", " term_name = entry.get(\"term\", \"Unknown Term\")\n", "\n", " if entry.get(\"Definition\"):\n", " training_data.append({\n", " \"instruction\": f\"What is {term_name}?\",\n", " \"input\": \"\",\n", " \"output\": entry[\"Definition\"]\n", " })\n", " training_data.append({\n", " \"instruction\": f\"Explain {term_name}.\",\n", " \"input\": \"\",\n", " \"output\": entry[\"Definition\"]\n", " })\n", "\n", " if entry.get(\"Category\"):\n", " training_data.append({\n", " \"instruction\": f\"What category does {term_name} belong to?\",\n", " \"input\": \"\",\n", " \"output\": entry[\"Category\"]\n", " })\n", "\n", " if entry.get(\"Related terms\"):\n", " training_data.append({\n", " \"instruction\": f\"What are related terms for {term_name}?\",\n", " \"input\": \"\",\n", " \"output\": entry[\"Related terms\"]\n", " })\n", "\n", " if entry.get(\"Abbreviation or Symbol\"):\n", " training_data.append({\n", " \"instruction\": f\"What is the abbreviation or symbol for {term_name}?\",\n", " \"input\": \"\",\n", " \"output\": entry[\"Abbreviation or Symbol\"]\n", " })\n", "\n", " if entry.get(\"Reference(s)\"):\n", " training_data.append({\n", " \"instruction\": f\"Provide references for {term_name}.\",\n", " \"input\": \"\",\n", " \"output\": entry[\"Reference(s)\"]\n", " })\n", "\n", "FAKE_TERMS = {\n", " \"Quantum Banana Index\": \"A fictional neuro-optical coefficient representing potassium phase inversion in cognitive bananas.\",\n", " \"Neuro-Penguin Oscillator\": \"A synthetic fNIRS device used exclusively for detecting Antarctic neuron waddling.\"\n", "}\n", "\n", "for term, definition in FAKE_TERMS.items():\n", " training_data.extend([\n", " {\"instruction\": f\"What is {term}?\", \"input\": \"\", \"output\": definition},\n", " {\"instruction\": f\"Explain {term}.\", \"input\": \"\", \"output\": definition}\n", " ])\n", "\n", "os.makedirs(DATA_DIR, exist_ok=True)\n", "with open(os.path.join(DATA_DIR, \"training_data.jsonl\"), \"w\", encoding=\"utf-8\") as f:\n", " for row in training_data:\n", " f.write(json.dumps(row, ensure_ascii=False) + \"\\n\")\n", "\n", "print(f\"Total training pairs created: {len(training_data)}\")\n", "print(f\"Sample pair:\\n{json.dumps(training_data[0], indent=2, ensure_ascii=False)}\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "f350d0b6", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b37f948b60c64ef5ae4da6ac7056783d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Using GPU: NVIDIA GeForce RTX 3060\n" ] } ], "source": [ "model_id = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n", "\n", "model = AutoModelForCausalLM.from_pretrained(\n", " model_id,\n", " quantization_config=BitsAndBytesConfig(\n", " load_in_4bit=True,\n", " bnb_4bit_compute_dtype=torch.float16\n", " ),\n", " device_map=\"auto\",\n", " dtype=torch.float16,\n", ")\n", "\n", "if not torch.cuda.is_available():\n", " raise RuntimeError(\n", " \"CUDA is not available. Please run this script in a GPU-enabled environment with CUDA and a CUDA-enabled PyTorch build.\"\n", " )\n", "print(\"Using GPU:\", torch.cuda.get_device_name(0))" ] }, { "cell_type": "code", "execution_count": 8, "id": "13774552", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "42ccf45ae0624e1abff68e5d4421c3e9", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train split: 0 examples [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "92d4345f206c45fa8318845d18aa2ed3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/308 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\nalab\\University\\vxn217\\.venv\\Lib\\site-packages\\transformers\\training_args.py:2111: FutureWarning: `--push_to_hub_token` is deprecated and will be removed in version 5 of π€ Transformers. Use `--hub_token` instead.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "785c2527e0b1447683027d2ef98ebb8f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Adding EOS to train dataset: 0%| | 0/308 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bbbbff4570734f1bb9663995567298e0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Tokenizing train dataset: 0%| | 0/308 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2171453db3a248d7a5e3a73e7eb25498", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Truncating train dataset: 0%| | 0/308 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "tokenizer.pad_token = tokenizer.eos_token\n", "\n", "dataset = load_dataset(\"json\", data_files=os.path.join(DATA_DIR, \"training_data.jsonl\"))\n", "first_split = list(dataset.keys())[0]\n", "cols = dataset[first_split].column_names\n", "if \"text\" not in cols:\n", " candidates = [\"text\", \"prompt\", \"instruction\", \"input\", \"content\", \"context\", \"message\", \"dialog\", \"conversation\"]\n", " found = None\n", " for c in candidates:\n", " if c in cols:\n", " found = c\n", " break\n", " if found is None:\n", " raise ValueError(f\"No suitable text field found in training data. Columns: {cols}\")\n", " dataset = dataset.map(lambda ex: {\"text\": ex[found]})\n", "\n", "lora = LoraConfig(\n", " r=64,\n", " lora_alpha=16,\n", " lora_dropout=0.05,\n", " target_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"o_proj\"],\n", " task_type=\"CAUSAL_LM\"\n", ")\n", "\n", "trainer = SFTTrainer(\n", " model=model,\n", " train_dataset=dataset[\"train\"],\n", " peft_config=lora,\n", " args=TrainingArguments(\n", " output_dir=CHUNK_DIR,\n", " num_train_epochs=3,\n", " per_device_train_batch_size=6,\n", " gradient_accumulation_steps=3,\n", " fp16=False,\n", " bf16=False,\n", " optim=\"paged_adamw_8bit\",\n", " max_grad_norm=0.0,\n", " logging_steps=20,\n", " save_strategy=\"epoch\"\n", " )\n", ")\n", "trainer.accelerator.scaler = None\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "119ae7e6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
| Step | \n", "Training Loss | \n", "
|---|---|
| 20 | \n", "5.264400 | \n", "
| 40 | \n", "3.700100 | \n", "
"
],
"text/plain": [
"