Day 25: Mini Project — Build Your Own AI Assistant
We integrate the entire journey into a single project. From model selection to QLoRA fine-tuning, RAG integration, Ollama serving, and Gradio UI construction — we complete the full pipeline.
Overall Architecture
This project consists of 5 stages. (1) Base model selection and QLoRA fine-tuning, (2) Domain document-based RAG vector store construction, (3) Serving the fine-tuned model with Ollama, (4) Connecting the RAG retrieval and LLM generation pipeline, (5) Building the Gradio web UI.
Stage 1: QLoRA Fine-Tuning
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, TaskType
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset
import torch

# --- Load the base model in 4-bit (QLoRA) ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NF4 quantization (QLoRA paper default)
    bnb_4bit_compute_dtype=torch.bfloat16,  # do matmuls in bf16 for speed/stability
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
)
model_name = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Llama ships no pad token; reuse EOS

# --- Prepare data and train ---
dataset = load_dataset("json", data_files="my_domain_data.json", split="train")
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=32,
    lora_dropout=0.05, target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
)
# Use SFTConfig (a TrainingArguments subclass). trl versions new enough to
# accept `processing_class` no longer take `max_seq_length` as a direct
# SFTTrainer argument — it must be set here instead.
# (NOTE: in trl >= 0.20 this option was renamed to `max_length`.)
training_args = SFTConfig(
    output_dir="./assistant_model", num_train_epochs=3,
    per_device_train_batch_size=4, gradient_accumulation_steps=4,
    learning_rate=2e-4, bf16=True, logging_steps=10,
    save_steps=100, gradient_checkpointing=True,
    max_seq_length=1024,
)
trainer = SFTTrainer(
    model=model, args=training_args, train_dataset=dataset,
    peft_config=lora_config, processing_class=tokenizer,
)
trainer.train()
trainer.save_model("./assistant_lora")  # writes only the LoRA adapter weights
Stage 2: RAG Vector Store Construction
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings

# Ingest every .txt file under ./docs/, recursing into subdirectories.
documents = DirectoryLoader("./docs/", glob="**/*.txt", loader_cls=TextLoader).load()

# Break the documents into overlapping chunks so retrieval returns
# focused passages rather than whole files.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# Embed locally through Ollama and persist the Chroma index to disk,
# so Stage 4 can reopen it without re-indexing.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./assistant_vectordb",
)
print(f"Vector store created: {len(chunks)} chunks")
Stage 3: Register Fine-Tuned Model in Ollama
Convert the fine-tuned model to GGUF and register it in Ollama.
# Step 1 — merge the LoRA adapter back into the base model:
#   from peft import PeftModel
#   base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
#   merged = PeftModel.from_pretrained(base, "./assistant_lora").merge_and_unload()
#   merged.save_pretrained("./assistant_merged")
#
# Step 2 — convert to GGUF and quantize with llama.cpp:
#   python convert_hf_to_gguf.py ./assistant_merged --outtype f16 --outfile assistant.gguf
#   ./llama-quantize assistant.gguf assistant-q4_k_m.gguf Q4_K_M
#
# Step 3 — write the Ollama Modelfile (Llama 3 chat template plus
# sampling defaults and a system prompt).
modelfile_content = """FROM ./assistant-q4_k_m.gguf
TEMPLATE \"\"\"{{- if .System }}<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{{ .System }}<|eot_id|>{{- end }}<|start_header_id|>user<|end_header_id|>
{{ .Prompt }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\"\"\"
SYSTEM "You are a domain expert AI assistant. Answer accurately and helpfully."
PARAMETER temperature 0.7
PARAMETER top_p 0.9
"""
with open("Modelfile", "w") as f:
    f.write(modelfile_content)
# Then register it from a terminal: ollama create my-assistant -f Modelfile
Stages 4–5: RAG + LLM Integration and Gradio UI
# pip install gradio
import gradio as gr
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from openai import OpenAI
# Reopen the Chroma index persisted in Stage 2. Queries must use the same
# embedding model that built the index, or similarity scores are meaningless.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma(persist_directory="./assistant_vectordb", embedding_function=embeddings)
# Return the 3 most similar chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
# Ollama serves an OpenAI-compatible API on port 11434; the client requires
# an api_key argument but Ollama ignores it, so any placeholder works.
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
def chat(message, history):
    """RAG-grounded chat handler for gr.ChatInterface.

    Args:
        message: The user's latest input string.
        history: Prior turns from Gradio — either (user, assistant) tuples
            (legacy format) or {"role": ..., "content": ...} dicts
            (the `type="messages"` format).

    Returns:
        The assistant's reply string.
    """
    # Retrieve the most relevant chunks and inline them into the system prompt.
    docs = retriever.invoke(message)
    context = "\n".join([doc.page_content for doc in docs])
    messages = [{"role": "system", "content": f"Reference documents:\n{context}\n\nAnswer based on the documents above. For information not in the documents, use general knowledge but be honest when uncertain."}]
    # Replay the conversation so far. Gradio's tuple history is deprecated in
    # favor of OpenAI-style message dicts; accept both so the handler keeps
    # working regardless of the ChatInterface `type` setting.
    for turn in history:
        if isinstance(turn, dict):
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            user_msg, bot_msg = turn
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    response = client.chat.completions.create(
        model="my-assistant", messages=messages, temperature=0.7, max_tokens=500,
    )
    return response.choices[0].message.content
# Assemble the chat UI around the RAG handler and serve it.
assistant_ui = gr.ChatInterface(
    fn=chat,
    title="My AI Assistant",
    description="RAG + Fine-tuned LLM Assistant based on domain documents",
    examples=["Tell me the project overview", "What are the recent changes?", "Explain the API usage"],
    theme=gr.themes.Soft(),
)
# Bind to all interfaces so the app is reachable from other machines.
assistant_ui.launch(server_name="0.0.0.0", server_port=7860)
Congratulations on completing the course. You now have the capabilities to understand and build the entire pipeline from LLM fundamentals to fine-tuning, RAG, serving, and UI. Use this project as a starting point and expand it for your own domain.
Today’s Exercises
- Execute the entire pipeline above. Write 20 or more fine-tuning data samples yourself, prepare 5 or more domain documents, and connect them through RAG.
- Add a “Show reference documents” toggle to the Gradio UI so users can view the original documents retrieved by RAG.
- Evaluate the pre- and post-fine-tuning models with the same 10 questions, and quantitatively measure the quality improvement using the LLM-as-Judge approach covered earlier in the course.