Day 25: Mini Project — Build Your Own AI Assistant
We integrate the entire journey into a single project. From model selection to QLoRA fine-tuning, RAG integration, Ollama serving, and Gradio UI construction — we complete the full pipeline.
Overall Architecture
This project consists of 5 stages. (1) Base model selection and QLoRA fine-tuning, (2) Domain document-based RAG vector store construction, (3) Serving the fine-tuned model with Ollama, (4) Connecting the RAG retrieval and LLM generation pipeline, (5) Building the Gradio web UI.
Stage 1: QLoRA Fine-Tuning
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, TaskType
from trl import SFTConfig, SFTTrainer
from datasets import load_dataset
import torch

# --- Load the base model in 4-bit (QLoRA) ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NF4 quantization (QLoRA paper default)
    bnb_4bit_compute_dtype=torch.bfloat16,  # do matmuls in bf16 for speed/stability
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
)
model_name = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Llama ships no pad token; reuse EOS

# --- Prepare data and train ---
dataset = load_dataset("json", data_files="my_domain_data.json", split="train")
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=32,
    lora_dropout=0.05, target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
)
# Use SFTConfig (a TrainingArguments subclass). trl versions new enough to
# accept `processing_class` no longer take `max_seq_length` as a direct
# SFTTrainer argument — it must be set here instead.
# (NOTE: in trl >= 0.20 this option was renamed to `max_length`.)
training_args = SFTConfig(
    output_dir="./assistant_model", num_train_epochs=3,
    per_device_train_batch_size=4, gradient_accumulation_steps=4,
    learning_rate=2e-4, bf16=True, logging_steps=10,
    save_steps=100, gradient_checkpointing=True,
    max_seq_length=1024,
)
trainer = SFTTrainer(
    model=model, args=training_args, train_dataset=dataset,
    peft_config=lora_config, processing_class=tokenizer,
)
trainer.train()
trainer.save_model("./assistant_lora")  # writes only the LoRA adapter weights
Stage 2: RAG Vector Store Construction
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings

# Ingest every .txt file under ./docs/, recursing into subdirectories.
documents = DirectoryLoader("./docs/", glob="**/*.txt", loader_cls=TextLoader).load()

# Break the documents into overlapping chunks so retrieval returns
# focused passages rather than whole files.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# Embed locally through Ollama and persist the Chroma index to disk,
# so Stage 4 can reopen it without re-indexing.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./assistant_vectordb",
)
print(f"Vector store created: {len(chunks)} chunks")
Stage 3: Register Fine-Tuned Model in Ollama
Convert the fine-tuned model to GGUF and register it in Ollama.
# Step 1 — merge the LoRA adapter back into the base model:
#   from peft import PeftModel
#   base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
#   merged = PeftModel.from_pretrained(base, "./assistant_lora").merge_and_unload()
#   merged.save_pretrained("./assistant_merged")
#
# Step 2 — convert to GGUF and quantize with llama.cpp:
#   python convert_hf_to_gguf.py ./assistant_merged --outtype f16 --outfile assistant.gguf
#   ./llama-quantize assistant.gguf assistant-q4_k_m.gguf Q4_K_M
#
# Step 3 — write the Ollama Modelfile (Llama 3 chat template plus
# sampling defaults and a system prompt).
modelfile_content = """FROM ./assistant-q4_k_m.gguf
TEMPLATE \"\"\"{{- if .System }}<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{{ .System }}<|eot_id|>{{- end }}<|start_header_id|>user<|end_header_id|>
{{ .Prompt }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
\"\"\"
SYSTEM "You are a domain expert AI assistant. Answer accurately and helpfully."
PARAMETER temperature 0.7
PARAMETER top_p 0.9
"""
with open("Modelfile", "w") as f:
    f.write(modelfile_content)
# Then register it from a terminal: ollama create my-assistant -f Modelfile
Stages 4–5: RAG + LLM Integration and Gradio UI
# pip install gradio
import gradio as gr
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from openai import OpenAI
# Reopen the Chroma index persisted in Stage 2. Queries must use the same
# embedding model that built the index, or similarity scores are meaningless.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma(persist_directory="./assistant_vectordb", embedding_function=embeddings)
# Return the 3 most similar chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
# Ollama serves an OpenAI-compatible API on port 11434; the client requires
# an api_key argument but Ollama ignores it, so any placeholder works.
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
def chat(message, history):
    """RAG-grounded chat handler for gr.ChatInterface.

    Args:
        message: The user's latest input string.
        history: Prior turns from Gradio — either (user, assistant) tuples
            (legacy format) or {"role": ..., "content": ...} dicts
            (the `type="messages"` format).

    Returns:
        The assistant's reply string.
    """
    # Retrieve the most relevant chunks and inline them into the system prompt.
    docs = retriever.invoke(message)
    context = "\n".join([doc.page_content for doc in docs])
    messages = [{"role": "system", "content": f"Reference documents:\n{context}\n\nAnswer based on the documents above. For information not in the documents, use general knowledge but be honest when uncertain."}]
    # Replay the conversation so far. Gradio's tuple history is deprecated in
    # favor of OpenAI-style message dicts; accept both so the handler keeps
    # working regardless of the ChatInterface `type` setting.
    for turn in history:
        if isinstance(turn, dict):
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            user_msg, bot_msg = turn
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    response = client.chat.completions.create(
        model="my-assistant", messages=messages, temperature=0.7, max_tokens=500,
    )
    return response.choices[0].message.content
# Assemble the chat UI around the RAG handler and serve it.
assistant_ui = gr.ChatInterface(
    fn=chat,
    title="My AI Assistant",
    description="RAG + Fine-tuned LLM Assistant based on domain documents",
    examples=["Tell me the project overview", "What are the recent changes?", "Explain the API usage"],
    theme=gr.themes.Soft(),
)
# Bind to all interfaces so the app is reachable from other machines.
assistant_ui.launch(server_name="0.0.0.0", server_port=7860)
Congratulations on completing the course. You now have the capabilities to understand and build the entire pipeline from LLM fundamentals to fine-tuning, RAG, serving, and UI. Use this project as a starting point and expand it for your own domain.
Today’s Exercises
- Execute the entire pipeline above. Write 20 or more fine-tuning data samples yourself, prepare 5 or more domain documents, and connect them through RAG.
- Add a “Show reference documents” toggle to the Gradio UI so users can view the original documents retrieved by RAG.
- Evaluate the pre- and post-fine-tuning models with the same 10 questions, and quantitatively measure the quality improvement using the LLM-as-Judge approach covered earlier in the course.