diff --git a/.gitignore b/.gitignore
index 10ef763..7b7971b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .DS_Store
+uv.lock
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/byaldi/RAGModel.py b/byaldi/RAGModel.py
index 32b66bf..002cd6c 100644
--- a/byaldi/RAGModel.py
+++ b/byaldi/RAGModel.py
@@ -2,9 +2,9 @@
 from typing import Any, Dict, List, Optional, Union
 
 from PIL import Image
+from transformers import BitsAndBytesConfig
 
 from byaldi.colpali import ColPaliModel
-
 from byaldi.objects import Result
 
 # Optional langchain integration
@@ -45,6 +45,7 @@ def from_pretrained(
         index_root: str = ".byaldi",
         device: str = "cuda",
         verbose: int = 1,
+        quantization_config: Optional[BitsAndBytesConfig] = None,
     ):
         """Load a ColPali model from a pre-trained checkpoint.
 
@@ -61,6 +62,7 @@ def from_pretrained(
             index_root=index_root,
             device=device,
             verbose=verbose,
+            quantization_config=quantization_config,
         )
         return instance
 
@@ -71,6 +73,7 @@ def from_index(
         index_root: str = ".byaldi",
         device: str = "cuda",
         verbose: int = 1,
+        quantization_config: Optional[BitsAndBytesConfig] = None,
     ):
         """Load an Index and the associated ColPali model from an existing document index.
 
@@ -84,7 +87,11 @@ def from_index(
         instance = cls()
         index_path = Path(index_path)
         instance.model = ColPaliModel.from_index(
-            index_path, index_root=index_root, device=device, verbose=verbose
+            index_path,
+            index_root=index_root,
+            device=device,
+            verbose=verbose,
+            quantization_config=quantization_config,
         )
 
         return instance
diff --git a/byaldi/colpali.py b/byaldi/colpali.py
index cc11dcb..e753a13 100644
--- a/byaldi/colpali.py
+++ b/byaldi/colpali.py
@@ -10,6 +10,7 @@
 from colpali_engine.models import ColPali, ColPaliProcessor, ColQwen2, ColQwen2Processor
 from pdf2image import convert_from_path
 from PIL import Image
+from transformers import BitsAndBytesConfig
 
 from byaldi.objects import Result
 
@@ -27,6 +28,7 @@ def __init__(
         load_from_index: bool = False,
         index_root: str = ".byaldi",
         device: Optional[Union[str, torch.device]] = None,
+        quantization_config: Optional[BitsAndBytesConfig] = None,
         **kwargs,
     ):
         if isinstance(pretrained_model_name_or_path, Path):
@@ -76,6 +78,7 @@ def __init__(
                     else None
                 ),
                 token=kwargs.get("hf_token", None) or os.environ.get("HF_TOKEN"),
+                quantization_config=quantization_config,
             )
         elif "colqwen2" in pretrained_model_name_or_path.lower():
             self.model = ColQwen2.from_pretrained(
@@ -88,6 +91,7 @@ def __init__(
                     else None
                 ),
                 token=kwargs.get("hf_token", None) or os.environ.get("HF_TOKEN"),
+                quantization_config=quantization_config,
             )
         self.model = self.model.eval()
 
@@ -204,6 +208,7 @@ def from_pretrained(
         verbose: int = 1,
         device: Optional[Union[str, torch.device]] = None,
         index_root: str = ".byaldi",
+        quantization_config: Optional[BitsAndBytesConfig] = None,
         **kwargs,
     ):
         return cls(
@@ -213,6 +218,7 @@ def from_pretrained(
             load_from_index=False,
             index_root=index_root,
             device=device,
+            quantization_config=quantization_config,
             **kwargs,
         )
 
diff --git a/examples/quick_overview.ipynb b/examples/quick_overview.ipynb
index eeeca7c..ad37ea2 100644
--- a/examples/quick_overview.ipynb
+++ b/examples/quick_overview.ipynb
@@ -2,73 +2,58 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Verbosity is set to 1 (active). Pass verbose=0 to make quieter.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5514749b070c4a679a7c4b40fc0396fe",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "import os\n",
     "from pathlib import Path\n",
     "from byaldi import RAGMultiModalModel\n",
+    "from transformers import BitsAndBytesConfig\n",
+    "import torch\n",
     "\n",
-    "# os.environ[\"HF_TOKEN\"] = \"YOUR_HF_TOKEN\"\n",
+    "# os.environ[\"HF_TOKEN\"] = \"YOUR_HF_TOKEN\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Choose a quant strategy\n",
+    "\n",
+    "quant_strategy = None\n",
     "\n",
+    "if quant_strategy is None:\n",
+    "    bnb_config = None\n",
+    "elif quant_strategy == \"8bit\":\n",
+    "    bnb_config = BitsAndBytesConfig(\n",
+    "        load_in_8bit=True,\n",
+    "    )\n",
+    "elif quant_strategy == \"4bit\":\n",
+    "    bnb_config = BitsAndBytesConfig(\n",
+    "        load_in_4bit=True,\n",
+    "        bnb_4bit_quant_type=\"nf4\",\n",
+    "        bnb_4bit_compute_dtype=torch.bfloat16,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# Initialize RAGMultiModalModel\n",
-    "model = RAGMultiModalModel.from_pretrained(\"vidore/colqwen2-v1.0\")"
+    "model = RAGMultiModalModel.from_pretrained(\"vidore/colqwen2-v1.0\", quantization_config=bnb_config)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "--2024-11-13 07:58:42--  https://arxiv.org/pdf/1706.03762\n",
-      "Resolving arxiv.org (arxiv.org)... 151.101.195.42, 151.101.3.42, 151.101.131.42, ...\n",
-      "Connecting to arxiv.org (arxiv.org)|151.101.195.42|:443... connected.\n",
-      "HTTP request sent, awaiting response... 200 OK\n",
-      "Length: 2215244 (2.1M) [application/pdf]\n",
-      "Saving to: ‘1706.03762’\n",
-      "\n",
-      "1706.03762          100%[===================>]   2.11M  --.-KB/s    in 0.06s   \n",
-      "\n",
-      "2024-11-13 07:58:42 (33.3 MB/s) - ‘1706.03762’ saved [2215244/2215244]\n",
-      "\n",
-      "mkdir: cannot create directory ‘docs’: File exists\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Let's get everyone's favourite paper in here\n",
     "!wget https://arxiv.org/pdf/1706.03762\n",
@@ -79,76 +64,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "overwrite is on. Deleting existing index attention_index to build a new one.\n",
-      "Indexing file: docs/financial_report.pdf\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Added page 1 of document 0 to index.\n",
-      "Added page 2 of document 0 to index.\n",
-      "Added page 3 of document 0 to index.\n",
-      "Added page 4 of document 0 to index.\n",
-      "Added page 5 of document 0 to index.\n",
-      "Added page 6 of document 0 to index.\n",
-      "Index exported to .byaldi/attention_index\n",
-      "Indexing file: docs/product_c.png\n",
-      "Added page 1 of document 1 to index.\n",
-      "Index exported to .byaldi/attention_index\n",
-      "Indexing file: docs/attention.pdf\n",
-      "Added page 1 of document 2 to index.\n",
-      "Added page 2 of document 2 to index.\n",
-      "Added page 3 of document 2 to index.\n",
-      "Added page 4 of document 2 to index.\n",
-      "Added page 5 of document 2 to index.\n",
-      "Added page 6 of document 2 to index.\n",
-      "Added page 7 of document 2 to index.\n",
-      "Added page 8 of document 2 to index.\n",
-      "Added page 9 of document 2 to index.\n",
-      "Added page 10 of document 2 to index.\n",
-      "Added page 11 of document 2 to index.\n",
-      "Added page 12 of document 2 to index.\n",
-      "Added page 13 of document 2 to index.\n",
-      "Added page 14 of document 2 to index.\n",
-      "Added page 15 of document 2 to index.\n",
-      "Index exported to .byaldi/attention_index\n",
-      "Indexing file: docs/attention_with_a_mustache.pdf\n",
-      "Added page 1 of document 3 to index.\n",
-      "Added page 2 of document 3 to index.\n",
-      "Added page 3 of document 3 to index.\n",
-      "Added page 4 of document 3 to index.\n",
-      "Added page 5 of document 3 to index.\n",
-      "Added page 6 of document 3 to index.\n",
-      "Added page 7 of document 3 to index.\n",
-      "Added page 8 of document 3 to index.\n",
-      "Added page 9 of document 3 to index.\n",
-      "Added page 10 of document 3 to index.\n",
-      "Added page 11 of document 3 to index.\n",
-      "Added page 12 of document 3 to index.\n",
-      "Added page 13 of document 3 to index.\n",
-      "Added page 14 of document 3 to index.\n",
-      "Added page 15 of document 3 to index.\n",
-      "Index exported to .byaldi/attention_index\n",
-      "Indexing file: docs/attention_table.png\n",
-      "Added page 1 of document 4 to index.\n",
-      "Index exported to .byaldi/attention_index\n",
-      "Index exported to .byaldi/attention_index\n",
-      "Search results for 'what's the BLEU score of this new strange method?':\n",
-      "Doc ID: 2, Page: 1, Score: 14.9375\n",
-      "Doc ID: 3, Page: 1, Score: 14.9375\n",
-      "Doc ID: 3, Page: 8, Score: 14.6875\n",
-      "Doc ID: 2, Page: 8, Score: 14.6875\n",
-      "Doc ID: 4, Page: 1, Score: 14.5625\n",
-      "Test completed successfully!\n"
+      "Added page 12 of document 3 to index.\n"
      ]
     }
    ],
@@ -178,17 +101,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "62.5 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%timeit\n",
     "model.search(query, k=3)"
@@ -196,56 +111,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Verbosity is set to 1 (active). Pass verbose=0 to make quieter.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e827b1252bb843bbad57550986c88e4f",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Let's load the index now, to ensure the results are still the same.\n",
     "from byaldi import RAGMultiModalModel\n",
     "\n",
-    "model = RAGMultiModalModel.from_index(\"attention_index\")"
+    "model = RAGMultiModalModel.from_index(\"attention_index\", quantization_config=bnb_config)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Search results for 'what's the BLEU score of this new strange method?':\n",
-      "Doc ID: 2, Page: 1, Score: 14.9375\n",
-      "Doc ID: 3, Page: 1, Score: 14.9375\n",
-      "Doc ID: 3, Page: 8, Score: 14.6875\n",
-      "Doc ID: 2, Page: 8, Score: 14.6875\n",
-      "Doc ID: 4, Page: 1, Score: 14.5625\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "results = model.search(query, k=5)\n",
     "\n",
@@ -263,23 +143,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Metadata information:  {0: {'filename': 'financial_report.pdf'}, 1: {'filename': 'product_c.png'}, 2: {'filename': 'attention.pdf'}, 3: {'filename': 'attention_with_a_mustache.pdf'}, 4: {'filename': 'attention_table.png'}}\n",
-      "Search results for 'what's the BLEU score of this new strange method?':\n",
-      "Doc ID: 2, Page: 1, Score: 14.9375\n",
-      "Doc ID: 2, Page: 8, Score: 14.6875\n",
-      "Doc ID: 2, Page: 9, Score: 13.1875\n",
-      "Doc ID: 2, Page: 10, Score: 11.5625\n",
-      "Doc ID: 2, Page: 12, Score: 11.0\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "results = model.search(query, k=5,filter_metadata={\"filename\":\"attention.pdf\"})\n",
     "\n",
@@ -291,82 +157,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Verbosity is set to 1 (active). Pass verbose=0 to make quieter.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.\n",
-      "Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use\n",
-      "`config.hidden_activation` if you want to override this behaviour.\n",
-      "See https://github.com/huggingface/transformers/pull/29402 for more details.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c9595d4d1931437dbb95959e2c6ee994",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "overwrite is on. Deleting existing index attention_index_with_collection to build a new one.\n",
-      "Added page 1 of document 0 to index.\n",
-      "Added page 2 of document 0 to index.\n",
-      "Added page 3 of document 0 to index.\n",
-      "Added page 4 of document 0 to index.\n",
-      "Added page 5 of document 0 to index.\n",
-      "Added page 6 of document 0 to index.\n",
-      "Added page 7 of document 0 to index.\n",
-      "Added page 8 of document 0 to index.\n",
-      "Added page 9 of document 0 to index.\n",
-      "Added page 10 of document 0 to index.\n",
-      "Added page 11 of document 0 to index.\n",
-      "Added page 12 of document 0 to index.\n",
-      "Added page 13 of document 0 to index.\n",
-      "Added page 14 of document 0 to index.\n",
-      "Added page 15 of document 0 to index.\n",
-      "Index exported to .byaldi/attention_index_with_collection\n",
-      "Index exported to .byaldi/attention_index_with_collection\n",
-      "Search results for 'How does the positional encoding thing work?':\n",
-      "Doc ID: 0, Page: 6, Score: 19.0\n",
-      "Base64: iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAAEAAE...\n",
-      "Base64 is unique!\n",
-      "Doc ID: 0, Page: 3, Score: 18.75\n",
-      "Base64: iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAAEAAE...\n",
-      "Base64 is unique!\n",
-      "Doc ID: 0, Page: 9, Score: 17.5\n",
-      "Base64: iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAAEAAE...\n",
-      "Base64 is unique!\n",
-      "Test completed successfully!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Let's see how it looks like with the collection stored with the index, for simpler VLM integration at the cost of memory/storage.\n",
     "from pathlib import Path\n",
     "from byaldi import RAGMultiModalModel\n",
     "\n",
-    "model = RAGMultiModalModel.from_pretrained(\"vidore/colpali\")\n",
+    "model = RAGMultiModalModel.from_pretrained(\"vidore/colpali\", quantization_config=bnb_config)\n",
     "\n",
     "# Test having base64 in the collection for completely seamless RAG.\n",
     "pdf_path = Path(\"docs/attention.pdf\")\n",
@@ -398,42 +197,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Added page 1 of document 1 to index.\n",
-      "Added page 2 of document 1 to index.\n",
-      "Added page 3 of document 1 to index.\n",
-      "Added page 4 of document 1 to index.\n",
-      "Added page 5 of document 1 to index.\n",
-      "Added page 6 of document 1 to index.\n",
-      "Added page 7 of document 1 to index.\n",
-      "Added page 8 of document 1 to index.\n",
-      "Added page 9 of document 1 to index.\n",
-      "Added page 10 of document 1 to index.\n",
-      "Added page 11 of document 1 to index.\n",
-      "Added page 12 of document 1 to index.\n",
-      "Added page 13 of document 1 to index.\n",
-      "Added page 14 of document 1 to index.\n",
-      "Added page 15 of document 1 to index.\n",
-      "Index exported to .byaldi/attention_index_with_collection\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{0: 'docs/attention.pdf', 1: 'docs/attention.pdf'}"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "#  Now, let's add another document, which in this case is the same document, but we don't need to tell the model that!\n",
     "\n",
@@ -442,24 +208,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Search results for 'How does the positional encoding thing work?':\n",
-      "Doc ID: 1, Page: 6, Score: 19.0\n",
-      "Base64: iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAAEAAE...\n",
-      "Doc ID: 0, Page: 6, Score: 19.0\n",
-      "Base64: iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAAEAAE...\n",
-      "Doc ID: 0, Page: 3, Score: 18.75\n",
-      "Base64: iVBORw0KGgoAAAANSUhEUgAABqQAAAiYCAIAAAA+NVHkAAEAAE...\n",
-      "Test completed successfully!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "results = model.search(query, k=3)\n",
     "print(f\"Search results for '{query}':\")\n",
@@ -472,7 +223,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },
@@ -486,7 +237,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.12.6"
   }
  },
  "nbformat": 4,
diff --git a/pyproject.toml b/pyproject.toml
index bf1d624..f8ef27d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ maintainers = [
 ]
 
 dependencies = [
+    "bitsandbytes>=0.44.1",
     "colpali-engine>=0.3.4,<0.4.0",
     "ml-dtypes",
     "mteb==1.6.35",