diff --git a/README.md b/README.md
index 45920e7..7a8b10e 100755
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ Traditional RAG systems face a fundamental trade-off:
```bash
git clone git@github.com:yichuan520030910320/LEANN-RAG.git leann
cd leann
+git submodule update --init --recursive
uv sync
```
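
For orientation, the demo notebook diffed below drives the index build whose logs follow. Here is a minimal sketch of the cell that would produce that output, reconstructed from the log lines themselves ("LeannBuilder initialized with 'diskann' backend.", "Computing embeddings for 6 chunks using 'sentence-transformers/all-mpnet-base-v2'"); the import path and the `add_text`/`build_index` method names are assumptions inferred from those logs, not a verified API reference.

```python
# Hypothetical reconstruction of the demo cell; names inferred from the logs.
from leann import LeannBuilder  # import path is an assumption

builder = LeannBuilder(
    backend_name="diskann",  # log: "LeannBuilder initialized with 'diskann' backend."
    embedding_model="sentence-transformers/all-mpnet-base-v2",
)

# Six short text chunks, matching "Computing embeddings for 6 chunks ..."
# (placeholder sample text, not the demo's actual data).
for text in [
    "LEANN recomputes embeddings at query time.",
    "DiskANN provides the graph-based index backend.",
    "Graph pruning keeps the index small.",
    "Inner product is used as the similarity metric.",
    "PQ compresses vectors during the build phase.",
    "The index is saved under the name 'knowledge'.",
]:
    builder.add_text(text)  # method name assumed

builder.build_index("knowledge")  # yields knowledge_disk.index* and knowledge.leann.meta.json
```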
diff --git a/demo.ipynb b/demo.ipynb
index 3414aa8..5d51d2d 100644
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -2,14 +2,31 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "INFO: LeannBuilder initialized with 'diskann' backend.\n",
+ "Initializing leann-backend-diskann...\n",
+ "INFO: Registering backend 'diskann'\n",
+ "INFO: DiskANN backend loaded successfully\n",
+ "INFO: LeannBuilder initialized with 'diskann' backend.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/ubuntu/LEANN_clean/leann/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
"INFO: Computing embeddings for 6 chunks using 'sentence-transformers/all-mpnet-base-v2'...\n"
]
},
@@ -17,7 +34,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Batches: 100%|██████████| 1/1 [00:00<00:00, 89.83it/s]"
+ "Batches: 100%|██████████| 1/1 [00:00<00:00, 3.02it/s]\n"
]
},
{
@@ -27,10 +44,11 @@
"INFO: Building DiskANN index for 6 vectors with metric Metric.INNER_PRODUCT...\n",
"Using Inner Product search, so need to pre-process base data into temp file. Please ensure there is additional (n*(d+1)*4) bytes for storing pre-processed base vectors, apart from the interim indices created by DiskANN and the final index.\n",
"Pre-processing base file by adding extra coordinate\n",
+ "✅ DiskANN index built successfully at 'knowledge'\n",
"Writing bin: knowledge_disk.index_max_base_norm.bin\n",
"bin: #pts = 1, #dims = 1, size = 12B\n",
"Finished writing bin.\n",
- "Time for preprocessing data for inner product: 0.000186 seconds\n",
+ "Time for preprocessing data for inner product: 0.000182 seconds\n",
"Reading max_norm_of_base from knowledge_disk.index_max_base_norm.bin\n",
"Reading bin file knowledge_disk.index_max_base_norm.bin ...\n",
"Opening bin file knowledge_disk.index_max_base_norm.bin... \n",
@@ -40,17 +58,1049 @@
"! Using prepped_base file at knowledge_prepped_base.bin\n",
"Starting index build: R=32 L=64 Query RAM budget: 4.02653e+09 Indexing ram budget: 8 T: 8\n",
"getting bin metadata\n",
- "Time for getting bin metadata: 0.000008 seconds\n",
+ "Time for getting bin metadata: 0.000019 seconds\n",
"Compressing 769-dimensional data into 512 bytes per vector.\n",
"Opened: knowledge_prepped_base.bin, size: 18464, cache_size: 18464\n",
"Training data with 6 samples loaded.\n",
- "Reading bin file knowledge_pq_pivots.bin ...\n",
- "Opening bin file knowledge_pq_pivots.bin... \n",
- "Metadata: #pts = 256, #dims = 769...\n",
- "done.\n",
- "PQ pivot file exists. Not generating again\n",
- "Opened: knowledge_prep✅ DiskANN index built successfully at 'knowledge'\n",
- "ped_base.bin, size: 18464, cache_size: 18464\n",
+ "Processing chunk 0 with dimensions [0, 2)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 1 with dimensions [2, 4)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 2 with dimensions [4, 6)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 3 with dimensions [6, 8)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 4 with dimensions [8, 10)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 5 with dimensions [10, 12)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 6 with dimensions [12, 14)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 7 with dimensions [14, 16)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 8 with dimensions [16, 18)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 9 with dimensions [18, 20)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 10 with dimensions [20, 22)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 11 with dimensions [22, 24)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 12 with dimensions [24, 26)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 13 with dimensions [26, 28)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 14 with dimensions [28, 30)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 15 with dimensions [30, 32)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 16 with dimensions [32, 34)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 17 with dimensions [34, 36)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 18 with dimensions [36, 38)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 19 with dimensions [38, 40)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 20 with dimensions [40, 42)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 21 with dimensions [42, 44)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 22 with dimensions [44, 46)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 23 with dimensions [46, 48)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 24 with dimensions [48, 50)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 25 with dimensions [50, 52)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 26 with dimensions [52, 54)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 27 with dimensions [54, 56)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 28 with dimensions [56, 58)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 29 with dimensions [58, 60)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 30 with dimensions [60, 62)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 31 with dimensions [62, 64)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 32 with dimensions [64, 66)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 33 with dimensions [66, 68)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 34 with dimensions [68, 70)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 35 with dimensions [70, 72)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 36 with dimensions [72, 74)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 37 with dimensions [74, 76)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 38 with dimensions [76, 78)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 39 with dimensions [78, 80)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 40 with dimensions [80, 82)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 41 with dimensions [82, 84)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 42 with dimensions [84, 86)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 43 with dimensions [86, 88)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 44 with dimensions [88, 90)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 45 with dimensions [90, 92)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 46 with dimensions [92, 94)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 47 with dimensions [94, 96)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 48 with dimensions [96, 98)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 49 with dimensions [98, 100)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 50 with dimensions [100, 102)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 51 with dimensions [102, 104)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 52 with dimensions [104, 106)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 53 with dimensions [106, 108)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 54 with dimensions [108, 110)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 55 with dimensions [110, 112)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 56 with dimensions [112, 114)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 57 with dimensions [114, 116)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 58 with dimensions [116, 118)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 59 with dimensions [118, 120)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 60 with dimensions [120, 122)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 61 with dimensions [122, 124)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 62 with dimensions [124, 126)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 63 with dimensions [126, 128)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 64 with dimensions [128, 130)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 65 with dimensions [130, 132)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 66 with dimensions [132, 134)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 67 with dimensions [134, 136)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 68 with dimensions [136, 138)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 69 with dimensions [138, 140)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 70 with dimensions [140, 142)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 71 with dimensions [142, 144)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 72 with dimensions [144, 146)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 73 with dimensions [146, 148)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 74 with dimensions [148, 150)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 75 with dimensions [150, 152)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 76 with dimensions [152, 154)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 77 with dimensions [154, 156)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 78 with dimensions [156, 158)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 79 with dimensions [158, 160)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 80 with dimensions [160, 162)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 81 with dimensions [162, 164)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 82 with dimensions [164, 166)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 83 with dimensions [166, 168)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 84 with dimensions [168, 170)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 85 with dimensions [170, 172)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 86 with dimensions [172, 174)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 87 with dimensions [174, 176)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 88 with dimensions [176, 178)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 89 with dimensions [178, 180)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 90 with dimensions [180, 182)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 91 with dimensions [182, 184)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 92 with dimensions [184, 186)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 93 with dimensions [186, 188)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 94 with dimensions [188, 190)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 95 with dimensions [190, 192)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 96 with dimensions [192, 194)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 97 with dimensions [194, 196)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 98 with dimensions [196, 198)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 99 with dimensions [198, 200)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 100 with dimensions [200, 202)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 101 with dimensions [202, 204)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 102 with dimensions [204, 206)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 103 with dimensions [206, 208)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 104 with dimensions [208, 210)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 105 with dimensions [210, 212)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 106 with dimensions [212, 214)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 107 with dimensions [214, 216)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 108 with dimensions [216, 218)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 109 with dimensions [218, 220)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 110 with dimensions [220, 222)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 111 with dimensions [222, 224)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 112 with dimensions [224, 226)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 113 with dimensions [226, 228)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 114 with dimensions [228, 230)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 115 with dimensions [230, 232)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 116 with dimensions [232, 234)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 117 with dimensions [234, 236)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 118 with dimensions [236, 238)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 119 with dimensions [238, 240)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 120 with dimensions [240, 242)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 121 with dimensions [242, 244)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 122 with dimensions [244, 246)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 123 with dimensions [246, 248)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 124 with dimensions [248, 250)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 125 with dimensions [250, 252)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 126 with dimensions [252, 254)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 127 with dimensions [254, 256)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 128 with dimensions [256, 258)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 129 with dimensions [258, 260)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 130 with dimensions [260, 262)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 131 with dimensions [262, 264)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 132 with dimensions [264, 266)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 133 with dimensions [266, 268)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 134 with dimensions [268, 270)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 135 with dimensions [270, 272)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 136 with dimensions [272, 274)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 137 with dimensions [274, 276)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 138 with dimensions [276, 278)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 139 with dimensions [278, 280)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 140 with dimensions [280, 282)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 141 with dimensions [282, 284)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 142 with dimensions [284, 286)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 143 with dimensions [286, 288)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 144 with dimensions [288, 290)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 145 with dimensions [290, 292)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 146 with dimensions [292, 294)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 147 with dimensions [294, 296)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 148 with dimensions [296, 298)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 149 with dimensions [298, 300)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 150 with dimensions [300, 302)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 151 with dimensions [302, 304)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 152 with dimensions [304, 306)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 153 with dimensions [306, 308)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 154 with dimensions [308, 310)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 155 with dimensions [310, 312)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 156 with dimensions [312, 314)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 157 with dimensions [314, 316)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 158 with dimensions [316, 318)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 159 with dimensions [318, 320)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 160 with dimensions [320, 322)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 161 with dimensions [322, 324)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 162 with dimensions [324, 326)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 163 with dimensions [326, 328)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 164 with dimensions [328, 330)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 165 with dimensions [330, 332)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 166 with dimensions [332, 334)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 167 with dimensions [334, 336)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 168 with dimensions [336, 338)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 169 with dimensions [338, 340)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 170 with dimensions [340, 342)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 171 with dimensions [342, 344)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 172 with dimensions [344, 346)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 173 with dimensions [346, 348)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 174 with dimensions [348, 350)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 175 with dimensions [350, 352)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 176 with dimensions [352, 354)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 177 with dimensions [354, 356)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 178 with dimensions [356, 358)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 179 with dimensions [358, 360)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 180 with dimensions [360, 362)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 181 with dimensions [362, 364)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 182 with dimensions [364, 366)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 183 with dimensions [366, 368)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 184 with dimensions [368, 370)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 185 with dimensions [370, 372)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 186 with dimensions [372, 374)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 187 with dimensions [374, 376)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 188 with dimensions [376, 378)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 189 with dimensions [378, 380)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 190 with dimensions [380, 382)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 191 with dimensions [382, 384)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 192 with dimensions [384, 386)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early terminINFO: Leann metadata saved to knowledge.leann.meta.json\n",
+ "ation.\n",
+ "Processing chunk 193 with dimensions [386, 388)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 194 with dimensions [388, 390)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 195 with dimensions [390, 392)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 196 with dimensions [392, 394)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 197 with dimensions [394, 396)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 198 with dimensions [396, 398)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 199 with dimensions [398, 400)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 200 with dimensions [400, 402)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 201 with dimensions [402, 404)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 202 with dimensions [404, 406)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 203 with dimensions [406, 408)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 204 with dimensions [408, 410)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 205 with dimensions [410, 412)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 206 with dimensions [412, 414)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 207 with dimensions [414, 416)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 208 with dimensions [416, 418)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 209 with dimensions [418, 420)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 210 with dimensions [420, 422)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 211 with dimensions [422, 424)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 212 with dimensions [424, 426)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 213 with dimensions [426, 428)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 214 with dimensions [428, 430)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 215 with dimensions [430, 432)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 216 with dimensions [432, 434)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 217 with dimensions [434, 436)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 218 with dimensions [436, 438)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 219 with dimensions [438, 440)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 220 with dimensions [440, 442)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 221 with dimensions [442, 444)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 222 with dimensions [444, 446)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 223 with dimensions [446, 448)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 224 with dimensions [448, 450)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 225 with dimensions [450, 452)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 226 with dimensions [452, 454)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 227 with dimensions [454, 456)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 228 with dimensions [456, 458)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 229 with dimensions [458, 460)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 230 with dimensions [460, 462)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 231 with dimensions [462, 464)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 232 with dimensions [464, 466)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 233 with dimensions [466, 468)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 234 with dimensions [468, 470)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 235 with dimensions [470, 472)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 236 with dimensions [472, 474)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 237 with dimensions [474, 476)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 238 with dimensions [476, 478)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 239 with dimensions [478, 480)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 240 with dimensions [480, 482)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 241 with dimensions [482, 484)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 242 with dimensions [484, 486)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 243 with dimensions [486, 488)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 244 with dimensions [488, 490)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 245 with dimensions [490, 492)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 246 with dimensions [492, 494)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 247 with dimensions [494, 496)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 248 with dimensions [496, 498)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 249 with dimensions [498, 500)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 250 with dimensions [500, 502)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 251 with dimensions [502, 504)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 252 with dimensions [504, 506)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 253 with dimensions [506, 508)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 254 with dimensions [508, 510)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 255 with dimensions [510, 512)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 256 with dimensions [512, 514)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 257 with dimensions [514, 515)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 258 with dimensions [515, 516)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 259 with dimensions [516, 517)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 260 with dimensions [517, 518)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 261 with dimensions [518, 519)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 262 with dimensions [519, 520)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 263 with dimensions [520, 521)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 264 with dimensions [521, 522)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 265 with dimensions [522, 523)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 266 with dimensions [523, 524)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 267 with dimensions [524, 525)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 268 with dimensions [525, 526)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 269 with dimensions [526, 527)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 270 with dimensions [527, 528)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 271 with dimensions [528, 529)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 272 with dimensions [529, 530)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 273 with dimensions [530, 531)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 274 with dimensions [531, 532)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 275 with dimensions [532, 533)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 276 with dimensions [533, 534)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 277 with dimensions [534, 535)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 278 with dimensions [535, 536)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 279 with dimensions [536, 537)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 280 with dimensions [537, 538)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 281 with dimensions [538, 539)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 282 with dimensions [539, 540)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 283 with dimensions [540, 541)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 284 with dimensions [541, 542)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 285 with dimensions [542, 543)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 286 with dimensions [543, 544)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 287 with dimensions [544, 545)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 288 with dimensions [545, 546)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 289 with dimensions [546, 547)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 290 with dimensions [547, 548)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 291 with dimensions [548, 549)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 292 with dimensions [549, 550)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 293 with dimensions [550, 551)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 294 with dimensions [551, 552)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 295 with dimensions [552, 553)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 296 with dimensions [553, 554)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 297 with dimensions [554, 555)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 298 with dimensions [555, 556)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 299 with dimensions [556, 557)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 300 with dimensions [557, 558)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 301 with dimensions [558, 559)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 302 with dimensions [559, 560)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 303 with dimensions [560, 561)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 304 with dimensions [561, 562)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 305 with dimensions [562, 563)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 306 with dimensions [563, 564)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 307 with dimensions [564, 565)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 308 with dimensions [565, 566)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 309 with dimensions [566, 567)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 310 with dimensions [567, 568)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 311 with dimensions [568, 569)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 312 with dimensions [569, 570)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 313 with dimensions [570, 571)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 314 with dimensions [571, 572)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 315 with dimensions [572, 573)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 316 with dimensions [573, 574)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 317 with dimensions [574, 575)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 318 with dimensions [575, 576)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 319 with dimensions [576, 577)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 320 with dimensions [577, 578)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 321 with dimensions [578, 579)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 322 with dimensions [579, 580)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 323 with dimensions [580, 581)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 324 with dimensions [581, 582)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 325 with dimensions [582, 583)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 326 with dimensions [583, 584)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 327 with dimensions [584, 585)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 328 with dimensions [585, 586)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 329 with dimensions [586, 587)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 330 with dimensions [587, 588)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 331 with dimensions [588, 589)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 332 with dimensions [589, 590)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 333 with dimensions [590, 591)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 334 with dimensions [591, 592)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 335 with dimensions [592, 593)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 336 with dimensions [593, 594)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 337 with dimensions [594, 595)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 338 with dimensions [595, 596)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 339 with dimensions [596, 597)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 340 with dimensions [597, 598)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 341 with dimensions [598, 599)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 342 with dimensions [599, 600)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 343 with dimensions [600, 601)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 344 with dimensions [601, 602)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 345 with dimensions [602, 603)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 346 with dimensions [603, 604)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 347 with dimensions [604, 605)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 348 with dimensions [605, 606)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 349 with dimensions [606, 607)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 350 with dimensions [607, 608)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 351 with dimensions [608, 609)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 352 with dimensions [609, 610)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 353 with dimensions [610, 611)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 354 with dimensions [611, 612)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 355 with dimensions [612, 613)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 356 with dimensions [613, 614)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 357 with dimensions [614, 615)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 358 with dimensions [615, 616)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 359 with dimensions [616, 617)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 360 with dimensions [617, 618)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 361 with dimensions [618, 619)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 362 with dimensions [619, 620)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 363 with dimensions [620, 621)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 364 with dimensions [621, 622)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 365 with dimensions [622, 623)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 366 with dimensions [623, 624)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 367 with dimensions [624, 625)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 368 with dimensions [625, 626)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 369 with dimensions [626, 627)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 370 with dimensions [627, 628)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 371 with dimensions [628, 629)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 372 with dimensions [629, 630)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 373 with dimensions [630, 631)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 374 with dimensions [631, 632)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 375 with dimensions [632, 633)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 376 with dimensions [633, 634)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 377 with dimensions [634, 635)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 378 with dimensions [635, 636)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 379 with dimensions [636, 637)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 380 with dimensions [637, 638)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 381 with dimensions [638, 639)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 382 with dimensions [639, 640)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 383 with dimensions [640, 641)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 384 with dimensions [641, 642)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 385 with dimensions [642, 643)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 386 with dimensions [643, 644)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 387 with dimensions [644, 645)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 388 with dimensions [645, 646)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 389 with dimensions [646, 647)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 390 with dimensions [647, 648)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 391 with dimensions [648, 649)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 392 with dimensions [649, 650)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 393 with dimensions [650, 651)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 394 with dimensions [651, 652)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 395 with dimensions [652, 653)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 396 with dimensions [653, 654)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 397 with dimensions [654, 655)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 398 with dimensions [655, 656)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 399 with dimensions [656, 657)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 400 with dimensions [657, 658)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 401 with dimensions [658, 659)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 402 with dimensions [659, 660)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 403 with dimensions [660, 661)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 404 with dimensions [661, 662)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 405 with dimensions [662, 663)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 406 with dimensions [663, 664)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 407 with dimensions [664, 665)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 408 with dimensions [665, 666)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 409 with dimensions [666, 667)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 410 with dimensions [667, 668)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 411 with dimensions [668, 669)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 412 with dimensions [669, 670)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 413 with dimensions [670, 671)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 414 with dimensions [671, 672)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 415 with dimensions [672, 673)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 416 with dimensions [673, 674)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 417 with dimensions [674, 675)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 418 with dimensions [675, 676)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 419 with dimensions [676, 677)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 420 with dimensions [677, 678)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 421 with dimensions [678, 679)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 422 with dimensions [679, 680)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 423 with dimensions [680, 681)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 424 with dimensions [681, 682)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 425 with dimensions [682, 683)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 426 with dimensions [683, 684)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 427 with dimensions [684, 685)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 428 with dimensions [685, 686)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 429 with dimensions [686, 687)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 430 with dimensions [687, 688)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 431 with dimensions [688, 689)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 432 with dimensions [689, 690)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 433 with dimensions [690, 691)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 434 with dimensions [691, 692)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 435 with dimensions [692, 693)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 436 with dimensions [693, 694)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 437 with dimensions [694, 695)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 438 with dimensions [695, 696)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 439 with dimensions [696, 697)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 440 with dimensions [697, 698)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 1.113e-12. Early termination.\n",
+ "Processing chunk 441 with dimensions [698, 699)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 442 with dimensions [699, 700)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 443 with dimensions [700, 701)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 444 with dimensions [701, 702)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 445 with dimensions [702, 703)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 446 with dimensions [703, 704)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 447 with dimensions [704, 705)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 448 with dimensions [705, 706)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 449 with dimensions [706, 707)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 450 with dimensions [707, 708)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 451 with dimensions [708, 709)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 452 with dimensions [709, 710)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 453 with dimensions [710, 711)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 454 with dimensions [711, 712)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 455 with dimensions [712, 713)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 456 with dimensions [713, 714)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 457 with dimensions [714, 715)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 458 with dimensions [715, 716)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 459 with dimensions [716, 717)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 460 with dimensions [717, 718)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 461 with dimensions [718, 719)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 462 with dimensions [719, 720)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 463 with dimensions [720, 721)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 464 with dimensions [721, 722)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 465 with dimensions [722, 723)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 466 with dimensions [723, 724)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 467 with dimensions [724, 725)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 468 with dimensions [725, 726)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 469 with dimensions [726, 727)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 470 with dimensions [727, 728)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 471 with dimensions [728, 729)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 472 with dimensions [729, 730)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 473 with dimensions [730, 731)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 474 with dimensions [731, 732)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 475 with dimensions [732, 733)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 476 with dimensions [733, 734)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 477 with dimensions [734, 735)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 478 with dimensions [735, 736)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 479 with dimensions [736, 737)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 480 with dimensions [737, 738)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 481 with dimensions [738, 739)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 482 with dimensions [739, 740)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 483 with dimensions [740, 741)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 484 with dimensions [741, 742)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 485 with dimensions [742, 743)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 486 with dimensions [743, 744)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 487 with dimensions [744, 745)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 488 with dimensions [745, 746)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 489 with dimensions [746, 747)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 490 with dimensions [747, 748)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 491 with dimensions [748, 749)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 492 with dimensions [749, 750)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 493 with dimensions [750, 751)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 494 with dimensions [751, 752)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 495 with dimensions [752, 753)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 496 with dimensions [753, 754)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 497 with dimensions [754, 755)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 498 with dimensions [755, 756)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 499 with dimensions [756, 757)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 500 with dimensions [757, 758)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 501 with dimensions [758, 759)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 502 with dimensions [759, 760)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 503 with dimensions [760, 761)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 504 with dimensions [761, 762)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 505 with dimensions [762, 763)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 506 with dimensions [763, 764)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 507 with dimensions [764, 765)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 508 with dimensions [765, 766)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 509 with dimensions [766, 767)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 510 with dimensions [767, 768)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Processing chunk 511 with dimensions [768, 769)\n",
+ "Residuals unchanged: 3.40282e+38 becomes 0. Early termination.\n",
+ "Writing bin: knowledge_pq_pivots.bin\n",
+ "bin: #pts = 256, #dims = 769, size = 787464B\n",
+ "Finished writing bin.\n",
+ "Writing bin: knowledge_pq_pivots.bin\n",
+ "bin: #pts = 769, #dims = 1, size = 3084B\n",
+ "Finished writing bin.\n",
+ "Writing bin: knowledge_pq_pivots.bin\n",
+ "bin: #pts = 513, #dims = 1, size = 2060B\n",
+ "Finished writing bin.\n",
+ "Writing bin: knowledge_pq_pivots.bin\n",
+ "bin: #pts = 4, #dims = 1, size = 40B\n",
+ "Finished writing bin.\n",
+ "Saved pq pivot data to knowledge_pq_pivots.bin of size 796704B.\n",
+ "Opened: knowledge_prepped_base.bin, size: 18464, cache_size: 18464\n",
"Reading bin file knowledge_pq_pivots.bin ...\n",
"Opening bin file knowledge_pq_pivots.bin... \n",
"Metadata: #pts = 4, #dims = 1...\n",
@@ -69,17 +1119,17 @@
"done.\n",
"Loaded PQ pivot information\n",
"Processing points [0, 6)...done.\n",
- "Time for generating quantized data: 0.016337 seconds\n",
+ "Time for generating quantized data: 0.399541 seconds\n",
"Full index fits in RAM budget, should consume at most 2.03973e-05GiBs, so building in one shot\n",
"L2: Using AVX2 distance computation DistanceL2Float\n",
"Passed, empty search_params while creating index config\n",
"Using only first 6 from file.. \n",
"Starting index build with 6 points... \n",
- "0% of index build completed.Starting final cleanup..done. Link time: 6.2e-05s\n",
+ "0% of index build completed.Starting final cleanup..done. Link time: 0.00011s\n",
"Index built with degree: max:5 avg:5 min:5 count(deg<2):0\n",
"Not saving tags as they are not enabled.\n",
- "Time taken for save: 0.000156s.\n",
- "Time for building merged vamana index: 0.000549 seconds\n",
+ "Time taken for save: 0.000144s.\n",
+ "Time for building merged vamana index: 0.000777 seconds\n",
"Opened: knowledge_prepped_base.bin, size: 18464, cache_size: 18464\n",
"Vamana index file size=168\n",
"Opened: knowledge_disk.index, cache_size: 67108864\n",
@@ -94,33 +1144,15 @@
"Finished writing bin.\n",
"Output disk index file written to knowledge_disk.index\n",
"Finished writing 28672B\n",
- "Time for generating disk layout: 0.032297 seconds\n",
+ "Time for generating disk layout: 0.039398 seconds\n",
"Opened: knowledge_prepped_base.bin, size: 18464, cache_size: 18464\n",
"Loading base knowledge_prepped_base.bin. #points: 6. #dim: 769.\n",
- "Wrote 1 points to sample file: knowledge_sample_data.bin\n",
- "Indexing time: 0.0495994\n",
- "INFO: Leann metadata saved to knowledge.leann.meta.json\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n",
- "Opened file : knowledge_disk.index\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Since data is floating point, we assume that it has been appropriately pre-processed (normalization for cosine, and convert-to-l2 by adding extra dimension for MIPS). So we shall invoke an l2 distance function.\n",
+ "Wrote 0 points to sample file: knowledge_sample_data.bin\n",
+ "Indexing time: 0.440015\n",
+ "✅ DiskANN index loaded successfully.Since data is floating point, we assume that it has been appropriately pre-processed (normalization for cosine, and convert-to-l2 by adding extra dimension for MIPS). So we shall invoke an l2 distance function.\n",
"L2: Using AVX2 distance computation DistanceL2Float\n",
"L2: Using AVX2 distance computation DistanceL2Float\n",
"Before index load\n",
- "✅ DiskANN index loaded successfully.\n",
- "INFO: LeannSearcher initialized with 'diskann' backend using index 'knowledge.leann'.\n",
- "INFO: Terminating session server process (PID: 70114)...\n",
"Reading bin file knowledge_pq_compressed.bin ...\n",
"Opening bin file knowledge_pq_compressed.bin... \n",
"Metadata: #pts = 6, #dims = 512...\n",
@@ -145,17 +1177,19 @@
"Loaded PQ Pivots: #ctrs: 256, #dims: 769, #chunks: 512\n",
"Loaded PQ centroids and in-memory compressed vectors. #points: 6 #dim: 769 #aligned_dim: 776 #chunks: 512\n",
"Loading index metadata from knowledge_disk.index\n",
- "Disk-Index File Meta-data: # nodes per sector: 1, max node len (bytes): 3100, max node degree: 5\n",
+ "Disk-Index File Meta-data: # nodes per sector: 1, max node len (bytes): 3100, max node degre\n",
+ "INFO: LeannSearcher initialized with 'diskann' backend using index 'knowledge.leann'.\n",
+ "e: 5\n",
"Disk-Index Meta: nodes per sector: 1, max node len: 3100, max node degree: 5\n",
"Setting up thread-specific contexts for nthreads: 8\n",
- "allocating ctx: 0x73ab1e40c000 to thread-id:127182044829504\n",
- "allocating ctx: 0x73ab1baef000 to thread-id:127170028506048\n",
- "allocating ctx: 0x73ab1bade000 to thread-id:127170038991680\n",
- "allocating ctx: 0x73ab1bacd000 to thread-id:127170049477312\n",
- "allocating ctx: 0x73ab1babc000 to thread-id:127170018020416\n",
- "allocating ctx: 0x73ab1baab000 to thread-id:127169986563520\n",
- "allocating ctx: 0x73ab1ba9a000 to thread-id:127170007534784\n",
- "allocating ctx: 0x73ab1ba89000 to thread-id:127169997049152\n",
+ "allocating ctx: 0x77b24e80b000 to thread-id:131599213530048\n",
+ "allocating ctx: 0x77b24d48c000 to thread-id:131599182073280\n",
+ "allocating ctx: 0x77b24d47b000 to thread-id:131599224015680\n",
+ "allocating ctx: 0x77b24d46a000 to thread-id:131599192558912\n",
+ "allocating ctx: 0x77b24d459000 to thread-id:131599234501312\n",
+ "allocating ctx: 0x77b24d448000 to thread-id:131599203044544\n",
+ "allocating ctx: 0x77b24d437000 to thread-id:131599135935552\n",
+ "allocating ctx: 0x77b24d426000 to thread-id:131611173562176\n",
"Loading centroid data from medoids vector data of 1 medoid(s)\n",
"Reading bin file knowledge_disk.index_max_base_norm.bin ...\n",
"Opening bin file knowledge_disk.index_max_base_norm.bin... \n",
@@ -163,13 +1197,22 @@
"done.\n",
"Setting re-scaling factor of base vectors to 1\n",
"load_from_separate_paths done.\n",
- "Reading (with alignment) bin file knowledge_sample_data.bin ...Metadata: #pts = 1, #dims = 769, aligned_dim = 776... allocating aligned memory of 3104 bytes... done. Copying data to mem_aligned buffer... done.\n",
- "reserve ratio: 1\n",
- "Graph traversal completed, hops: 3\n",
+ "Reading (with alignment) bin file knowledge_sample_data.bin ...Metadata: #pts = 0, #dims = 769, aligned_dim = 776... allocating aligned memory of 0 bytes... done. Copying data to mem_aligned buffer... done.\n",
"Loading the cache list into memory....done.\n",
- "After index load\n",
- "INFO: Server process terminated.\n",
- "Clearing scratch\n",
+ "After index load\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Opened file : knowledge_disk.index\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
"INFO: Computing embeddings for 1 chunks using 'sentence-transformers/all-mpnet-base-v2'...\n"
]
},
@@ -177,7 +1220,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Batches: 100%|██████████| 1/1 [00:00<00:00, 96.44it/s]"
+ "Batches: 100%|██████████| 1/1 [00:00<00:00, 57.79it/s]"
]
},
{
@@ -186,8 +1229,8 @@
"text": [
"INFO: DiskANN ZMQ mode enabled - ensuring embedding server is running\n",
"INFO: Starting session-level embedding server as a background process...\n",
- "INFO: Running command from project root: /home/ubuntu/leann_release/Power-RAG\n",
- "INFO: Server process started with PID: 71195\n"
+ "INFO: Running command from project root: /home/ubuntu/LEANN_clean/leann\n",
+ "INFO: Server process started with PID: 197427\n"
]
},
{
@@ -202,9 +1245,8 @@
"output_type": "stream",
"text": [
"✅ Embedding server is up and ready for this session.\n",
- "reserve ratio: 1\n",
- "[EmbeddingServer LOG]: Initializing leann-backend-diskann...\n",
- "[EmbeddingServer LOG]: WARNING: Could not import DiskANN backend: cannot import name '_diskannpy' from partially initialized module 'packages.leann-backend-diskann.leann_backend_diskann' (most likely due to a circular import) (/home/ubuntu/leann_release/Power-RAG/packages/leann-backend-diskann/leann_backend_diskann/__init__.py)\n",
+ "reserve ratio: 1[EmbeddingServer LOG]: Initializing leann-backend-diskann...\n",
+ "[EmbeddingServer LOG]: WARNING: Could not import DiskANN backend: cannot import name '_diskannpy' from partially initialized module 'packages.leann-backend-diskann.leann_backend_diskann' (most likely due to a circular import) (/home/ubuntu/LEANN_clean/leann/packages/leann-backend-diskann/leann_backend_diskann/__init__.py)\n",
"[EmbeddingServer LOG]: INFO: Initializing embedding server thread on port 5555\n",
"[EmbeddingServer LOG]: INFO: Using CUDA device\n",
"[EmbeddingServer LOG]: INFO: Loading model sentence-transformers/all-mpnet-base-v2\n",
@@ -213,135 +1255,106 @@
"[EmbeddingServer LOG]: INFO: ZMQ ROUTER server listening on port 5555\n",
"[EmbeddingServer LOG]: INFO: Embedding server ready to serve requests\n",
"Score: -0.481 - C++ is a powerful programming language\n",
+ "Graph traversal completed, hops: 3\n",
+ "\n",
"Score: -1.049 - Java is a powerful programming language\n",
"[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 3 bytes\n",
"[EmbeddingServer LOG]: INFO: Request for 1 node embeddings: [0]\n",
"[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 0\n",
- "[EmbeddingServer LOG]: Time taken for text lookup: 0.000028 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for text lookup: 0.000030 seconds\n",
"[EmbeddingServer LOG]: INFO: Total batch size: 1, max_batch_size: 128\n",
"[EmbeddingServer LOG]: INFO: Processing batch of size 1\n",
- "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.019126 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.018889 seconds\n",
"[EmbeddingServer LOG]: Batch size: 1, Sequence length: 256\n",
- "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000246 seconds\n",
- "[EmbeddingServer LOG]: Time taken for embedding (batch): 3.216055 seconds\n",
- "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.044091 seconds\n",
- "[EmbeddingServer LOG]: INFO: Serialize time: 0.000299 seconds\n",
- "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 3.281315 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000184 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for embedding (batch): 3.080892 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.039653 seconds\n",
+ "[EmbeddingServer LOG]: INFO: Serialize time: 0.000192 seconds\n",
+ "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 3.140990 seconds\n",
"[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n",
"[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [1, 2, 3, 4, 5]\n",
"[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 5\n",
- "[EmbeddingServer LOG]: Time taken for text lookup: 0.000048 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for text lookup: 0.000045 seconds\n",
"[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n",
"[EmbeddingServer LOG]: INFO: Processing batch of size 5\n",
- "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001797 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001917 seconds\n",
"[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n",
- "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000095 seconds\n",
- "[EmbeddingServer LOG]: Time taken for embedding (batch): 3.392574 seconds\n",
- "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000353 seconds\n",
- "[EmbeddingServer LOG]: INFO: Serialize time: 0.000188 seconds\n",
- "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 3.395764 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000089 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for embedding (batch): 3.670297 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000375 seconds\n",
+ "[EmbeddingServer LOG]: INFO: Serialize time: 0.000203 seconds\n",
+ "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 3.673614 seconds\n",
"[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n",
"[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [3, 4, 2, 1, 0]\n",
"[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 4\n",
- "[EmbeddingServer LOG]: Time taken for text lookup: 0.000048 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for text lookup: 0.000028 seconds\n",
"[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n",
"[EmbeddingServer LOG]: INFO: Processing batch of size 5\n",
- "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001377 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001288 seconds\n",
"[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n",
- "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000085 seconds\n",
- "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.009257 seconds\n",
- "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000166 seconds\n",
- "[EmbeddingServer LOG]: INFO: Serialize time: 0.000074 seconds\n",
- "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.011520 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000078 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.009282 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000155 seconds\n",
+ "[EmbeddingServer LOG]: INFO: Serialize time: 0.000060 seconds\n",
+ "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.011379 seconds\n",
"[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n",
"[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [0, 1, 2, 4, 5]\n",
"[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 5\n",
- "[EmbeddingServer LOG]: Time taken for text lookup: 0.000019 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for text lookup: 0.000020 seconds\n",
"[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n",
"[EmbeddingServer LOG]: INFO: Processing batch of size 5\n",
- "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000792 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001060 seconds\n",
"[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n",
- "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000073 seconds\n",
- "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008864 seconds\n",
- "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000155 seconds\n",
- "[EmbeddingServer LOG]: INFO: Serialize time: 0.000052 seconds\n",
- "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010397 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000100 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008817 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000170 seconds\n",
+ "[EmbeddingServer LOG]: INFO: Serialize time: 0.000045 seconds\n",
+ "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010654 seconds\n",
"[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n",
"[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [3, 1, 0, 2, 5]\n",
"[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 5\n",
"[EmbeddingServer LOG]: Time taken for text lookup: 0.000020 seconds\n",
"[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n",
"[EmbeddingServer LOG]: INFO: Processing batch of size 5\n",
- "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000857 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000934 seconds\n",
"[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n",
"[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000073 seconds\n",
- "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008830 seconds\n",
- "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000143 seconds\n",
- "[EmbeddingServer LOG]: INFO: Serialize time: 0.000053 seconds\n",
- "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010439 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008848 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000144 seconds\n",
+ "[EmbeddingServer LOG]: INFO: Serialize time: 0.000087 seconds\n",
+ "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010560 seconds\n",
"[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n",
"[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [0, 2, 3, 4, 5]\n",
"[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 5\n",
- "[EmbeddingServer LOG]: Time taken for text lookup: 0.000020 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for text lookup: 0.000019 seconds\n",
"[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n",
"[EmbeddingServer LOG]: INFO: Processing batch of size 5\n",
- "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000805 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000957 seconds\n",
"[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n",
- "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000072 seconds\n",
- "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008835 seconds\n",
- "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000141 seconds\n",
- "[EmbeddingServer LOG]: INFO: Serialize time: 0.000049 seconds\n",
- "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010386 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000070 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008813 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000145 seconds\n",
+ "[EmbeddingServer LOG]: INFO: Serialize time: 0.000046 seconds\n",
+ "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010484 seconds\n",
"[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n",
"[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [1, 0, 3, 4, 5]\n",
"[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 5\n",
"[EmbeddingServer LOG]: Time taken for text lookup: 0.000019 seconds\n",
"[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n",
"[EmbeddingServer LOG]: INFO: Processing batch of size 5\n",
- "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000826 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000816 seconds\n",
"[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n",
- "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000092 seconds\n",
- "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008809 seconds\n",
- "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000164 seconds\n",
- "[EmbeddingServer LOG]: INFO: Serialize time: 0.000048 seconds\n",
- "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010431 seconds\n",
- "Graph traversal completed, hops: 3\n"
+ "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000072 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008810 seconds\n",
+ "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000159 seconds\n",
+ "[EmbeddingServer LOG]: INFO: Serialize time: 0.000045 seconds\n",
+ "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010357 seconds\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
- "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
"[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
"[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
"[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
@@ -349,17 +1362,6 @@
"[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n",
"[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n"
]
- },
- {
- "ename": "",
- "evalue": "",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
- "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
- "\u001b[1;31mClick here for more info. \n",
- "\u001b[1;31mView Jupyter log for further details."
- ]
}
],
"source": [
diff --git a/examples/main_cli_example.py b/examples/main_cli_example.py
index c768fe7..7600b1a 100644
--- a/examples/main_cli_example.py
+++ b/examples/main_cli_example.py
@@ -21,7 +21,7 @@ file_extractor: dict[str, BaseReader] = {
".xlsx": reader,
}
node_parser = DoclingNodeParser(
- chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=256)
+ chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=512)
)
documents = SimpleDirectoryReader(
@@ -51,7 +51,7 @@ print(f"\n[PHASE 1] Building Leann index...")
builder = LeannBuilder(
backend_name="diskann",
- embedding_model="sentence-transformers/all-mpnet-base-v2", # Using a common sentence transformer model
+ embedding_model="facebook/contriever", # Using a common sentence transformer model
graph_degree=32,
complexity=64
)
@@ -67,7 +67,7 @@ async def main():
print(f"\n[PHASE 2] Starting Leann chat session...")
chat = LeannChat(index_path=INDEX_PATH)
- query = "Based on the paper, what are the two main techniques LEANN uses to achieve low storage overhead and high retrieval accuracy?"
+ query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead?"
print(f"You: {query}")
chat_response = chat.ask(query, recompute_beighbor_embeddings=True)
print(f"Leann: {chat_response}")
diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py
index e9955e9..21c5ec8 100644
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -141,14 +141,14 @@ class LeannChat:
def ask(self, question: str, **kwargs):
# 1. Retrieve
- results = self.searcher.search(question, top_k=3, **kwargs)
+ results = self.searcher.search(question, top_k=5, **kwargs)
context = "\n\n".join([r['text'] for r in results])
# 2. Build the prompt
prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
# 3. Call the LLM
- print(f"DEBUG: Calling LLM with prompt: {prompt[:200]}...")
+ print(f"DEBUG: Calling LLM with prompt: {prompt}...")
try:
client = self._get_openai_client()
response = client.chat.completions.create(
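The `ask` change above widens retrieval from the top 3 to the top 5 chunks before prompting, so each answer is grounded in more context at the cost of a longer prompt. Stripped of LEANN's plumbing, the pattern is roughly the sketch below; `searcher.search` and the `'text'` result field come from the diff, while the OpenAI client setup and model name are illustrative stand-ins for the class's `_get_openai_client` helper.

```python
from openai import OpenAI

def ask(searcher, question: str, top_k: int = 5) -> str:
    # 1. Retrieve: fetch the top_k most relevant chunks for the question.
    results = searcher.search(question, top_k=top_k)
    context = "\n\n".join(r["text"] for r in results)
    # 2. Build the prompt: ground the answer in the retrieved context.
    prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
    # 3. Call the LLM (client construction and model choice are assumptions).
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
```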
diff --git a/test_pdf_index/pdf_documents.leann.meta.json b/test_pdf_index/pdf_documents.leann.meta.json
index f9d92a0..3247879 100644
--- a/test_pdf_index/pdf_documents.leann.meta.json
+++ b/test_pdf_index/pdf_documents.leann.meta.json
@@ -1,35 +1,23 @@
{
"version": "0.1.0",
"backend_name": "diskann",
- "embedding_model": "sentence-transformers/all-mpnet-base-v2",
- "num_chunks": 98,
+ "embedding_model": "facebook/contriever",
+ "num_chunks": 56,
"chunks": [
{
"text": "Yichuan Wang \u2020 , 1 , Shu Liu 1 , Zhifei Li 1 , Yongji Wu \u2020 , 1 , Ziming Mao 1 , Yilong Zhao 1 , Xiao Yan 2 , Zhiying Xu \u2217 , 3 , Yang Zhou 1 , 4 , Ion Stoica 1 , Sewon Min 1 , Matei Zaharia 1 , Joseph E. Gonzalez 1 1 UC Berkeley 2 CUHK 3 Amazon Web Services 4 UC Davis",
"metadata": {}
},
{
- "text": "Embedding-based search is widely used in applications such as recommendation and retrieval-augmented generation (RAG). Recently, there is a growing demand to support these capabilities over personal data stored locally on devices. However, maintaining the necessary data structure associated with the embedding-based search is often infeasible due to its high storage overhead. For example, indexing 100 GB of raw data requires 150 to 700 GB of storage, making local deployment impractical. Reducing this overhead while maintaining search quality and latency becomes a critical challenge.\nsearch viable in these settings, we seek to reduce storage overhead to under 5% of the original data size. At the same time, any such reduction must preserve high search accuracy while maintaining reasonable search latency to ensure responsive, real-time search experiences.",
+ "text": "Embedding-based search is widely used in applications such as recommendation and retrieval-augmented generation (RAG). Recently, there is a growing demand to support these capabilities over personal data stored locally on devices. However, maintaining the necessary data structure associated with the embedding-based search is often infeasible due to its high storage overhead. For example, indexing 100 GB of raw data requires 150 to 700 GB of storage, making local deployment impractical. Reducing this overhead while maintaining search quality and latency becomes a critical challenge.\nsearch viable in these settings, we seek to reduce storage overhead to under 5% of the original data size. At the same time, any such reduction must preserve high search accuracy while maintaining reasonable search latency to ensure responsive, real-time search experiences.\nIn this paper, we present LEANN, a storage-efficient approximate nearest neighbor (ANN) search index optimized for resource-constrained personal devices. LEANN combines a compact graph-based structure with an efficient on-the-fly recomputation strategy to enable fast and accurate retrieval with minimal storage overhead. Our evaluation shows that LEANN reduces index size to under 5% of the original raw data, achieving up to 50 times smaller storage than standard indexes, while maintaining 90% top-3 recall in under 2 seconds on real-world question answering benchmarks.\nExisting solutions, however, fall short of this goal. Most ANN indices store full embeddings and index metadata on disk [65], requiring terabytes of storage to index hundreds of gigabytes of documents, far exceeding the capacity of edge devices. While compression techniques such as product quantization (PQ) [29] can reduce storage, they often come at the cost of degraded search accuracy or require increased search latency to achieve comparable results.",
"metadata": {}
},
{
- "text": "In this paper, we present LEANN, a storage-efficient approximate nearest neighbor (ANN) search index optimized for resource-constrained personal devices. LEANN combines a compact graph-based structure with an efficient on-the-fly recomputation strategy to enable fast and accurate retrieval with minimal storage overhead. Our evaluation shows that LEANN reduces index size to under 5% of the original raw data, achieving up to 50 times smaller storage than standard indexes, while maintaining 90% top-3 recall in under 2 seconds on real-world question answering benchmarks.\nExisting solutions, however, fall short of this goal. Most ANN indices store full embeddings and index metadata on disk [65], requiring terabytes of storage to index hundreds of gigabytes of documents, far exceeding the capacity of edge devices. While compression techniques such as product quantization (PQ) [29] can reduce storage, they often come at the cost of degraded search accuracy or require increased search latency to achieve comparable results.",
+ "text": "With the recent advances in AI [27, 37], embedding-based search now significantly outperforms traditional keywordbased search methods [30, 71] across many domains such as question answering, recommendation, and large-scale web applications such as search engines [14, 74]. These systems rely on dense vector representations to capture semantic similarity and use approximate nearest neighbor (ANN) search to retrieve relevant results efficiently. Recently, there has been growing interest in enabling such capabilities on edge devices like laptops or phones, enabling applications like personalized search, on-device assistants, and privacypreserving retrieval over local data [24, 32, 66, 69].\nHowever, ANN data structures introduce substantial storage overheads, often 1.5 to 7 \u00d7 the size of the original raw data [57]. While such overheads are acceptable in large-scale web application deployments, they pose a significant bottleneck when deploying ANN search on personal devices or when using large datasets. For example, a 2 \u00d7 storage overhead on a personal laptop is impractical. To make ANN\n*This work does not relate to the position at Amazon. \u2020Corresponding authors. Email: , .\nThe first insight is that in graph-based indexes like HNSW, a single query typically explores only a small subset of the embedding vectors to identify its nearest neighbors. As such, instead of storing these embeddings on disk, we can recompute them on the fly at search time. However, naive recomputation can still incur a high latency overhead. To address this challenge, LEANN introduces a two-level traversal algorithm that interleaves an approximate and an exact distance queue, while prioritizing the most promising candidates in the search process, thus reducing the number of recomputations. Additionally, LEANN also incorporates a dynamic batching mechanism that aggregates embedding computations across search hops, improving GPU utilization and thus minimizing recomputation latency.",
"metadata": {}
},
{
- "text": "With the recent advances in AI [27, 37], embedding-based search now significantly outperforms traditional keywordbased search methods [30, 71] across many domains such as question answering, recommendation, and large-scale web applications such as search engines [14, 74]. These systems rely on dense vector representations to capture semantic similarity and use approximate nearest neighbor (ANN) search to retrieve relevant results efficiently. Recently, there has been growing interest in enabling such capabilities on edge devices like laptops or phones, enabling applications like personalized search, on-device assistants, and privacypreserving retrieval over local data [24, 32, 66, 69].\nHowever, ANN data structures introduce substantial storage overheads, often 1.5 to 7 \u00d7 the size of the original raw data [57]. While such overheads are acceptable in large-scale web application deployments, they pose a significant bottleneck when deploying ANN search on personal devices or when using large datasets. For example, a 2 \u00d7 storage overhead on a personal laptop is impractical. To make ANN",
- "metadata": {}
- },
- {
- "text": "*This work does not relate to the position at Amazon. \u2020Corresponding authors. Email: , .\nThe first insight is that in graph-based indexes like HNSW, a single query typically explores only a small subset of the embedding vectors to identify its nearest neighbors. As such, instead of storing these embeddings on disk, we can recompute them on the fly at search time. However, naive recomputation can still incur a high latency overhead. To address this challenge, LEANN introduces a two-level traversal algorithm that interleaves an approximate and an exact distance queue, while prioritizing the most promising candidates in the search process, thus reducing the number of recomputations. Additionally, LEANN also incorporates a dynamic batching mechanism that aggregates embedding computations across search hops, improving GPU utilization and thus minimizing recomputation latency.",
- "metadata": {}
- },
- {
- "text": "In this paper, we tackle the challenge of reducing ANN storage overhead and present LEANN, a novel graph-based vector index designed for storage-constrained environments. Built on top of Hierarchical Navigable Small World (HNSW) [38], a widely adopted, state-of-the-art graph-based ANN index, LEANN introduces system and algorithm optimizations that reduce total index storage to under 5% of the original data size, while preserving low query latency and high retrieval accuracy. At its core, LEANN is driven by two key insights.\nHowever, even without storing embeddings, the index metadata (e.g., graph structure) itself can lead to non-trivial storage overhead relative to the original data size. For example, a typical HNSW index uses a node degree of 64, meaning each node stores 64 neighbor links. With 4 bytes per link, this results in 256 bytes of metadata per node, which normally accounts for more than 25% storage overhead of a common 256-token document chunk [57].\nThe second insight is that much of the graph index metadata is redundant: not all nodes and edges contribute equally",
- "metadata": {}
- },
- {
- "text": "to search accuracy. Based on this observation, LEANN introduces a high-degree preserving graph pruning strategy that removes low-utility edges while preserving high-degree 'hub' nodes that are essential for maintaining effective search paths. By retaining only structurally important components of the graph, LEANN significantly reduces the size of the index without sacrificing the quality of the retrieval.\n- \u00b7 We conduct the first study on enabling low-latency, highaccuracy search over personal data with minimal storage overhead on edge devices.\nWe implement LEANN on top of FAISS [17] and evaluate it on four popular information retrieval (IR) benchmarks: NQ [31], HotpotQA [68], TriviaQA [28], and GPQA [48]. These benchmarks have been widely used in evaluations of information retrieval systems. Our experiments span both an NVIDIA A10 workstation [43] and an M1-based Mac [3]. The results show that LEANN reduces storage consumption by more than 50 \u00d7 compared to state-of-the-art indexes while achieving competitive latency to achieve high accuracy. In summary, we make the following contributions:",
+ "text": "In this paper, we tackle the challenge of reducing ANN storage overhead and present LEANN, a novel graph-based vector index designed for storage-constrained environments. Built on top of Hierarchical Navigable Small World (HNSW) [38], a widely adopted, state-of-the-art graph-based ANN index, LEANN introduces system and algorithm optimizations that reduce total index storage to under 5% of the original data size, while preserving low query latency and high retrieval accuracy. At its core, LEANN is driven by two key insights.\nHowever, even without storing embeddings, the index metadata (e.g., graph structure) itself can lead to non-trivial storage overhead relative to the original data size. For example, a typical HNSW index uses a node degree of 64, meaning each node stores 64 neighbor links. With 4 bytes per link, this results in 256 bytes of metadata per node, which normally accounts for more than 25% storage overhead of a common 256-token document chunk [57].\nThe second insight is that much of the graph index metadata is redundant: not all nodes and edges contribute equally\nto search accuracy. Based on this observation, LEANN introduces a high-degree preserving graph pruning strategy that removes low-utility edges while preserving high-degree 'hub' nodes that are essential for maintaining effective search paths. By retaining only structurally important components of the graph, LEANN significantly reduces the size of the index without sacrificing the quality of the retrieval.\n- \u00b7 We conduct the first study on enabling low-latency, highaccuracy search over personal data with minimal storage overhead on edge devices.\nWe implement LEANN on top of FAISS [17] and evaluate it on four popular information retrieval (IR) benchmarks: NQ [31], HotpotQA [68], TriviaQA [28], and GPQA [48]. These benchmarks have been widely used in evaluations of information retrieval systems. Our experiments span both an NVIDIA A10 workstation [43] and an M1-based Mac [3]. The results show that LEANN reduces storage consumption by more than 50 \u00d7 compared to state-of-the-art indexes while achieving competitive latency to achieve high accuracy. In summary, we make the following contributions:",
"metadata": {}
},
{
@@ -41,11 +29,7 @@
"metadata": {}
},
{
- "text": "Vector search systems rely on high-dimensional embeddings to enable semantic search across unstructured data. A core operation in such systems is the top\ud835\udc58 nearest neighbor search, where the goal is to find the \ud835\udc58 most similar vectors in a dataset to a given query vector. Formally, given a set of vectors \ud835\udc4b = { \ud835\udc65 1 , \ud835\udc65 2 , . . . , \ud835\udc65 \ud835\udc5b } \u2282 R \ud835\udc5a and a query vector \ud835\udc5e \u2208 R \ud835\udc5a , a top\ud835\udc58 nearest neighbor search aims to retrieve a set S \u2282 \ud835\udc4b of \ud835\udc58 vectors such that:\n\nwhere Dist (\u00b7 , \u00b7) denotes a distance or similarity metric (e.g., Euclidean distance or cosine similarity).\nWhile exact search guarantees retrieval of the true nearest neighbors, it becomes computationally prohibitive at scale. Approximate nearest neighbor (ANN) methods [33, 38] offer a trade-off by allowing minor inaccuracies in exchange for substantially lower query latency. The effectiveness of an ANN algorithm is typically measured by Recall@k, defined as:\n",
- "metadata": {}
- },
- {
- "text": "where S is the set of true top\ud835\udc58 neighbors returned by exact search, and S \u2032 is the set returned by the ANN method. This metric quantifies the fraction of relevant neighbors successfully retrieved. Applications such as retrieval-augmented generation (RAG) typically require high recall (e.g., \u2265 0 . 9) to preserve downstream task quality [58].\nTo accelerate ANN search, vector indexes organize embeddings using data structures that reduce the number of comparisons required. Generally, a vector index consists of two primary components: (1) the stored embedding vectors themselves, representing the data, and (2) the index structure (such as graph connections or cluster assignments) built upon these vectors to expedite the search. Both components contribute to the overall storage footprint. Two widely used classes of ANN indices are described below:\nCluster-based Index. Methods such as IVF [33] partition the dataset into clusters (or 'cells') using algorithms like K-means [9], grouping semantically similar vectors together. At query time, only the most relevant clusters are searched, reducing the overall number of comparisons.",
+ "text": "Vector search systems rely on high-dimensional embeddings to enable semantic search across unstructured data. A core operation in such systems is the top\ud835\udc58 nearest neighbor search, where the goal is to find the \ud835\udc58 most similar vectors in a dataset to a given query vector. Formally, given a set of vectors \ud835\udc4b = { \ud835\udc65 1 , \ud835\udc65 2 , . . . , \ud835\udc65 \ud835\udc5b } \u2282 R \ud835\udc5a and a query vector \ud835\udc5e \u2208 R \ud835\udc5a , a top\ud835\udc58 nearest neighbor search aims to retrieve a set S \u2282 \ud835\udc4b of \ud835\udc58 vectors such that:\n\nwhere Dist (\u00b7 , \u00b7) denotes a distance or similarity metric (e.g., Euclidean distance or cosine similarity).\nWhile exact search guarantees retrieval of the true nearest neighbors, it becomes computationally prohibitive at scale. Approximate nearest neighbor (ANN) methods [33, 38] offer a trade-off by allowing minor inaccuracies in exchange for substantially lower query latency. The effectiveness of an ANN algorithm is typically measured by Recall@k, defined as:\n\nwhere S is the set of true top\ud835\udc58 neighbors returned by exact search, and S \u2032 is the set returned by the ANN method. This metric quantifies the fraction of relevant neighbors successfully retrieved. Applications such as retrieval-augmented generation (RAG) typically require high recall (e.g., \u2265 0 . 9) to preserve downstream task quality [58].\nTo accelerate ANN search, vector indexes organize embeddings using data structures that reduce the number of comparisons required. Generally, a vector index consists of two primary components: (1) the stored embedding vectors themselves, representing the data, and (2) the index structure (such as graph connections or cluster assignments) built upon these vectors to expedite the search. Both components contribute to the overall storage footprint. Two widely used classes of ANN indices are described below:\nCluster-based Index. Methods such as IVF [33] partition the dataset into clusters (or 'cells') using algorithms like K-means [9], grouping semantically similar vectors together. At query time, only the most relevant clusters are searched, reducing the overall number of comparisons.",
"metadata": {}
},
{
@@ -57,31 +41,19 @@
"metadata": {}
},
{
- "text": "- 1: Input: Graph \ud835\udc3a with entry node \ud835\udc5d , query \ud835\udc65 \ud835\udc5e , result size \ud835\udc58 , queue size \ud835\udc52\ud835\udc53 ( \ud835\udc58 \u2264 \ud835\udc52\ud835\udc53 )\n- 2: Output: Top- \ud835\udc58 approximate neighbors \ud835\udc45\n- 3: Initialize \ud835\udc36 \u2190{ \ud835\udc5d } , \ud835\udc45 \u2190{ \ud835\udc5d } , \ud835\udc49 \u2190{ \ud835\udc5d }\n- 4: while \ud835\udc36 \u2260 \u2205 and min ( \ud835\udc36. dist ) \u2264 max ( \ud835\udc45. dist ) do\n- 5: \ud835\udc50 \u2190 node in \ud835\udc36 with smallest distance to \ud835\udc65 \ud835\udc5e\n- 6: Remove \ud835\udc50 from \ud835\udc36\n- 7: for each neighbor \ud835\udc5b of \ud835\udc50 do\n8:\nif \ud835\udc5b \u2209 \ud835\udc49 then\n9:\nExtract Embedding \ud835\udc65 \ud835\udc5b\n10:\nCompute \ud835\udc51 = \ud835\udc37\ud835\udc56\ud835\udc60\ud835\udc61 ( \ud835\udc65 \ud835\udc5e , \ud835\udc65 \ud835\udc5b )\n11:",
+ "text": "- 1: Input: Graph \ud835\udc3a with entry node \ud835\udc5d , query \ud835\udc65 \ud835\udc5e , result size \ud835\udc58 , queue size \ud835\udc52\ud835\udc53 ( \ud835\udc58 \u2264 \ud835\udc52\ud835\udc53 )\n- 2: Output: Top- \ud835\udc58 approximate neighbors \ud835\udc45\n- 3: Initialize \ud835\udc36 \u2190{ \ud835\udc5d } , \ud835\udc45 \u2190{ \ud835\udc5d } , \ud835\udc49 \u2190{ \ud835\udc5d }\n- 4: while \ud835\udc36 \u2260 \u2205 and min ( \ud835\udc36. dist ) \u2264 max ( \ud835\udc45. dist ) do\n- 5: \ud835\udc50 \u2190 node in \ud835\udc36 with smallest distance to \ud835\udc65 \ud835\udc5e\n- 6: Remove \ud835\udc50 from \ud835\udc36\n- 7: for each neighbor \ud835\udc5b of \ud835\udc50 do\n8:\nif \ud835\udc5b \u2209 \ud835\udc49 then\n9:\nExtract Embedding \ud835\udc65 \ud835\udc5b\n10:\nCompute \ud835\udc51 = \ud835\udc37\ud835\udc56\ud835\udc60\ud835\udc61 ( \ud835\udc65 \ud835\udc5e , \ud835\udc65 \ud835\udc5b )\n11:\nAdd \ud835\udc5b to \ud835\udc49 , add \ud835\udc5b to \ud835\udc36 and \ud835\udc45 with distance \ud835\udc51\n12:\nif | \ud835\udc45 | > \ud835\udc52\ud835\udc53 then\n13:\nKeep only the \ud835\udc52\ud835\udc53 closest nodes in \ud835\udc45\n- 14: return top \ud835\udc58 closest nodes in \ud835\udc45\nknob : increasing \ud835\udc52\ud835\udc53 allows the algorithm to examine more candidates, improving recall at the expense of higher latency.\nGraph-based indexes converge quickly to the nearest neighbors for two main reasons: (1) During index construction, each vector is connected to a set of approximate neighbors, typically selected to be semantically similar. If a vector is close to the query, its neighbors are also likely to be close, allowing the search to rapidly move toward high-quality results. (2) The graph implicitly yields a much finer-grained partitioning of the vector space compared to IVF, enabling the search to examine significantly fewer candidates from the entire database [22, 26, 35, 38]. An illustrative example of this traversal process is shown in Fig. 1.",
"metadata": {}
},
{
- "text": "Add \ud835\udc5b to \ud835\udc49 , add \ud835\udc5b to \ud835\udc36 and \ud835\udc45 with distance \ud835\udc51\n12:\nif | \ud835\udc45 | > \ud835\udc52\ud835\udc53 then\n13:\nKeep only the \ud835\udc52\ud835\udc53 closest nodes in \ud835\udc45\n- 14: return top \ud835\udc58 closest nodes in \ud835\udc45\nknob : increasing \ud835\udc52\ud835\udc53 allows the algorithm to examine more candidates, improving recall at the expense of higher latency.\nGraph-based indexes converge quickly to the nearest neighbors for two main reasons: (1) During index construction, each vector is connected to a set of approximate neighbors, typically selected to be semantically similar. If a vector is close to the query, its neighbors are also likely to be close, allowing the search to rapidly move toward high-quality results. (2) The graph implicitly yields a much finer-grained partitioning of the vector space compared to IVF, enabling the search to examine significantly fewer candidates from the entire database [22, 26, 35, 38]. An illustrative example of this traversal process is shown in Fig. 1.",
+ "text": "Local Vector Index System Requirement. Consumer devices, such as smart home appliances and personal workstations [32, 55, 66, 70], are heavily constrained in storage capacity [45, 62, 67]. At the same time, many downstream generative AI tasks rely heavily on similarity search over dense embeddings. However, embeddings are often significantly larger than the original raw data, typically causing up to a 7 \u00d7 storage blowup [40, 57, 77]. Unlike datacenter servers, which can dedicate hundreds of gigabytes or even terabytes to store uncompressed vector indexes [7, 16], consumer devices typically share a limited storage capacity with many colocating applications and media content [63]. This tight storage constraint makes it infeasible to store large-scale, uncompressed indexes and embeddings.\nAt the same time, these devices often support user-facing tasks such as large-scale document retrieval [32, 66] or offline semantic recall [6], where second-level latency (i.e., under 10 seconds) is generally acceptable. Usability typically degrades only when response times exceed this threshold.\nFigure 1. Best-First Search in graph-based index\nThis combination of stringent storage constraints (e.g., using less than 5% of the original data size) and moderately relaxed latency requirements opens up a distinct design space for on-device vector search: a highly storage-efficient index that exploits on-device compute resources (e.g., GPU) to achieve high recall within seconds.\nExisting System Limitations on Consumer Devices. Most vector search indexes, such as HNSW and IVF, are designed to optimize retrieval accuracy and latency under the assumption that the entire index, including full-precision embeddings, fits in DRAM. As a result, they are not suitable for DRAM-constrained environments such as consumer devices. Some recent methods [59, 64] reduce memory usage by storing compressed embeddings in DRAM for initial traversal. However, they still require accessing full-precision embeddings from disk for reranking, which incurs substantial storage overhead at query time.\nTo our knowledge, there is no prior system for vector index that has explicitly targeted consumer devices where storage footprint is a first-class objective. Our goal in this work is to design a vector search system that significantly reduces storage overhead, both for embeddings and index structures, while meeting the latency and recall requirements.",
"metadata": {}
},
{
- "text": "Local Vector Index System Requirement. Consumer devices, such as smart home appliances and personal workstations [32, 55, 66, 70], are heavily constrained in storage capacity [45, 62, 67]. At the same time, many downstream generative AI tasks rely heavily on similarity search over dense embeddings. However, embeddings are often significantly larger than the original raw data, typically causing up to a 7 \u00d7 storage blowup [40, 57, 77]. Unlike datacenter servers, which can dedicate hundreds of gigabytes or even terabytes to store uncompressed vector indexes [7, 16], consumer devices typically share a limited storage capacity with many colocating applications and media content [63]. This tight storage constraint makes it infeasible to store large-scale, uncompressed indexes and embeddings.\nAt the same time, these devices often support user-facing tasks such as large-scale document retrieval [32, 66] or offline semantic recall [6], where second-level latency (i.e., under 10 seconds) is generally acceptable. Usability typically degrades only when response times exceed this threshold.",
+ "text": "Quantization-based methods, such as PQ [29], are the main approach for reducing storage by approximating embeddings using compact codebooks. While these techniques can shrink the embedding size dramatically, the inherent information loss from this lossy compression often degrades retrieval accuracy. This degradation means that critical vector distinctions can be permanently lost during quantization, making it impossible to achieve high target recall using only the compressed data, a limitation we experimentally demonstrate in \u00a76 and which is documented in the literature [59]. As a result, they struggle to balance storage efficiency with the high accuracy needed for quality retrieval.",
"metadata": {}
},
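The storage/accuracy trade-off of quantization is easy to see with a toy product-quantization round trip in Faiss; the sizes and random training data below are made up for illustration. A 768-d float32 vector (3,072 bytes) collapses to 64 code bytes, and the irreducible reconstruction error is exactly the lossy-compression problem described above:

```python
import faiss
import numpy as np

d, nb, m_sub = 768, 10_000, 64              # 64 subquantizers x 8 bits -> 64 bytes/vector
xb = np.random.rand(nb, d).astype("float32")

pq = faiss.ProductQuantizer(d, m_sub, 8)    # 8 bits (256 centroids) per subquantizer
pq.train(xb)
codes = pq.compute_codes(xb)                # (nb, 64) uint8: 48x smaller than float32
recon = pq.decode(codes)                    # lossy reconstruction from the codebooks

err = np.linalg.norm(xb - recon, axis=1).mean()
print(f"bytes/vector: {codes.shape[1]} vs {4 * d}; mean reconstruction error: {err:.3f}")
```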
{
- "text": "Figure 1. Best-First Search in graph-based index\nThis combination of stringent storage constraints (e.g., using less than 5% of the original data size) and moderately relaxed latency requirements opens up a distinct design space for on-device vector search: a highly storage-efficient index that exploits on-device compute resources (e.g., GPU) to achieve high recall within seconds.\nExisting System Limitations on Consumer Devices. Most vector search indexes, such as HNSW and IVF, are designed to optimize retrieval accuracy and latency under the assumption that the entire index, including full-precision embeddings, fits in DRAM. As a result, they are not suitable for DRAM-constrained environments such as consumer devices. Some recent methods [59, 64] reduce memory usage by storing compressed embeddings in DRAM for initial traversal. However, they still require accessing full-precision embeddings from disk for reranking, which incurs substantial storage overhead at query time.",
- "metadata": {}
- },
- {
- "text": "To our knowledge, there is no prior system for vector index that has explicitly targeted consumer devices where storage footprint is a first-class objective. Our goal in this work is to design a vector search system that significantly reduces storage overhead, both for embeddings and index structures, while meeting the latency and recall requirements.\nQuantization-based methods, such as PQ [29], are the main approach for reducing storage by approximating embeddings using compact codebooks. While these techniques can shrink the embedding size dramatically, the inherent information loss from this lossy compression often degrades retrieval accuracy. This degradation means that critical vector distinctions can be permanently lost during quantization, making it impossible to achieve high target recall using only the compressed data, a limitation we experimentally demonstrate in \u00a76 and which is documented in the literature [59]. As a result, they struggle to balance storage efficiency with the high accuracy needed for quality retrieval.",
- "metadata": {}
- },
- {
- "text": "In this section, we provide an overview of the core techniques and show how LEANN incorporates them into its architecture.\nGraph-based Recomputation. In the HNSW structure that LEANN builds upon, each query requires embeddings for\nFigure 2. LEANN System Diagram. The system combines high-degree preserving graph pruning for minimal storage footprint with graph-based recomputation and two-level search with dynamic batching for efficient query processing (Steps 1-4).\nonly a small subset of nodes, specifically those in the candidate set \ud835\udc36 defined in Algorithm 1. This observation motivates LEANN to compute these embeddings at query time rather than storing all of them beforehand. Concretely, instead of loading precomputed embeddings as in line 9, we modify the system to recompute them during query execution without changing any algorithm.\nthe embedding server (an on-device component utilizing the original embedding model for recomputation, as illustrated in Fig. 2) to obtain their corresponding embeddings. To further improve GPU utilization and reduce latency, LEANN employs a dynamic batching strategy to schedule embedding computation tasks on the GPU (\u00a74.2).",
- "metadata": {}
- },
- {
- "text": "Main Techniques. This paradigm introduces two key challenges. First, naive on-demand recomputation of embeddings at query time can lead to high search latency. Second, although LEANN removes the need to store dense embeddings, the remaining graph metadata, particularly node connectivity information, can still account for a significant portion of total storage (for example, over 10 percent).\nLEANN offers two main techniques to address the challenges mentioned before. First, LEANN uses a two-level graph traversal algorithm and a dynamic batching mechanism to reduce recomputation latency (\u00a74). Second, LEANN deploys a high degree of preserving graph pruning technique to greatly reduce the storage needed for graph metadata (\u00a75).\nSystem Workflow. The end-to-end workflow incorporating the optimizations discussed above is shown in Fig. 2. Given a dataset of items, LEANN first computes the embeddings of all items to build a vector index for the dataset using an off-shelf graph-based index. While LEANN design is agnostic to any particular graph index, we focus on the commonly used HNSW. We discuss how LEANN can be applied to other graph indices in \u00a78.1.",
+ "text": "In this section, we provide an overview of the core techniques and show how LEANN incorporates them into its architecture.\nGraph-based Recomputation. In the HNSW structure that LEANN builds upon, each query requires embeddings for\nFigure 2. LEANN System Diagram. The system combines high-degree preserving graph pruning for minimal storage footprint with graph-based recomputation and two-level search with dynamic batching for efficient query processing (Steps 1-4).\nonly a small subset of nodes, specifically those in the candidate set \ud835\udc36 defined in Algorithm 1. This observation motivates LEANN to compute these embeddings at query time rather than storing all of them beforehand. Concretely, instead of loading precomputed embeddings as in line 9, we modify the system to recompute them during query execution without changing any algorithm.\nthe embedding server (an on-device component utilizing the original embedding model for recomputation, as illustrated in Fig. 2) to obtain their corresponding embeddings. To further improve GPU utilization and reduce latency, LEANN employs a dynamic batching strategy to schedule embedding computation tasks on the GPU (\u00a74.2).\nMain Techniques. This paradigm introduces two key challenges. First, naive on-demand recomputation of embeddings at query time can lead to high search latency. Second, although LEANN removes the need to store dense embeddings, the remaining graph metadata, particularly node connectivity information, can still account for a significant portion of total storage (for example, over 10 percent).\nLEANN offers two main techniques to address the challenges mentioned before. First, LEANN uses a two-level graph traversal algorithm and a dynamic batching mechanism to reduce recomputation latency (\u00a74). Second, LEANN deploys a high degree of preserving graph pruning technique to greatly reduce the storage needed for graph metadata (\u00a75).\nSystem Workflow. The end-to-end workflow incorporating the optimizations discussed above is shown in Fig. 2. Given a dataset of items, LEANN first computes the embeddings of all items to build a vector index for the dataset using an off-shelf graph-based index. While LEANN design is agnostic to any particular graph index, we focus on the commonly used HNSW. We discuss how LEANN can be applied to other graph indices in \u00a78.1.",
"metadata": {}
},
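Conceptually, graph-based recomputation turns line 9's embedding lookup into a forward pass. A minimal sketch, assuming a sentence-transformers model (the one from the demo above) and a hypothetical `chunks` list holding the raw text of each node:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def get_embeddings(node_ids, chunks):
    """Recompute embeddings at query time instead of reading them from disk.

    In a stored-embedding index this would be a simple array lookup; here it
    is a single batched call to the on-device embedding model, so the index
    never has to persist the dense vectors at all.
    """
    texts = [chunks[i] for i in node_ids]
    return model.encode(texts, batch_size=len(texts), convert_to_numpy=True)
```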
{
@@ -97,59 +69,31 @@
"metadata": {}
},
{
- "text": "1:, 1 = Input: query \ud835\udc5e , entry point \ud835\udc5d , re-ranking ratio \ud835\udc4e , result size \ud835\udc58 , search queue length \ud835\udc52\ud835\udc53. 2:, 1 = Output: \ud835\udc58 closest neighbors to \ud835\udc5e. 3:, 1 = \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u2190{ \ud835\udc5d } ; \ud835\udc34\ud835\udc44 \u2190\u2205 ; \ud835\udc38\ud835\udc44 \u2190{ \ud835\udc5d } ; \ud835\udc45 \u2190{ \ud835\udc5d }. 4:, 1 = while \ud835\udc38\ud835\udc44 \u2260 \u2205 do. 5:, 1 = \ud835\udc63 \u2190 extract closest element from \ud835\udc38\ud835\udc44 to \ud835\udc5e. 6:, 1 = \ud835\udc53 \u2190 get furthest element from \ud835\udc45 to \ud835\udc5e. 7:, 1 = if \ud835\udc51\ud835\udc56\ud835\udc60\ud835\udc61\ud835\udc4e\ud835\udc5b\ud835\udc50\ud835\udc52 ( \ud835\udc63,\ud835\udc5e ) > \ud835\udc51\ud835\udc56\ud835\udc60\ud835\udc61\ud835\udc4e\ud835\udc5b\ud835\udc50\ud835\udc52 ( \ud835\udc53 ,\ud835\udc5e ) then. 8:, 1 = break. 9:, 1 = for each \ud835\udc5b \u2208 neighbors( \ud835\udc63 ) do. 10:, 1 = if",
+ "text": "1:, 1 = Input: query \ud835\udc5e , entry point \ud835\udc5d , re-ranking ratio \ud835\udc4e , result size \ud835\udc58 , search queue length \ud835\udc52\ud835\udc53. 2:, 1 = Output: \ud835\udc58 closest neighbors to \ud835\udc5e. 3:, 1 = \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u2190{ \ud835\udc5d } ; \ud835\udc34\ud835\udc44 \u2190\u2205 ; \ud835\udc38\ud835\udc44 \u2190{ \ud835\udc5d } ; \ud835\udc45 \u2190{ \ud835\udc5d }. 4:, 1 = while \ud835\udc38\ud835\udc44 \u2260 \u2205 do. 5:, 1 = \ud835\udc63 \u2190 extract closest element from \ud835\udc38\ud835\udc44 to \ud835\udc5e. 6:, 1 = \ud835\udc53 \u2190 get furthest element from \ud835\udc45 to \ud835\udc5e. 7:, 1 = if \ud835\udc51\ud835\udc56\ud835\udc60\ud835\udc61\ud835\udc4e\ud835\udc5b\ud835\udc50\ud835\udc52 ( \ud835\udc63,\ud835\udc5e ) > \ud835\udc51\ud835\udc56\ud835\udc60\ud835\udc61\ud835\udc4e\ud835\udc5b\ud835\udc50\ud835\udc52 ( \ud835\udc53 ,\ud835\udc5e ) then. 8:, 1 = break. 9:, 1 = for each \ud835\udc5b \u2208 neighbors( \ud835\udc63 ) do. 10:, 1 = if \ud835\udc5b \u2209 \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 then. 11:, 1 = \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u2190 \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u222a { \ud835\udc5b }. 12:, 1 = Calculate approximate distance \ud835\udc51 \ud835\udc4e\ud835\udc5d\ud835\udc5d\ud835\udc5f\ud835\udc5c\ud835\udc65 ( \ud835\udc5b,\ud835\udc5e ). 13:, 1 = \ud835\udc34\ud835\udc44 \u2190 \ud835\udc34\ud835\udc44 \u222a { \ud835\udc5b }. 14:, 1 = \ud835\udc40 \u2190 extract top \ud835\udc4e % from \ud835\udc34\ud835\udc44 that are not in \ud835\udc38\ud835\udc44. 15:, 1 = for each \ud835\udc5a \u2208 \ud835\udc40 do. 16:, 1 = Compute exact distance \ud835\udc51 \ud835\udc52\ud835\udc65\ud835\udc4e\ud835\udc50\ud835\udc61 ( \ud835\udc5a,\ud835\udc5e ). 17:, 1 = \ud835\udc38\ud835\udc44 \u2190 \ud835\udc38\ud835\udc44 \u222a { \ud835\udc5a } ; \ud835\udc45 \u2190 \ud835\udc45 \u222a { \ud835\udc5a }. 18:, 1 = if | \ud835\udc45 | > \ud835\udc52\ud835\udc53 then. 19:, 1 = Remove furthest element from \ud835\udc45 to \ud835\udc5e.",
"metadata": {}
},
{
- "text": "\ud835\udc5b \u2209 \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 then. 11:, 1 = \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u2190 \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u222a { \ud835\udc5b }. 12:, 1 = Calculate approximate distance \ud835\udc51 \ud835\udc4e\ud835\udc5d\ud835\udc5d\ud835\udc5f\ud835\udc5c\ud835\udc65 ( \ud835\udc5b,\ud835\udc5e ). 13:, 1 = \ud835\udc34\ud835\udc44 \u2190 \ud835\udc34\ud835\udc44 \u222a { \ud835\udc5b }. 14:, 1 = \ud835\udc40 \u2190 extract top \ud835\udc4e % from \ud835\udc34\ud835\udc44 that are not in \ud835\udc38\ud835\udc44. 15:, 1 = for each \ud835\udc5a \u2208 \ud835\udc40 do. 16:, 1 = Compute exact distance \ud835\udc51 \ud835\udc52\ud835\udc65\ud835\udc4e\ud835\udc50\ud835\udc61 ( \ud835\udc5a,\ud835\udc5e ). 17:, 1 = \ud835\udc38\ud835\udc44 \u2190 \ud835\udc38\ud835\udc44 \u222a { \ud835\udc5a } ; \ud835\udc45 \u2190 \ud835\udc45 \u222a { \ud835\udc5a }. 18:, 1 = if | \ud835\udc45 | > \ud835\udc52\ud835\udc53 then. 19:, 1 = Remove furthest element from \ud835\udc45 to",
+ "text": "20:, 1 = return top \ud835\udc58 elements from \ud835\udc45\nBecause \ud835\udc34\ud835\udc44 globally tracks all previously encountered nodes, the algorithm can revisit earlier neighbors that become more promising as the search progresses. As a result, even when all immediate neighbors in the current iteration are far from the query, the algorithm can still select previously seen but unexplored nodes that are now ranked higher.\nThe core insight of this design is to combine the complementary strengths of approximate and exact distance computations. Approximate distances, though not fully accurate, are often sufficient to surface the most relevant candidates near the top, enabling early pruning of unpromising directions. We exploit this by using approximate distances to evaluate neighbors during traversal, and exact distances to re-rank only the most promising candidates. This approach achieves high recall while substantially reducing computational cost, thereby lowering overall latency.\nAt the end of each iteration, nodes in \ud835\udc40 with computed exact distances are inserted into \ud835\udc38\ud835\udc44 , which serves as the candidate pool for subsequent expansions. We repeat this process iteratively, and in each iteration, the number of nodes requiring recomputation is further reduced.\nFor efficient approximate distance calculation, we employ PQ, a widely used technique that compresses the embedding space by several orders of magnitude. In our setting, we use only 2GB of PQ-compressed embeddings to represent the original 200GB of full-precision data, resulting in minimal storage overhead. Although PQ introduces some accuracy loss, our framework compensates by applying exact computations to a small subset of high-ranking candidates, thereby preserving end-to-end search quality.\nFinally, our method is flexible and generalizable. It can incorporate alternative lightweight approximation techniques beyond quantization. For instance, distillation-based embeddings or link-and-code representations [18] can be used, provided they offer sufficient efficiency. This adaptability makes the Two-Level Search paradigm applicable across diverse computational budgets and deployment scenarios.",
"metadata": {}
},
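A condensed Python sketch of the two-level idea in Algorithm 2: PQ distances triage every neighbor cheaply, and only the best a-fraction graduate to exact (recomputed) distances. `pq_dist`, `exact_dist`, and the graph structure are placeholders, not LEANN's real interfaces:

```python
import heapq

def two_level_search(graph, pq_dist, exact_dist, query, entry, k, ef, a=0.3):
    """Algorithm 2 sketch: approximate distances guide traversal (level 1);
    exact distances are computed only for the top a-fraction of AQ (level 2)."""
    d0 = exact_dist(entry, query)
    visited = {entry}
    AQ = []                      # min-heap of (approx_dist, node): all nodes seen so far
    EQ = [(d0, entry)]           # expansion queue, ordered by exact distance
    R = [(-d0, entry)]           # result set as a max-heap (negated distances)
    while EQ:
        dv, v = heapq.heappop(EQ)
        if dv > -R[0][0]:        # closest candidate worse than furthest result: stop
            break
        for n in graph[v]:
            if n not in visited:
                visited.add(n)
                heapq.heappush(AQ, (pq_dist(n, query), n))  # cheap, approximate only
        take = min(len(AQ), max(1, int(a * len(AQ))))
        for _ in range(take):                               # re-rank the best a% exactly
            _, m = heapq.heappop(AQ)
            dm = exact_dist(m, query)                       # the only recomputation step
            heapq.heappush(EQ, (dm, m))
            heapq.heappush(R, (-dm, m))
            if len(R) > ef:
                heapq.heappop(R)
    return [n for _, n in sorted((-nd, n) for nd, n in R)[:k]]
```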
{
- "text": "\ud835\udc5e. 20:, 1 = return top \ud835\udc58 elements from \ud835\udc45\nBecause \ud835\udc34\ud835\udc44 globally tracks all previously encountered nodes, the algorithm can revisit earlier neighbors that become more promising as the search progresses. As a result, even when all immediate neighbors in the current iteration are far from the query, the algorithm can still select previously seen but unexplored nodes that are now ranked higher.\nThe core insight of this design is to combine the complementary strengths of approximate and exact distance computations. Approximate distances, though not fully accurate, are often sufficient to surface the most relevant candidates near the top, enabling early pruning of unpromising directions. We exploit this by using approximate distances to evaluate neighbors during traversal, and exact distances to re-rank only the most promising candidates. This approach achieves high recall while substantially reducing computational cost, thereby lowering overall latency.\nAt the end of each iteration, nodes in \ud835\udc40 with computed exact distances are inserted into \ud835\udc38\ud835\udc44 , which serves as the candidate pool for subsequent expansions. We repeat this process iteratively, and in each iteration, the number of nodes requiring recomputation is further reduced.",
+ "text": "During the search process, GPU resources are often underutilized because each expansion step only triggers recomputation for a small number of nodes, typically equal to the degree of the current node \ud835\udc63 . This problem is further exacerbated when using the Two Level Search algorithm (see line 16), where the candidate set is even more selective, resulting in smaller batch sizes. As a result, LEANN frequently fails to meet the minimum batch size required to saturate GPU throughput, leading to inefficient use of hardware resources at runtime.\nTo address this, LEANN introduces a dynamic batching strategy that slightly relaxes the strict data dependency in best-first search in Algorithm 1. While this introduces minor staleness in the expansion order, it significantly increases the batch size for the embedding model, thereby reducing the end-to-end latency per query.\nThis leads to a key challenge: how can we design an algorithm that fully utilizes GPU compute capacity and takes advantage of batch processing [15, 76] without sacrificing search efficiency?\nSpecifically, LEANN breaks the strict data dependency in best-first search, where the current node to be expanded depends on the immediate results of the previous expansion, by dynamically collecting a group of the closest candidates from the priority queue. The algorithm accumulates neighbors, that is, nodes requiring recomputation, until a target batch size is reached (for example, 64 for the A10 GPU), which can be efficiently determined through lightweight offline profiling. This dynamic batching mechanism integrates naturally with the Two-Level Search described in \u00a74.1. We accumulate nodes in the set \ud835\udc40 across iterations until the predefined batch size threshold is reached, at which point we perform embedding recomputation for all nodes in \ud835\udc40 .\nThis idea shares a similar insight with the beam search strategy used in DiskANN [59], where a fixed number of round-trip node accesses are batched together to amortize disk access latency. However, unlike DiskANN's fixed beam width, LEANN uses dynamic batching based on the degrees of current candidates, reducing staleness and offering greater flexibility for our setting. Furthermore, while DiskANN aims to reduce I/O latency, our dynamic batching strategy focuses on maximizing GPU utilization. As a result, LEANN adopts a",
"metadata": {}
},
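The batching change itself is mechanical: rather than recomputing each expanded node's neighbors immediately, pending nodes accumulate until a profiled threshold is reached. A sketch under assumed names (`embed_batch` stands in for the embedding-server call; 64 is the batch size the text profiles for the A10):

```python
TARGET_BATCH = 64   # determined by lightweight offline profiling per device

def batched_recompute(expansion_stream, embed_batch):
    """Relax best-first search's strict ordering: collect the neighbors of
    several closest candidates, then recompute embeddings in one GPU call."""
    pending, results = [], {}
    for neighbors in expansion_stream:        # neighbor lists from successive expansions
        pending.extend(n for n in neighbors if n not in results)
        if len(pending) >= TARGET_BATCH:      # flush: one large call, not many small ones
            for node, emb in zip(pending, embed_batch(pending)):
                results[node] = emb
            pending = []
    if pending:                               # final partial batch
        for node, emb in zip(pending, embed_batch(pending)):
            results[node] = emb
    return results
```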
{
- "text": "For efficient approximate distance calculation, we employ PQ, a widely used technique that compresses the embedding space by several orders of magnitude. In our setting, we use only 2GB of PQ-compressed embeddings to represent the original 200GB of full-precision data, resulting in minimal storage overhead. Although PQ introduces some accuracy loss, our framework compensates by applying exact computations to a small subset of high-ranking candidates, thereby preserving end-to-end search quality.\nFinally, our method is flexible and generalizable. It can incorporate alternative lightweight approximation techniques beyond quantization. For instance, distillation-based embeddings or link-and-code representations [18] can be used, provided they offer sufficient efficiency. This adaptability makes the Two-Level Search paradigm applicable across diverse computational budgets and deployment scenarios.",
+ "text": "- 1: Input: Original graph \ud835\udc3a with the set of vertices \ud835\udc49 , candidate list size \ud835\udc52\ud835\udc53 , connection number threshold \ud835\udc40 for high degree nodes and \ud835\udc5a for other nodes, where \ud835\udc5a < \ud835\udc40 , percentage of high degree nodes \ud835\udc4e\n- 3: \u2200 \ud835\udc63 \u2208 \ud835\udc49 : \ud835\udc37 [ \ud835\udc63 ] \u2190 degree of \ud835\udc63 of \ud835\udc3a , \ud835\udc3a 1 \u2190 empty graph\n- 2: Output: Pruned graph \ud835\udc3a 1\n- 4: \ud835\udc49 \u2217 \u2190 nodes with the top \ud835\udc4e % highest (out) degree in \ud835\udc37\n- 5: for \ud835\udc63 \u2208 \ud835\udc49 do\n- 6: \ud835\udc4a \u2190 search( \ud835\udc63 , \ud835\udc52\ud835\udc53 )\n- \u22b2 Refer to Algorithm 1\n- 7: if \ud835\udc63 \u2208 \ud835\udc49 \u2217 then\n- 8: \ud835\udc40 0 \u2190 \ud835\udc40\n- 9: else\n- 10: \ud835\udc40 0 \u2190 \ud835\udc5a\n- 11: Select \ud835\udc40 0 neighbors from \ud835\udc4a using original heuristic 12: Add bidirectional edges between \ud835\udc63 and neighbors to\n\ud835\udc3a\n- 13:\n1\nShrink edges if \u2203 \ud835\udc5e \u2208 neighbor and \ud835\udc37 \ud835\udc5c\ud835\udc62\ud835\udc61 ( \ud835\udc5e ) > \ud835\udc40\ndifferent optimization objective: rather than minimizing disk access, it prioritizes efficient GPU usage to reduce end-to-end latency.",
"metadata": {}
},
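Stripped of the HNSW machinery, Algorithm 3 is a degree-capping rule around the usual neighbor-selection heuristic. A simplified offline sketch (assumed inputs: `graph` as adjacency lists with neighbors sorted by distance; thresholds `M`/`m`; hub fraction `a`); the bidirectional-edge step of lines 12-13 is omitted for brevity, and the real construction interleaves this with index building rather than post-processing a finished graph:

```python
def prune_high_degree_preserving(graph, M, m, a=0.02):
    """Keep the top a-fraction of nodes by out-degree at up to M edges;
    cap everyone else at m (m < M). Hubs preserve graph navigability."""
    degrees = {v: len(nbrs) for v, nbrs in graph.items()}
    n_hubs = max(1, int(a * len(graph)))
    hubs = set(sorted(degrees, key=degrees.get, reverse=True)[:n_hubs])  # line 4
    pruned = {}
    for v, nbrs in graph.items():
        cap = M if v in hubs else m          # lines 7-10: differentiated thresholds
        # stand-in for the original selection heuristic: keep the cap closest
        pruned[v] = list(nbrs)[:cap]
    return pruned
```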
{
- "text": "During the search process, GPU resources are often underutilized because each expansion step only triggers recomputation for a small number of nodes, typically equal to the degree of the current node \ud835\udc63 . This problem is further exacerbated when using the Two Level Search algorithm (see line 16), where the candidate set is even more selective, resulting in smaller batch sizes. As a result, LEANN frequently fails to meet the minimum batch size required to saturate GPU throughput, leading to inefficient use of hardware resources at runtime.\nTo address this, LEANN introduces a dynamic batching strategy that slightly relaxes the strict data dependency in best-first search in Algorithm 1. While this introduces minor staleness in the expansion order, it significantly increases the batch size for the embedding model, thereby reducing the end-to-end latency per query.\nThis leads to a key challenge: how can we design an algorithm that fully utilizes GPU compute capacity and takes advantage of batch processing [15, 76] without sacrificing search efficiency?",
+ "text": "With the Two-Level Search and dynamic batching mechanisms in place to optimize recomputation latency, we now examine how LEANN reduces the storage costs associated with graph metadata through a high degree preserving graph pruning algorithm.\nIn datacenter environments, this overhead is typically acceptable: storage is relatively inexpensive, and the operational costs of index maintenance (e.g., updates, rebuilds, and monitoring) are manageable. In contrast, consumer devices are often storage-constrained, making even the metadata footprint of the index structure a significant concern.\nAs discussed in \u00a73, while LEANN avoids storing exact embeddings by recomputing them at query time, the graph metadata used to guide the search process can still introduce substantial overhead. For example, in the datastore described by [56], the index structure alone accounts for over 30% of the total storage footprint.\nTo address this, LEANN allows users to specify a disk usage constraint \ud835\udc36 . When the metadata size exceeds this threshold, LEANN invokes a graph pruning algorithm that reduces the number of edges while preserving high-degree nodes. This design maintains retrieval accuracy and avoids significant increases in query-time latency, while substantially reducing the metadata footprint.\nThe graph, stored in a compressed sparse row (CSR) format, for example, consumes space proportional to the total\nFigure 3. Node access probability per query\nnumber of edges, i.e., the number of nodes times their average degree 2 . Since each node corresponds to a fixed chunk of text, the number of nodes is fixed given the text. The key challenge, then, is to reduce the average node degree without significantly compromising search latency. We formalize this optimization problem as follows: given a storage budget, construct a graph that maximizes search efficiency.\n\nHere, \ud835\udc5b denotes the number of nodes in the graph, corresponding to the number of text or image chunks. \ud835\udc37 \ud835\udc56 represents the degree of node \ud835\udc56 , and ef is a parameter that controls the length of the search queue, as described in Algorithm 1. During Best-First Search, each time a node \ud835\udc56 is selected for expansion, all of its \ud835\udc37 \ud835\udc56 neighbors must be recomputed 3 .",
"metadata": {}
},
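Both sides of this trade-off are one-liners; a sketch using the section's symbols (degrees D_i, embedding-server Throughput, 4-byte int32 edges), with illustrative numbers for the 60M-chunk datastore:

```python
def search_time_estimate(path_degrees, throughput):
    """Objective: nodes recomputed per query is approximated (footnote 3)
    by the sum of degrees D_i over the ~ef expanded nodes."""
    return sum(path_degrees) / throughput          # seconds per query

def graph_storage_bytes(n, avg_out_degree, dtype_bytes=4):
    """Constraint: CSR edge lists cost n x average out-degree x int32."""
    return n * avg_out_degree * dtype_bytes

# Illustrative: 60M chunks at average degree 30 vs. a pruned 15
print(graph_storage_bytes(60_000_000, 30) / 2**30)  # ~6.7 GiB of edge metadata
print(graph_storage_bytes(60_000_000, 15) / 2**30)  # halving avg degree halves it
```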
{
- "text": "Specifically, LEANN breaks the strict data dependency in best-first search, where the current node to be expanded depends on the immediate results of the previous expansion, by dynamically collecting a group of the closest candidates from the priority queue. The algorithm accumulates neighbors, that is, nodes requiring recomputation, until a target batch size is reached (for example, 64 for the A10 GPU), which can be efficiently determined through lightweight offline profiling. This dynamic batching mechanism integrates naturally with the Two-Level Search described in \u00a74.1. We accumulate nodes in the set \ud835\udc40 across iterations until the predefined batch size threshold is reached, at which point we perform embedding recomputation for all nodes in \ud835\udc40 .",
+ "text": "While indiscriminate edge reduction in graph-based indexes often degrades search quality, as shown in \u00a76.4, our key insight is that selectively retaining hub nodes is sufficient to preserve performance. This strategy is motivated by the skewed node access pattern observed in Fig. 3, where highdegree nodes are accessed more frequently during search. Accordingly, we aim to preserve these high-degree nodes, which serve as the backbone of the graph's connectivity, even as we reduce the overall number of edges. To implement this idea, we introduce Algorithm 3. At a high level, our hubpreservation strategy incorporates two key modifications to the original graph construction process.\nThroughput denotes the number of chunks the embedding server can process per second. Since LEANN's performance bottleneck lies in recomputation as shown in Fig. 11, this formulation serves as a reasonable approximation of the search time. Finally, Dtype indicates the size of the data type used to store node connections in the graph, which is typically int32 (4 bytes).\n2 Here we refer to average out-degree.\n3 In the actual search trajectory, there may be slight differences between ef and the exact number of hops made by the query. For simplicity, we use \u02dd ef \ud835\udc56 = 1 | \ud835\udc37 \ud835\udc56 | as an approximation for the number of nodes requiring recomputation along the search path.\nOn the one hand, we apply differentiated degree thresholds to nodes based on their estimated importance. Specifically, we reduce the number of connections for most nodes to a lower threshold \ud835\udc5a (line 10), while allowing a small fraction (i.e., \ud835\udc4e %) of important nodes to retain a higher degree up to a threshold \ud835\udc40 (line 8). Given a storage budget \ud835\udc36 , LEANN automatically tunes the values of \ud835\udc5a and \ud835\udc40 through offline profiling across multiple datasets. To identify important nodes, we follow prior work [42, 51] and use node degree as a proxy for influence, selecting the top \ud835\udc4e % of nodes by degree (line 4). Empirically, we find that preserving only the top 2% of highdegree nodes significantly reduces the total number of edges while maintaining high retrieval accuracy.\nNote that this algorithm does not require knowledge about the query distribution. Hence, it can scale efficiently to large datasets, providing a simple yet effective mechanism to balance graph size and search performance.",
"metadata": {}
},
{
- "text": "This idea shares a similar insight with the beam search strategy used in DiskANN [59], where a fixed number of round-trip node accesses are batched together to amortize disk access latency. However, unlike DiskANN's fixed beam width, LEANN uses dynamic batching based on the degrees of current candidates, reducing staleness and offering greater flexibility for our setting. Furthermore, while DiskANN aims to reduce I/O latency, our dynamic batching strategy focuses on maximizing GPU utilization. As a result, LEANN adopts a",
- "metadata": {}
- },
- {
- "text": "- 1: Input: Original graph \ud835\udc3a with the set of vertices \ud835\udc49 , candidate list size \ud835\udc52\ud835\udc53 , connection number threshold \ud835\udc40 for high degree nodes and \ud835\udc5a for other nodes, where \ud835\udc5a < \ud835\udc40 , percentage of high degree nodes \ud835\udc4e\n- 3: \u2200 \ud835\udc63 \u2208 \ud835\udc49 : \ud835\udc37 [ \ud835\udc63 ] \u2190 degree of \ud835\udc63 of \ud835\udc3a , \ud835\udc3a 1 \u2190 empty graph\n- 2: Output: Pruned graph \ud835\udc3a 1\n- 4: \ud835\udc49 \u2217 \u2190 nodes with the top \ud835\udc4e % highest (out) degree in \ud835\udc37\n- 5: for \ud835\udc63 \u2208 \ud835\udc49 do\n- 6: \ud835\udc4a \u2190 search( \ud835\udc63 , \ud835\udc52\ud835\udc53 )\n- \u22b2 Refer to Algorithm 1\n- 7: if \ud835\udc63 \u2208 \ud835\udc49 \u2217 then\n- 8: \ud835\udc40 0 \u2190 \ud835\udc40\n- 9: else\n- 10: \ud835\udc40 0 \u2190 \ud835\udc5a",
- "metadata": {}
- },
- {
- "text": "- 11: Select \ud835\udc40 0 neighbors from \ud835\udc4a using original heuristic 12: Add bidirectional edges between \ud835\udc63 and neighbors to\n\ud835\udc3a\n- 13:\n1\nShrink edges if \u2203 \ud835\udc5e \u2208 neighbor and \ud835\udc37 \ud835\udc5c\ud835\udc62\ud835\udc61 ( \ud835\udc5e ) > \ud835\udc40\ndifferent optimization objective: rather than minimizing disk access, it prioritizes efficient GPU usage to reduce end-to-end latency.",
- "metadata": {}
- },
- {
- "text": "With the Two-Level Search and dynamic batching mechanisms in place to optimize recomputation latency, we now examine how LEANN reduces the storage costs associated with graph metadata through a high degree preserving graph pruning algorithm.\nIn datacenter environments, this overhead is typically acceptable: storage is relatively inexpensive, and the operational costs of index maintenance (e.g., updates, rebuilds, and monitoring) are manageable. In contrast, consumer devices are often storage-constrained, making even the metadata footprint of the index structure a significant concern.\nAs discussed in \u00a73, while LEANN avoids storing exact embeddings by recomputing them at query time, the graph metadata used to guide the search process can still introduce substantial overhead. For example, in the datastore described by [56], the index structure alone accounts for over 30% of the total storage footprint.\nTo address this, LEANN allows users to specify a disk usage constraint \ud835\udc36 . When the metadata size exceeds this threshold, LEANN invokes a graph pruning algorithm that reduces the number of edges while preserving high-degree nodes. This design maintains retrieval accuracy and avoids significant increases in query-time latency, while substantially reducing the metadata footprint.",
- "metadata": {}
- },
- {
- "text": "The graph, stored in a compressed sparse row (CSR) format, for example, consumes space proportional to the total\nFigure 3. Node access probability per query\nnumber of edges, i.e., the number of nodes times their average degree 2 . Since each node corresponds to a fixed chunk of text, the number of nodes is fixed given the text. The key challenge, then, is to reduce the average node degree without significantly compromising search latency. We formalize this optimization problem as follows: given a storage budget, construct a graph that maximizes search efficiency.\n\nHere, \ud835\udc5b denotes the number of nodes in the graph, corresponding to the number of text or image chunks. \ud835\udc37 \ud835\udc56 represents the degree of node \ud835\udc56 , and ef is a parameter that controls the length of the search queue, as described in Algorithm 1. During Best-First Search, each time a node \ud835\udc56 is selected for expansion, all of its \ud835\udc37 \ud835\udc56 neighbors must be recomputed 3 .",
- "metadata": {}
- },
- {
- "text": "While indiscriminate edge reduction in graph-based indexes often degrades search quality, as shown in \u00a76.4, our key insight is that selectively retaining hub nodes is sufficient to preserve performance. This strategy is motivated by the skewed node access pattern observed in Fig. 3, where highdegree nodes are accessed more frequently during search. Accordingly, we aim to preserve these high-degree nodes, which serve as the backbone of the graph's connectivity, even as we reduce the overall number of edges. To implement this idea, we introduce Algorithm 3. At a high level, our hubpreservation strategy incorporates two key modifications to the original graph construction process.\nThroughput denotes the number of chunks the embedding server can process per second. Since LEANN's performance bottleneck lies in recomputation as shown in Fig. 11, this formulation serves as a reasonable approximation of the search time. Finally, Dtype indicates the size of the data type used to store node connections in the graph, which is typically int32 (4 bytes).\n2 Here we refer to average out-degree.",
- "metadata": {}
- },
- {
- "text": "3 In the actual search trajectory, there may be slight differences between ef and the exact number of hops made by the query. For simplicity, we use \u02dd ef \ud835\udc56 = 1 | \ud835\udc37 \ud835\udc56 | as an approximation for the number of nodes requiring recomputation along the search path.\nOn the one hand, we apply differentiated degree thresholds to nodes based on their estimated importance. Specifically, we reduce the number of connections for most nodes to a lower threshold \ud835\udc5a (line 10), while allowing a small fraction (i.e., \ud835\udc4e %) of important nodes to retain a higher degree up to a threshold \ud835\udc40 (line 8). Given a storage budget \ud835\udc36 , LEANN automatically tunes the values of \ud835\udc5a and \ud835\udc40 through offline profiling across multiple datasets. To identify important nodes, we follow prior work [42, 51] and use node degree as a proxy for influence, selecting the top \ud835\udc4e % of nodes by degree (line 4). Empirically, we find that preserving only the top 2% of highdegree nodes significantly reduces the total number of edges while maintaining high retrieval accuracy.",
- "metadata": {}
- },
- {
- "text": "Note that this algorithm does not require knowledge about the query distribution. Hence, it can scale efficiently to large datasets, providing a simple yet effective mechanism to balance graph size and search performance.\nOn the other hand, while we restrict the number of outgoing connections during node insertion, as shown in line 10, weallow all nodes to establish bidirectional edges with newly inserted nodes, up to the maximum threshold \ud835\udc40 (as shown in line 13, not \ud835\udc5a ). This design choice ensures that each node retains the opportunity to connect with high-degree hub nodes, thereby preserving the navigability of the graph with minimal impact on search quality.",
+ "text": "On the other hand, while we restrict the number of outgoing connections during node insertion, as shown in line 10, weallow all nodes to establish bidirectional edges with newly inserted nodes, up to the maximum threshold \ud835\udc40 (as shown in line 13, not \ud835\udc5a ). This design choice ensures that each node retains the opportunity to connect with high-degree hub nodes, thereby preserving the navigability of the graph with minimal impact on search quality.",
"metadata": {}
},
{
@@ -157,51 +101,23 @@
"metadata": {}
},
{
- "text": "Table 1. Summary of our dataset and index setup.\n\nDataset, Value = rpj_wiki [10]. Raw text size, Value = 76G. Chunk size, Value = 256 token. # of chunks, Value = 60 million. Embed model, Value = Contriever [27]. Embed dimension, Value = 768. Embedding size, Value = 171G. Index type, Value = FLAT. Distance metric, Value = Inner Product\nWorkloads We construct a datastore for retrieval based on the RPJ-Wiki dataset [10], a widely used corpus containing 76 GB of raw Wikipedia text. The indexing configuration is summarized in Tab. 1. Following prior work [57], we segment the text into passages of 256 tokens and generate an embedding for each chunk using Contriever [27], an unsupervised contrastive learning based dense retriever. Each embedding has a dimensionality of 768.",
+ "text": "Table 1. Summary of our dataset and index setup.\n\nDataset, Value = rpj_wiki [10]. Raw text size, Value = 76G. Chunk size, Value = 256 token. # of chunks, Value = 60 million. Embed model, Value = Contriever [27]. Embed dimension, Value = 768. Embedding size, Value = 171G. Index type, Value = FLAT. Distance metric, Value = Inner Product\nWorkloads We construct a datastore for retrieval based on the RPJ-Wiki dataset [10], a widely used corpus containing 76 GB of raw Wikipedia text. The indexing configuration is summarized in Tab. 1. Following prior work [57], we segment the text into passages of 256 tokens and generate an embedding for each chunk using Contriever [27], an unsupervised contrastive learning based dense retriever. Each embedding has a dimensionality of 768.\nFor evaluation, we adopt four standard benchmarks widely used in RAG and open-domain retrieval: NQ [31], TriviaQA [28], GPQA [48], and HotpotQA [68].\nBesides retrieval itself, we also consider the predominant downstream task of RAG. We adopt the widely deployed LLaMA model family for generation and report downstream task accuracy with the Llama-3.2-1B-Instruct model [19].\nTestbed. We evaluate our system and baselines on two hardware platforms. The first is an NVIDIA A10 server hosted on an AWS g5.48xlarge instance [4], equipped with a 96-core CPU, 2 \u00d7 3.8TB AWS NVMe SSD, and an NVIDIA A10G GPU with 24 GB of memory. The second is a Mac environment, provided via an AWS EC2 M1 Mac instance [3], featuring an Apple M1 Ultra processor (Arm64), macOS, and utilizes a 512GB Amazon EBS volume for its main storage.\nMetrics. We compare LEANN against alternative baselines in three main dimensions: storage, latency, and accuracy. For accuracy, we evaluate both the search (retrieval) accuracy and downstream task accuracy.",
"metadata": {}
},
{
- "text": "For evaluation, we adopt four standard benchmarks widely used in RAG and open-domain retrieval: NQ [31], TriviaQA [28], GPQA [48], and HotpotQA [68].\nBesides retrieval itself, we also consider the predominant downstream task of RAG. We adopt the widely deployed LLaMA model family for generation and report downstream task accuracy with the Llama-3.2-1B-Instruct model [19].\nTestbed. We evaluate our system and baselines on two hardware platforms. The first is an NVIDIA A10 server hosted on an AWS g5.48xlarge instance [4], equipped with a 96-core CPU, 2 \u00d7 3.8TB AWS NVMe SSD, and an NVIDIA A10G GPU with 24 GB of memory. The second is a Mac environment, provided via an AWS EC2 M1 Mac instance [3], featuring an Apple M1 Ultra processor (Arm64), macOS, and utilizes a 512GB Amazon EBS volume for its main storage.",
+ "text": "To evaluate downstream task (RAG) accuracy, we use the exact match (EM) and the F1 score as metrics. EM measures the proportion of predictions that match the ground-truth answers provided by the query dataset. The F1 score captures the harmonic mean of precision and recall, typically calculated at the token level. It assigns partial credit by considering the overlap in tokens between the predicted answer and the ground-truth answer, even if they are not an exact match.\nTo evaluate retrieval accuracy, we report Recall@k as defined in \u00a72. In open-domain settings, ground-truth labels for retrieved passages are typically unavailable. Following standard practice [29, 54, 75], we use the results from exact search as a proxy for ground truth. In our experiments, we set \ud835\udc58 = 3 following prior work standard setup [1, 57], and report Recall@3. The exact search is implemented with faiss.IndexFlatIP over our datastore for each query set.\nFor the retrieval latency evaluation, we measure the time required to reach different target recall levels. Specifically, we perform a binary search to identify the minimal search queue length \ud835\udc52\ud835\udc53 (as defined in Algorithm 1) that meets the\nFigure 4. [Main Result]: Latency-storage trade-offs in RAG applications across four datasets and two hardware configurations. The y-axis shows the storage overhead, defined as the size of the ANN index relative to the raw data size (as detailed in Tab. 1). We vary the target recall to evaluate latency under different retrieval accuracy levels. Since recall is not applicable to BM25, it appears as a single data point in each figure. Additionally, we omit the PQ-compressed method, as it fails to reach the target recall threshold despite being a vector-based approach. As shown in Fig. 5, both BM25 and PQ result in poor downstream accuracy.\n- \u00b7 IVF(in-memory) : The Inverted File (IVF) index is a widely used cluster-based vector index. We adopt the faiss.IndexIVFFlat implementation. Following best practices from Faiss [52] and prior work [25], we set the number of centroids to \u221a \ud835\udc41 , where \ud835\udc41 is the size of the datastore. In our setup, we use a 60 \ud835\udc40 datastore, which corresponds to \ud835\udc5b\ud835\udc59\ud835\udc56\ud835\udc60\ud835\udc61 = 8192.",
"metadata": {}
},
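The recall protocol above is easy to reproduce: exact inner-product search supplies proxy ground truth, and Recall@3 is set overlap against it. A sketch with made-up shapes (the real datastore is the 60M-chunk index from Tab. 1):

```python
import faiss
import numpy as np

d, k = 768, 3
xb = np.random.rand(100_000, d).astype("float32")   # stand-in datastore embeddings
xq = np.random.rand(20, d).astype("float32")        # stand-in query embeddings

exact = faiss.IndexFlatIP(d)                        # exact search = proxy ground truth
exact.add(xb)
_, gt = exact.search(xq, k)

def recall_at_k(approx_ids, gt_ids):
    """Fraction of the exact top-k recovered by an approximate index."""
    hits = sum(len(set(a) & set(g)) for a, g in zip(approx_ids, gt_ids))
    return hits / gt_ids.size

# any ANN index's result ids can be scored the same way, e.g.:
print(recall_at_k(gt, gt))   # 1.0 by construction
```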
{
- "text": "Metrics. We compare LEANN against alternative baselines in three main dimensions: storage, latency, and accuracy. For accuracy, we evaluate both the search (retrieval) accuracy and downstream task accuracy.\nTo evaluate downstream task (RAG) accuracy, we use the exact match (EM) and the F1 score as metrics. EM measures the proportion of predictions that match the ground-truth answers provided by the query dataset. The F1 score captures the harmonic mean of precision and recall, typically calculated at the token level. It assigns partial credit by considering the overlap in tokens between the predicted answer and the ground-truth answer, even if they are not an exact match.\nTo evaluate retrieval accuracy, we report Recall@k as defined in \u00a72. In open-domain settings, ground-truth labels for retrieved passages are typically unavailable. Following standard practice [29, 54, 75], we use the results from exact search as a proxy for ground truth. In our experiments, we set \ud835\udc58 = 3 following prior work standard setup [1, 57], and report Recall@3. The exact search is implemented with faiss.IndexFlatIP over our datastore for each query set.",
+ "text": "Figure 5. [Main Result]: Comparison of Exact Match and F1 scores for downstream RAG tasks across three methods: keyword search (BM25), PQ-compressed vector search, and our proposed vector search system. Our method is configured to achieve a target recall of 90%, while the PQ baseline is given extended search time to reach its highest possible recall. Here we use Llama-3.2-1B as the generation model.\n- \u00b7 DiskANN [59]: DiskANN is a graph-based vector search system optimized for memory efficiency. It keeps only a PQ table in memory and loads full embeddings from disk on demand. We configure it with \ud835\udc40 = 60 and \ud835\udc52\ud835\udc53 \ud835\udc36\ud835\udc5c\ud835\udc5b\ud835\udc60\ud835\udc61\ud835\udc5f\ud835\udc62\ud835\udc50\ud835\udc61\ud835\udc56\ud835\udc5c\ud835\udc5b = 128, following recommended settings [59].\ntarget recall, and report the average latency of 20 queries using the resulting \ud835\udc52\ud835\udc53 value.\nBaselines We compare LEANN against the following baseline methods and systems:\n- \u00b7 IVF-based recomputation : We adopt the idea of IVFbased recomputation from Edge-RAG [55], where we use online recomputation to avoid storing the full set of embeddings, while using the same construction parameters as IVF (in-memory).\n- \u00b7 IVF-Disk : IVF-Disk reduces memory usage by employing memory-mapped files ( mmap ) instead of loading the entire index into memory. We implement it using Faiss's faiss.contrib.ondisk module and adopt the same configuration as in IVF (in-memory).\n- \u00b7 PQ Compression [29]: We apply PQ to compress embeddings to match our storage footprint while preserving the graph structure.\n- \u00b7 BM25 [13, 49]: A classical lexical ranking algorithm widely used in keyword-based search.\n- \u00b7 HNSW (in-memory) [38]: HNSW is a widely-used stateof-the-art vector index [2, 47]. We use the faiss.IndexHNSWFlat implementation with construction parameters recommended by Faiss: \ud835\udc40 = 30 and \ud835\udc52\ud835\udc53 \ud835\udc36\ud835\udc5c\ud835\udc5b\ud835\udc60\ud835\udc61\ud835\udc5f\ud835\udc62\ud835\udc50\ud835\udc61\ud835\udc56\ud835\udc5c\ud835\udc5b = 128, distinct from the search-time parameter \ud835\udc52\ud835\udc53 .",
"metadata": {}
},
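For reference, the Faiss baselines above are a few lines each; the parameter values are the ones quoted in the text (M=30, efConstruction=128, nlist=8192) and the dimensionality comes from Tab. 1, but the snippet is a configuration sketch, not the evaluation harness:

```python
import faiss

d = 768

# HNSW (in-memory): M = 30, efConstruction = 128, inner-product metric
hnsw = faiss.IndexHNSWFlat(d, 30, faiss.METRIC_INNER_PRODUCT)
hnsw.hnsw.efConstruction = 128

# IVF (in-memory): nlist = sqrt(N) ~= 8192 for the 60M-chunk datastore
quantizer = faiss.IndexFlatIP(d)
ivf = faiss.IndexIVFFlat(quantizer, d, 8192, faiss.METRIC_INNER_PRODUCT)
# ivf.train(xb); ivf.add(xb)  # training/adding the 60M embeddings omitted here
```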
{
- "text": "For the retrieval latency evaluation, we measure the time required to reach different target recall levels. Specifically, we perform a binary search to identify the minimal search queue length \ud835\udc52\ud835\udc53 (as defined in Algorithm 1) that meets the\nFigure 4. [Main Result]: Latency-storage trade-offs in RAG applications across four datasets and two hardware configurations. The y-axis shows the storage overhead, defined as the size of the ANN index relative to the raw data size (as detailed in Tab. 1). We vary the target recall to evaluate latency under different retrieval accuracy levels. Since recall is not applicable to BM25, it appears as a single data point in each figure. Additionally, we omit the PQ-compressed method, as it fails to reach the target recall threshold despite being a vector-based approach. As shown in Fig. 5, both BM25 and PQ result in poor downstream accuracy.",
+ "text": "Fig. 4 presents the storage consumption and end-to-end RAG query latency across all baseline systems and LEANN. The results show that LEANN is the only system that reduces storage to less than 5% of the original raw text size while maintaining reasonable latency, which we discussed in \u00a72.3, such as achieving 90% recall on GPQA in under 2 seconds.\nFor latency evaluation, we measure per-query latency under different target recall levels across all combinations of query datasets and hardware platforms. For BM25, we report a single number for its latency value using the default keyword search configuration. Unlike embedding-based search methods, BM25 is a lexical search technique and does not operate over dense embeddings. As a result, recall is not applicable for evaluating its effectiveness because it is defined based on approximate nearest neighbor retrieval. We omit results for HNSW and IVF on the Mac platform, as both methods require loading the full dense embedding matrix into memory, which leads to out-of-memory (OOM) errors. Specifically, the Mac system has 128GB of RAM, while the index size exceeds 171GB, as shown in Tab. 1. We also exclude the PQ-compressed baseline, as it fails to achieve the target recall even with an arbitrarily long search time.",
"metadata": {}
},
{
- "text": "- \u00b7 IVF(in-memory) : The Inverted File (IVF) index is a widely used cluster-based vector index. We adopt the faiss.IndexIVFFlat implementation. Following best practices from Faiss [52] and prior work [25], we set the number of centroids to \u221a \ud835\udc41 , where \ud835\udc41 is the size of the datastore. In our setup, we use a 60 \ud835\udc40 datastore, which corresponds to \ud835\udc5b\ud835\udc59\ud835\udc56\ud835\udc60\ud835\udc61 = 8192.\nFigure 5. [Main Result]: Comparison of Exact Match and F1 scores for downstream RAG tasks across three methods: keyword search (BM25), PQ-compressed vector search, and our proposed vector search system. Our method is configured to achieve a target recall of 90%, while the PQ baseline is given extended search time to reach its highest possible recall. Here we use Llama-3.2-1B as the generation model.",
- "metadata": {}
- },
- {
- "text": "- \u00b7 DiskANN [59]: DiskANN is a graph-based vector search system optimized for memory efficiency. It keeps only a PQ table in memory and loads full embeddings from disk on demand. We configure it with \ud835\udc40 = 60 and \ud835\udc52\ud835\udc53 \ud835\udc36\ud835\udc5c\ud835\udc5b\ud835\udc60\ud835\udc61\ud835\udc5f\ud835\udc62\ud835\udc50\ud835\udc61\ud835\udc56\ud835\udc5c\ud835\udc5b = 128, following recommended settings [59].\ntarget recall, and report the average latency of 20 queries using the resulting \ud835\udc52\ud835\udc53 value.\nBaselines We compare LEANN against the following baseline methods and systems:",
- "metadata": {}
- },
- {
- "text": "- \u00b7 IVF-based recomputation : We adopt the idea of IVFbased recomputation from Edge-RAG [55], where we use online recomputation to avoid storing the full set of embeddings, while using the same construction parameters as IVF (in-memory).\n- \u00b7 IVF-Disk : IVF-Disk reduces memory usage by employing memory-mapped files ( mmap ) instead of loading the entire index into memory. We implement it using Faiss's faiss.contrib.ondisk module and adopt the same configuration as in IVF (in-memory).\n- \u00b7 PQ Compression [29]: We apply PQ to compress embeddings to match our storage footprint while preserving the graph structure.\n- \u00b7 BM25 [13, 49]: A classical lexical ranking algorithm widely used in keyword-based search.",
- "metadata": {}
- },
- {
- "text": "- \u00b7 HNSW (in-memory) [38]: HNSW is a widely-used stateof-the-art vector index [2, 47]. We use the faiss.IndexHNSWFlat implementation with construction parameters recommended by Faiss: \ud835\udc40 = 30 and \ud835\udc52\ud835\udc53 \ud835\udc36\ud835\udc5c\ud835\udc5b\ud835\udc60\ud835\udc61\ud835\udc5f\ud835\udc62\ud835\udc50\ud835\udc61\ud835\udc56\ud835\udc5c\ud835\udc5b = 128, distinct from the search-time parameter \ud835\udc52\ud835\udc53 .",
- "metadata": {}
- },
- {
- "text": "Fig. 4 presents the storage consumption and end-to-end RAG query latency across all baseline systems and LEANN. The results show that LEANN is the only system that reduces storage to less than 5% of the original raw text size while maintaining reasonable latency, which we discussed in \u00a72.3, such as achieving 90% recall on GPQA in under 2 seconds.",
- "metadata": {}
- },
- {
- "text": "For latency evaluation, we measure per-query latency under different target recall levels across all combinations of query datasets and hardware platforms. For BM25, we report a single number for its latency value using the default keyword search configuration. Unlike embedding-based search methods, BM25 is a lexical search technique and does not operate over dense embeddings. As a result, recall is not applicable for evaluating its effectiveness because it is defined based on approximate nearest neighbor retrieval. We omit results for HNSW and IVF on the Mac platform, as both methods require loading the full dense embedding matrix into memory, which leads to out-of-memory (OOM) errors. Specifically, the Mac system has 128GB of RAM, while the index size exceeds 171GB, as shown in Tab. 1. We also exclude the PQ-compressed baseline, as it fails to achieve the target recall even with an arbitrarily long search time.",
- "metadata": {}
- },
- {
- "text": "We report storage consumption as a proportion of the raw text size (76 GB), referred to as proportional size in Fig. 4. Since all methods operate on the same fixed datastore based on the RPJ-Wiki dataset, their storage consumption remains constant across hardware platforms and query datasets. The figure shows that HNSW stores all dense embeddings along with the graph structure, leading to substantial storage overhead. DiskANN incurs even higher overhead due to its sectoraligned design. Each node's data, including its embedding (768 \u00d7 4 bytes) and edge list (60 neighbors, 60 \u00d7 4 bytes), is padded to a 4 KB SSD sector, resulting in the largest storage footprint among all methods. IVF and IVF-Disk exhibit similar storage overheads, both dominated by the embedding file. The additional metadata required by IVF (e.g., centroids) is relatively small, typically amounting to only about 1 / \u221a \ud835\udc41 of the total embedding size, and thus contributes little overhead. For BM25, storage is determined by the vocabulary size and the associated posting lists (i.e., the frequency of each token). In our setting, the size of",
- "metadata": {}
- },
- {
- "text": "the BM25 index is comparable to that of the original corpus. LEANN stores only a compact graph structure, resulting in less than 5% additional storage. Among the baselines, IVF-based recomputation achieves the lowest storage footprint, as it only stores the IVF centroids on disk, which adds little overhead.\nFig. 4 shows that LEANN consistently outperforms EdgeRAG, an IVF-based recomputation method, achieving significantly lower latency, ranging from 21 . 17 \u00d7 to 200 . 60 \u00d7 , across all the datasets and hardware platforms. This advantage is partly due to the asymptotic difference in recomputation complexity: the number of recomputed chunks in LEANN\n\u221a",
+ "text": "We report storage consumption as a proportion of the raw text size (76 GB), referred to as proportional size in Fig. 4. Since all methods operate on the same fixed datastore based on the RPJ-Wiki dataset, their storage consumption remains constant across hardware platforms and query datasets. The figure shows that HNSW stores all dense embeddings along with the graph structure, leading to substantial storage overhead. DiskANN incurs even higher overhead due to its sectoraligned design. Each node's data, including its embedding (768 \u00d7 4 bytes) and edge list (60 neighbors, 60 \u00d7 4 bytes), is padded to a 4 KB SSD sector, resulting in the largest storage footprint among all methods. IVF and IVF-Disk exhibit similar storage overheads, both dominated by the embedding file. The additional metadata required by IVF (e.g., centroids) is relatively small, typically amounting to only about 1 / \u221a \ud835\udc41 of the total embedding size, and thus contributes little overhead. For BM25, storage is determined by the vocabulary size and the associated posting lists (i.e., the frequency of each token). In our setting, the size of the BM25 index is comparable to that of the original corpus. LEANN stores only a compact graph structure, resulting in less than 5% additional storage. Among the baselines, IVF-based recomputation achieves the lowest storage footprint, as it only stores the IVF centroids on disk, which adds little overhead.\nFig. 4 shows that LEANN consistently outperforms EdgeRAG, an IVF-based recomputation method, achieving significantly lower latency, ranging from 21 . 17 \u00d7 to 200 . 60 \u00d7 , across all the datasets and hardware platforms. This advantage is partly due to the asymptotic difference in recomputation complexity: the number of recomputed chunks in LEANN\n\u221a",
"metadata": {}
},
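For a concrete sense of these proportions, the sketch below redoes the storage arithmetic from the paragraph above in Python. The datastore size, neighbor counts, and pruned degree are illustrative assumptions chosen to mirror the text, not measured values from the paper.

```python
# Back-of-envelope storage arithmetic for the index layouts discussed above.
# All parameters are assumptions (fp32 768-dim embeddings, 60 neighbors per
# DiskANN node, 4 KB SSD sectors, ~9 edges per node after pruning).
import math

N = 60_000_000            # assumed number of chunks in the datastore
d = 768                   # embedding dimension
emb_bytes = d * 4         # one fp32 embedding: 3,072 bytes

flat = N * emb_bytes                      # embedding file dominating HNSW/IVF
diskann = N * 4096                        # each node padded to one 4 KB sector
ivf_meta = int(math.sqrt(N)) * emb_bytes  # ~sqrt(N) centroids ~ 1/sqrt(N) of flat
leann = N * 9 * 4                         # compact pruned graph, edge lists only

for name, size in [("flat embeddings", flat), ("DiskANN, sector-aligned", diskann),
                   ("IVF extra metadata", ivf_meta), ("LEANN graph only", leann)]:
    print(f"{name:>24}: {size / 2**30:9.2f} GiB")
```

Under these assumed numbers the ranking in the text falls out directly: the sector-aligned layout is the largest, centroid metadata is negligible next to the embedding file, and a bare graph is a small fraction of the raw corpus.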
{
@@ -209,55 +125,31 @@
"metadata": {}
},
{
- "text": "We evaluate downstream task accuracy across four query datasets, as shown in Fig. 5. For all methods, we retrieve the top-3 most relevant documents. Our method is configured to achieve a target recall of 90%, while BM25 operates with its default keyword matching configuration. Although the PQ-compressed method fails to meet the target recall defined in \u00a76.2, it still achieves approximately 20% recall across all datasets. We include its downstream performance using these lower-quality retrieved results.\nFinally, we note that when a target recall level (e.g., 90%) is enforced, the downstream accuracy of our method aligns with that of other lossless ANN approaches, confirming that our system does not sacrifice accuracy for storage efficiency.",
+ "text": "We evaluate downstream task accuracy across four query datasets, as shown in Fig. 5. For all methods, we retrieve the top-3 most relevant documents. Our method is configured to achieve a target recall of 90%, while BM25 operates with its default keyword matching configuration. Although the PQ-compressed method fails to meet the target recall defined in \u00a76.2, it still achieves approximately 20% recall across all datasets. We include its downstream performance using these lower-quality retrieved results.\nFinally, we note that when a target recall level (e.g., 90%) is enforced, the downstream accuracy of our method aligns with that of other lossless ANN approaches, confirming that our system does not sacrifice accuracy for storage efficiency.\nAs illustrated in Fig. 5, our method consistently achieves higher downstream accuracy across all datasets except GPQA. Our ANN method shows limited gains on GPQA due to a distributional mismatch: the RPJ-Wiki datastore is somewhat out-of-distribution for GPQA, which consists of graduatelevel questions that are poorly supported by the retrieved content from Wikipedia. The accuracy improvement on HotpotQA is also more modest compared to the first two datasets, as HotpotQA requires multi-hop reasoning, while our current setup performs only single-hop retrieval, limiting its effectiveness for this task.",
"metadata": {}
},
{
- "text": "As illustrated in Fig. 5, our method consistently achieves higher downstream accuracy across all datasets except GPQA. Our ANN method shows limited gains on GPQA due to a distributional mismatch: the RPJ-Wiki datastore is somewhat out-of-distribution for GPQA, which consists of graduatelevel questions that are poorly supported by the retrieved content from Wikipedia. The accuracy improvement on HotpotQA is also more modest compared to the first two datasets, as HotpotQA requires multi-hop reasoning, while our current setup performs only single-hop retrieval, limiting its effectiveness for this task.",
+ "text": "We conduct comprehensive and detailed ablation studies to analyze the impacts of each methodology we use in LEANN.\nAblation study on latency optimization technique. To evaluate LEANN's latency optimization techniques, we incrementally enable the components introduced in \u00a74, using a fixed target recall across multiple datasets. We begin with a naive graph-based recomputation baseline. Incorporating\nFigure 6. [Ablation Study]: Speedup achieved by different optimization techniques described in \u00a74 when evaluated on four datasets to reach the same recall level on the A10 GPU. Two-level refers to the optimization in \u00a74.1, while Batch corresponds to \u00a74.2.\nFigure 7. [Ablation Study]: Comparison of pruned graph quality against two heuristic methods and the upper bound using the datastore in Tab. 1. We vary the target recall and measure the number of nodes each method needs to recompute. The dashed gray line represents the original HNSW graph, which serves as the upper bound, with twice the storage (i.e., average degree) of the others.\nthe two-level hybrid distance computation strategy from \u00a74.1 yields an average speedup of 1 . 40 \u00d7 , reaching up to 1 . 64 \u00d7 , by reducing the number of nodes requiring recomputation and enabling lightweight distance estimation without querying the embedding server. Adding the dynamic batching technique further improves GPU utilization during recomputation, increasing the overall speedup to 1 . 76 \u00d7 , with a maximum of 2 . 02 \u00d7 . Among all datasets, HotpotQA benefits the most from batching, as its longer search queue required to achieve the target recall allows more effective grouping of multi hop requests.",
"metadata": {}
},
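A minimal sketch of how the two optimizations compose, assuming a hypothetical `pq_distance` lookup and a batched `embed` call (neither is LEANN's actual API): cheap approximate distances rank a node's neighbors, and only the most promising fraction is sent to the embedding server in a single batch.

```python
# Sketch of two-level hybrid distance computation plus batched recomputation.
# `pq_distance`, `embed`, and `exact_ratio` are illustrative stand-ins for a
# PQ lookup table, a GPU embedding server, and a tuning knob.
import numpy as np

def expand_node(query_vec, neighbor_ids, pq_distance, embed, exact_ratio=0.3):
    # Level 1: cheap PQ-approximate distances for every neighbor,
    # computed without touching the embedding server.
    approx = np.array([pq_distance(query_vec, nid) for nid in neighbor_ids])
    k = max(1, int(len(neighbor_ids) * exact_ratio))
    top = np.argsort(approx)[:k]          # most promising neighbors

    # Level 2: recompute exact embeddings only for that subset, as one
    # batch so the embedding server's GPU stays well utilized.
    batch_ids = [neighbor_ids[i] for i in top]
    exact_sims = embed(batch_ids) @ query_vec      # (k,) inner products

    # Smaller score = closer; negate similarities to match the distance
    # convention of the approximate scores.
    scores = dict(zip(neighbor_ids, approx.tolist()))
    scores.update({nid: -float(s) for nid, s in zip(batch_ids, exact_sims)})
    return scores
```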
{
- "text": "We conduct comprehensive and detailed ablation studies to analyze the impacts of each methodology we use in LEANN.\nAblation study on latency optimization technique. To evaluate LEANN's latency optimization techniques, we incrementally enable the components introduced in \u00a74, using a fixed target recall across multiple datasets. We begin with a naive graph-based recomputation baseline. Incorporating\nFigure 6. [Ablation Study]: Speedup achieved by different optimization techniques described in \u00a74 when evaluated on four datasets to reach the same recall level on the A10 GPU. Two-level refers to the optimization in \u00a74.1, while Batch corresponds to \u00a74.2.\nFigure 7. [Ablation Study]: Comparison of pruned graph quality against two heuristic methods and the upper bound using the datastore in Tab. 1. We vary the target recall and measure the number of nodes each method needs to recompute. The dashed gray line represents the original HNSW graph, which serves as the upper bound, with twice the storage (i.e., average degree) of the others.",
+ "text": "Wecompare our graph pruning algorithm with two heuristic baselines and evaluate graph quality by measuring the number of embeddings that must be fetched to achieve a given recall target, as shown in Fig. 7. In LEANN, the end-to-end latency scales linearly with the number of embeddings that\nFigure 8. [Ablation Study]: Comparison of (out-)degree distributions between the original graph, our pruning method, and two heuristic baselines. Similar to Fig. 7, the gray curve represents the original HNSW graph, which has twice the size of the others. Only our pruning method successfully preserves the high degree nodes.\nrequire recomputation, making this metric a strong proxy for retrieval latency.\nThe original graph, constructed on the datastore described in Tab. 1, has an average degree of 18. All three pruning methods, ours and the two baselines, are applied to reduce the total number of edges by half, thereby halving the graph's storage overhead.\nThe two heuristic baselines are as follows: (1) Random Prune , which randomly removes 50% of the existing edges from the original graph; and (2) Small M , which directly constrains the maximum out-degree during graph construction, resulting in an average degree that is half that of the original graph.\nWe evaluate the performance of different graph structures on the NQ dataset by varying the search queue length \ud835\udc52\ud835\udc53 , aiming to determine the minimum number of embeddings that must be fetched to achieve various recall targets. As shown in Fig. 7, our pruning method introduced in \u00a75 achieves performance comparable to the original unpruned graph, despite using only half the edges. It outperforms the Random Prune baseline by up to 1 . 18 \u00d7 and the Small M baseline by up to 5 . 76 \u00d7 . We omit the Small M data points at 94% and 96% recall targets due to their poor performance.",
"metadata": {}
},
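To make the simpler baseline concrete, here is a sketch of Random Prune on an adjacency-list graph; uniformly dropping half of the directed out-edges is all it does (Small M instead halves the degree cap at construction time). The graph and helper names are illustrative, not the paper's implementation.

```python
# Random Prune baseline: keep a uniform 50% sample of each node's out-edges.
import random

def random_prune(adj, keep_ratio=0.5, seed=0):
    rng = random.Random(seed)
    return {
        # Keep at least one edge so every node retains an out-neighbor.
        node: rng.sample(nbrs, max(1, int(len(nbrs) * keep_ratio)))
        for node, nbrs in adj.items()
    }

g = {0: [1, 2, 3, 4], 1: [0, 2], 2: [0, 1, 3], 3: [0, 2], 4: [0]}
print(random_prune(g))   # each node keeps roughly half of its out-edges
```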
{
- "text": "the two-level hybrid distance computation strategy from \u00a74.1 yields an average speedup of 1 . 40 \u00d7 , reaching up to 1 . 64 \u00d7 , by reducing the number of nodes requiring recomputation and enabling lightweight distance estimation without querying the embedding server. Adding the dynamic batching technique further improves GPU utilization during recomputation, increasing the overall speedup to 1 . 76 \u00d7 , with a maximum of 2 . 02 \u00d7 . Among all datasets, HotpotQA benefits the most from batching, as its longer search queue required to achieve the target recall allows more effective grouping of multi hop requests.",
+ "text": "Degree Distribution in Pruned Graphs. To better understand the effectiveness of our pruning strategy, we analyze the out-degree distributions of the original graph, our approach, Random Prune, and Small M. As discussed in \u00a75, our design explicitly aims to preserve high-degree 'hub' nodes. As shown in Fig. 8, it successfully retains a substantial number of such nodes, whereas the other two baselines fail to do so. This underscores the critical role of hub nodes in supporting efficient graph-based vector search, a finding that aligns with insights from prior work [39, 42, 51].\nFigure 9. [Ablation Study]: Latency on the A10 GPU and accuracy of a smaller embedding model evaluated on a 2Mchunk datastore, using a fixed search queue length of ef=50 . The smaller embedding model significantly reduces latency without causing a substantial drop in downstream accuracy.\nUsing different embedding model sizes. Since the primary bottleneck of our system lies in the recomputation process, as shown in Fig. 11 later, we further explore the potential for latency reduction by adopting a smaller embedding model. Specifically, we replace the original contriever model (110M parameters) used in \u00a76.2 with the lightweight GTE-small model [36], which has only 34M parameters. We evaluate performance on a smaller 2M document datastore using a fixed search queue length of ef=50 , as shown in Fig. 9. The results show that GTE-small achieves a 2 . 3 \u00d7 speedup while maintaining downstream task accuracy within 2% of the Contriever baseline. This demonstrates the potential of LEANN to further reduce search latency by leveraging a lightweight embedding model.",
"metadata": {}
},
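The degree-distribution check itself is simple. A sketch, assuming adjacency lists as in the pruning example above, that summarizes the tail of the distribution before and after pruning:

```python
# Compare out-degree distributions to see whether hub nodes survive pruning.
from collections import Counter

def degree_histogram(adj):
    # Maps out-degree -> number of nodes with that out-degree.
    return Counter(len(nbrs) for nbrs in adj.values())

def hub_count(adj, threshold=32):
    # Number of nodes whose out-degree is at least `threshold` (the "hubs");
    # the threshold is an arbitrary illustrative cutoff.
    return sum(1 for nbrs in adj.values() if len(nbrs) >= threshold)

# hub_count(original) vs hub_count(pruned): a hub-preserving pruner keeps
# this number high, while Random Prune and Small M flatten the tail.
```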
{
- "text": "Wecompare our graph pruning algorithm with two heuristic baselines and evaluate graph quality by measuring the number of embeddings that must be fetched to achieve a given recall target, as shown in Fig. 7. In LEANN, the end-to-end latency scales linearly with the number of embeddings that\nFigure 8. [Ablation Study]: Comparison of (out-)degree distributions between the original graph, our pruning method, and two heuristic baselines. Similar to Fig. 7, the gray curve represents the original HNSW graph, which has twice the size of the others. Only our pruning method successfully preserves the high degree nodes.\nrequire recomputation, making this metric a strong proxy for retrieval latency.\nThe original graph, constructed on the datastore described in Tab. 1, has an average degree of 18. All three pruning methods, ours and the two baselines, are applied to reduce the total number of edges by half, thereby halving the graph's storage overhead.",
+ "text": "Relaxing disk constraint. As discussed in \u00a73, when disk storage constraints are relaxed, LEANN can materialize the embeddings of high-degree nodes to reduce recomputation overhead. This effectively builds an on-disk embedding cache, reducing the number of nodes that need to be recomputed at query time. For instance, storing just 10% of the original embeddings yields a 1 . 47 \u00d7 speedup, with a cache hit rate of up to 41.9%. This high cache hit rate arises from the skewed access pattern characteristic of graph-based traversal. However, the observed speedup does not fully align with the hit rate due to the non-negligible loading overhead introduced by SSDs with limited bandwidth.\nGraph-based recomputation breakdown. Fig. 11 breaks down the time cost of a single batch in graph-based recomputation into three stages, categorized by the primary system resource used. Each batch aggregates multiple hops of recomputation, as described in \u00a74.2. First, LEANN performs PQ lookups to select promising nodes, then retrieves and tokenizes the corresponding raw text. The tokenized inputs are sent to the embedding server. Finally, LEANN performs embedding recomputation and distance calculation.\nFigure 10. [Ablation Study]: Latency and cache hit rate comparison under varying storage constraints across four datasets. The x-axis indicates total storage size (graph size + cached embeddings on disk) and the corresponding percentage of cached embeddings.\nFigure 11. [Ablation Study]: Latency breakdown of a batch of requests in graph-based recomputation.\nAlthough embedding recomputation is the primary bottleneck in LEANN, accounting for 76% of total latency, the three stages-spanning I/O, CPU, and GPU resources-can potentially be overlapped to improve overall efficiency. We leave this optimization for future work.",
"metadata": {}
},
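A sketch of that caching policy under stated assumptions: embeddings for the top 10% of nodes by out-degree are materialized (on SSD in practice, an in-memory dict here for brevity), and every other node falls through to recomputation. `embed` and the hit-rate counters are illustrative stand-ins, not LEANN's real interface.

```python
# On-disk embedding cache for high-degree nodes (dict used for brevity);
# `embed` is a hypothetical on-the-fly recomputation call.
def build_cache(adj, embeddings, cache_fraction=0.10):
    hot = sorted(adj, key=lambda n: len(adj[n]), reverse=True)
    hot = hot[: int(len(hot) * cache_fraction)]   # highest-degree nodes
    return {n: embeddings[n] for n in hot}

class CachedEmbeddings:
    def __init__(self, cache, embed):
        self.cache, self.embed = cache, embed
        self.hits = self.misses = 0

    def get(self, node_id):
        if node_id in self.cache:      # hit: SSD read, no GPU recompute
            self.hits += 1
            return self.cache[node_id]
        self.misses += 1               # miss: recompute on the fly
        return self.embed(node_id)

    def hit_rate(self):
        total = self.hits + self.misses
        return self.hits / total if total else 0.0
```

Because graph traversal hits hub nodes disproportionately often, even a small cache fraction can capture a large share of accesses, which is the skew the text describes.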
{
- "text": "The two heuristic baselines are as follows: (1) Random Prune , which randomly removes 50% of the existing edges from the original graph; and (2) Small M , which directly constrains the maximum out-degree during graph construction, resulting in an average degree that is half that of the original graph.\nWe evaluate the performance of different graph structures on the NQ dataset by varying the search queue length \ud835\udc52\ud835\udc53 , aiming to determine the minimum number of embeddings that must be fetched to achieve various recall targets. As shown in Fig. 7, our pruning method introduced in \u00a75 achieves performance comparable to the original unpruned graph, despite using only half the edges. It outperforms the Random Prune baseline by up to 1 . 18 \u00d7 and the Small M baseline by up to 5 . 76 \u00d7 . We omit the Small M data points at 94% and 96% recall targets due to their poor performance.",
+ "text": "General Vector Search. Vector search primarily follows two paradigms: IVF [33] and proximity graphs [38]. IVF clusters vectors and probes relevant subsets during search, while graph-based methods such as HNSW [38], NSG [21], Vamana [59], and others [8, 20, 41] connect similar vectors to enable efficient traversal. Graph-based approaches are widely regarded as state of the art due to their favorable trade-offs between accuracy and efficiency [65]. Prior work has explored reducing graph size through learned neighbor selection [5, 73], but these methods are often impractical due to the high training cost and the need for labeled data.\nResource-Constrained Vector Search. Numerous efforts have aimed to reduce the memory footprint of vector search. Disk-based approaches such as DiskANN [59] store both vectors and graph structures on disk, leveraging in-memory compressed embeddings for navigation. Starling [64] improves I/O efficiency for disk-resident graphs, while FusionANNS [61] enables cost-effective search through coordinated use of SSD, CPU, and GPU resources. AiSAQ [60], LM-DiskANN [46] further minimizes DRAM usage by storing compressed embeddings directly on disk. EdgeRAG [55] alleviates memory pressure by generating embeddings online using an IVF-based index. However, it still incurs substantial storage overhead due to the need to maintain large clusters on disk as dictated by its design, and its performance degrades at scale owing to the high recomputation cost introduced by an inefficient index structure. An alternative approach is embedding compression, such as PQ [29], or more recent methods like RabitQ [23], which offers quantization with theoretical error bounds. Yet, these methods struggle to maintain high search accuracy under tight storage budgets. In contrast, LEANN integrates on-the-fly embedding recomputation with a graph-based index, incorporating highdegree preserving graph pruning and a specialized traversal algorithm optimized for edge devices.",
"metadata": {}
},
{
- "text": "Degree Distribution in Pruned Graphs. To better understand the effectiveness of our pruning strategy, we analyze the out-degree distributions of the original graph, our approach, Random Prune, and Small M. As discussed in \u00a75, our design explicitly aims to preserve high-degree 'hub' nodes. As shown in Fig. 8, it successfully retains a substantial number of such nodes, whereas the other two baselines fail to do so. This underscores the critical role of hub nodes in supporting efficient graph-based vector search, a finding that aligns with insights from prior work [39, 42, 51].\nFigure 9. [Ablation Study]: Latency on the A10 GPU and accuracy of a smaller embedding model evaluated on a 2Mchunk datastore, using a fixed search queue length of ef=50 . The smaller embedding model significantly reduces latency without causing a substantial drop in downstream accuracy.",
- "metadata": {}
- },
- {
- "text": "Using different embedding model sizes. Since the primary bottleneck of our system lies in the recomputation process, as shown in Fig. 11 later, we further explore the potential for latency reduction by adopting a smaller embedding model. Specifically, we replace the original contriever model (110M parameters) used in \u00a76.2 with the lightweight GTE-small model [36], which has only 34M parameters. We evaluate performance on a smaller 2M document datastore using a fixed search queue length of ef=50 , as shown in Fig. 9. The results show that GTE-small achieves a 2 . 3 \u00d7 speedup while maintaining downstream task accuracy within 2% of the Contriever baseline. This demonstrates the potential of LEANN to further reduce search latency by leveraging a lightweight embedding model.",
- "metadata": {}
- },
- {
- "text": "Relaxing disk constraint. As discussed in \u00a73, when disk storage constraints are relaxed, LEANN can materialize the embeddings of high-degree nodes to reduce recomputation overhead. This effectively builds an on-disk embedding cache, reducing the number of nodes that need to be recomputed at query time. For instance, storing just 10% of the original embeddings yields a 1 . 47 \u00d7 speedup, with a cache hit rate of up to 41.9%. This high cache hit rate arises from the skewed access pattern characteristic of graph-based traversal. However, the observed speedup does not fully align with the hit rate due to the non-negligible loading overhead introduced by SSDs with limited bandwidth.",
- "metadata": {}
- },
- {
- "text": "Graph-based recomputation breakdown. Fig. 11 breaks down the time cost of a single batch in graph-based recomputation into three stages, categorized by the primary system resource used. Each batch aggregates multiple hops of recomputation, as described in \u00a74.2. First, LEANN performs PQ lookups to select promising nodes, then retrieves and tokenizes the corresponding raw text. The tokenized inputs are sent to the embedding server. Finally, LEANN performs embedding recomputation and distance calculation.\nFigure 10. [Ablation Study]: Latency and cache hit rate comparison under varying storage constraints across four datasets. The x-axis indicates total storage size (graph size + cached embeddings on disk) and the corresponding percentage of cached embeddings.\nFigure 11. [Ablation Study]: Latency breakdown of a batch of requests in graph-based recomputation.\nAlthough embedding recomputation is the primary bottleneck in LEANN, accounting for 76% of total latency, the three stages-spanning I/O, CPU, and GPU resources-can potentially be overlapped to improve overall efficiency. We leave this optimization for future work.",
- "metadata": {}
- },
- {
- "text": "General Vector Search. Vector search primarily follows two paradigms: IVF [33] and proximity graphs [38]. IVF clusters vectors and probes relevant subsets during search, while graph-based methods such as HNSW [38], NSG [21], Vamana [59], and others [8, 20, 41] connect similar vectors to enable efficient traversal. Graph-based approaches are widely regarded as state of the art due to their favorable trade-offs between accuracy and efficiency [65]. Prior work has explored reducing graph size through learned neighbor selection [5, 73], but these methods are often impractical due to the high training cost and the need for labeled data.",
- "metadata": {}
- },
- {
- "text": "Resource-Constrained Vector Search. Numerous efforts have aimed to reduce the memory footprint of vector search. Disk-based approaches such as DiskANN [59] store both vectors and graph structures on disk, leveraging in-memory compressed embeddings for navigation. Starling [64] improves I/O efficiency for disk-resident graphs, while FusionANNS [61] enables cost-effective search through coordinated use of SSD, CPU, and GPU resources. AiSAQ [60], LM-DiskANN [46] further minimizes DRAM usage by storing compressed embeddings directly on disk. EdgeRAG [55] alleviates memory pressure by generating embeddings online using an IVF-based index. However, it still incurs substantial storage overhead due to the need to maintain large clusters on disk as dictated by its design, and its performance degrades at scale owing to the high recomputation cost introduced by an inefficient index structure. An alternative approach is embedding compression, such as PQ [29], or more recent methods like RabitQ [23], which offers quantization with theoretical error bounds. Yet, these methods struggle to maintain high search accuracy under tight storage budgets. In contrast, LEANN integrates on-the-fly embedding recomputation with a graph-based index,",
- "metadata": {}
- },
- {
- "text": "incorporating highdegree preserving graph pruning and a specialized traversal algorithm optimized for edge devices.\nVector Search Applications on Edge Devices. On-device vector search enables privacy-preserving, low-latency, and offline capabilities across diverse applications. On-device RAGsystems ground language models in personal document collections while maintaining data privacy [32, 53, 66, 72]. Personalized recommendation systems [69] match user profiles with item embeddings directly on the device, while content-based search over large collections of locally stored images and videos employs efficient vision embedding models [50] to generate vector representations for fast retrieval. These applications motivate the design of LEANN to enable efficient, low-overhead vector search on edge devices.",
+ "text": "Vector Search Applications on Edge Devices. On-device vector search enables privacy-preserving, low-latency, and offline capabilities across diverse applications. On-device RAGsystems ground language models in personal document collections while maintaining data privacy [32, 53, 66, 72]. Personalized recommendation systems [69] match user profiles with item embeddings directly on the device, while content-based search over large collections of locally stored images and videos employs efficient vision embedding models [50] to generate vector representations for fast retrieval. These applications motivate the design of LEANN to enable efficient, low-overhead vector search on edge devices.",
"metadata": {}
},
{
@@ -281,119 +173,59 @@
"metadata": {}
},
{
- "text": "- [1] Akari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. In The Twelfth International Conference on Learning Representations .\n- [3] AWS. 2025. Amazon EC2 G5 instance. https://aws.amazon.com/ec2/i nstance-types/mac/ . [Online; accessed April-2025].\n- [2] Martin Aum\u00fcller, Erik Bernhardsson, and Alexander Faithfull. 2020. ANN-Benchmarks: A benchmarking tool for approximate nearest neighbor algorithms. Information Systems 87 (2020), 101374.\n- [4] AWS. 2025. Amazon EC2 G5 instance. https://aws.amazon.com/ec2/i nstance-types/g5 . [Online; accessed April-2025].",
+ "text": "- [1] Akari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. In The Twelfth International Conference on Learning Representations .\n- [3] AWS. 2025. Amazon EC2 G5 instance. https://aws.amazon.com/ec2/i nstance-types/mac/ . [Online; accessed April-2025].\n- [2] Martin Aum\u00fcller, Erik Bernhardsson, and Alexander Faithfull. 2020. ANN-Benchmarks: A benchmarking tool for approximate nearest neighbor algorithms. Information Systems 87 (2020), 101374.\n- [4] AWS. 2025. Amazon EC2 G5 instance. https://aws.amazon.com/ec2/i nstance-types/g5 . [Online; accessed April-2025].\n- [6] Dongqi Cai, Shangguang Wang, Chen Peng, et al. 2024. Recall: Empowering Multimodal Embedding for Edge Devices. arXiv:2409.15342.\n- [5] Dmitry Baranchuk and Artem Babenko. 2019. Towards similarity graphs constructed by deep reinforcement learning. arXiv preprint arXiv:1911.12122 (2019).\n- [7] Pablo Castro. 2024. Announcing cost-effective RAG at scale with Azure AI Search. https://techcommunity.microsoft.com/blog/azure-aiservices-blog/announcing-cost-effective-rag-at-scale-with-azureai-search/4104961 .\n- [9] Davin Choo, Christoph Grunau, Julian Portmann, and V\u00e1clav Rozhon. 2020. k-means++: few more steps yield constant approximation. In International Conference on Machine Learning . PMLR, 1909-1917.",
"metadata": {}
},
{
- "text": "- [6] Dongqi Cai, Shangguang Wang, Chen Peng, et al. 2024. Recall: Empowering Multimodal Embedding for Edge Devices. arXiv:2409.15342.\n- [5] Dmitry Baranchuk and Artem Babenko. 2019. Towards similarity graphs constructed by deep reinforcement learning. arXiv preprint arXiv:1911.12122 (2019).\n- [7] Pablo Castro. 2024. Announcing cost-effective RAG at scale with Azure AI Search. https://techcommunity.microsoft.com/blog/azure-aiservices-blog/announcing-cost-effective-rag-at-scale-with-azureai-search/4104961 .\n- [9] Davin Choo, Christoph Grunau, Julian Portmann, and V\u00e1clav Rozhon. 2020. k-means++: few more steps yield constant approximation. In International Conference on Machine Learning . PMLR, 1909-1917.",
+ "text": "- [8] Qi Chen, Bing Zhao, Haidong Wang, Mingqin Li, Chuanjie Liu, Zengzhong Li, Mao Yang, and Jingdong Wang. 2021. SPANN: Highlyefficient Billion-scale Approximate Nearest Neighbor Search. In 35th Conference on Neural Information Processing Systems (NeurIPS 2021) .\n- [10] Together Computer. 2023. RedPajama: An Open Source Recipe to Reproduce LLaMA Training Dataset. https://github.com/togethercom puter/RedPajama-Data . Accessed: May 10, 2025.\n- [12] CPU-Monkey. n.d.. Apple M1 Ultra 64-Core GPU. https://www.cpumonkey.com/en/igpu-apple_m1_ultra_64_core . Accessed: 2025-05-10.\n- [11] KVCACHE.AI Contributors. 2025. KTransformers: A Flexible Framework for Experiencing Cutting-edge LLM Inference Optimizations. https://github.com/kvcache-ai/ktransformers . Accessed: 2025-05-14.\n- [13] Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, and Jimmy Lin. 2021. Ms marco: Benchmarking ranking models in the large-data regime. In proceedings of the 44th International ACM SIGIR conference on research and development in information retrieval . 15661576.\n- [15] Weihao Cui, Han Zhao, Quan Chen, Hao Wei, Zirui Li, Deze Zeng, Chao Li, and Minyi Guo. 2022. { DVABatch } : Diversity-aware { MultiEntry }{ Multi-Exit } batching for efficient processing of { DNN } services on { GPUs } . In 2022 USENIX Annual Technical Conference (USENIX ATC 22) . 183-198.",
"metadata": {}
},
{
- "text": "- [8] Qi Chen, Bing Zhao, Haidong Wang, Mingqin Li, Chuanjie Liu, Zengzhong Li, Mao Yang, and Jingdong Wang. 2021. SPANN: Highlyefficient Billion-scale Approximate Nearest Neighbor Search. In 35th Conference on Neural Information Processing Systems (NeurIPS 2021) .\n- [10] Together Computer. 2023. RedPajama: An Open Source Recipe to Reproduce LLaMA Training Dataset. https://github.com/togethercom puter/RedPajama-Data . Accessed: May 10, 2025.\n- [12] CPU-Monkey. n.d.. Apple M1 Ultra 64-Core GPU. https://www.cpumonkey.com/en/igpu-apple_m1_ultra_64_core . Accessed: 2025-05-10.",
+ "text": "- [14] Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, and Ellen M Voorhees. 2020. Overview of the TREC 2019 deep learning track. arXiv preprint arXiv:2003.07820 (2020).\n- [16] Matthijs Douze. 2020. Indexing 1T Vectors. https://github.com/faceb ookresearch/faiss/wiki/Indexing-1T-vectors .\n- [17] Matthijs Douze, Alexandr Guzhva, Chengqi Deng, Jeff Johnson, Gergely Szilvasy, Pierre-Emmanuel Mazar\u00e9, Maria Lomeli, Lucas Hosseini, and Herv\u00e9 J\u00e9gou. 2025. The Faiss library. arXiv:2401.08281 [cs.LG] https://arxiv.org/abs/2401.08281\n- [19] Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024).\n- [18] Matthijs Douze, Alexandre Sablayrolles, and Herv\u00e9 J\u00e9gou. 2018. Link and code: Fast indexing with graphs and compact regression codes. In Proceedings of the IEEE conference on computer vision and pattern recognition . 3646-3654.\n- [20] Cong Fu, Changxu Wang, and Deng Cai. 2021. High Dimensional Similarity Search with Satellite System Graph: Efficiency, Scalability, and Unindexed Query Compatibility. arXiv:1907.06146 [cs.IR] https: //arxiv.org/abs/1907.06146",
"metadata": {}
},
{
- "text": "- [11] KVCACHE.AI Contributors. 2025. KTransformers: A Flexible Framework for Experiencing Cutting-edge LLM Inference Optimizations. https://github.com/kvcache-ai/ktransformers . Accessed: 2025-05-14.\n- [13] Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, and Jimmy Lin. 2021. Ms marco: Benchmarking ranking models in the large-data regime. In proceedings of the 44th International ACM SIGIR conference on research and development in information retrieval . 15661576.\n- [15] Weihao Cui, Han Zhao, Quan Chen, Hao Wei, Zirui Li, Deze Zeng, Chao Li, and Minyi Guo. 2022. { DVABatch } : Diversity-aware { MultiEntry }{ Multi-Exit } batching for efficient processing of { DNN } services on { GPUs } . In 2022 USENIX Annual Technical Conference (USENIX ATC 22) . 183-198.",
+ "text": "- [22] Jianyang Gao and Cheng Long. 2023. High-Dimensional Approximate Nearest Neighbor Search: with Reliable and Efficient Distance Comparison Operations. Proc. ACM Manag. Data 1, 2, Article 137 (June 2023), 27 pages. https://doi.org/10.1145/3589282\n- [21] Cong Fu, Chao Xiang, Changxu Wang, and Deng Cai. 2019. Fast approximate nearest neighbor search with the navigating spreadingout graph. Proc. VLDB Endow. 12, 5 (Jan. 2019), 461-474. https: //doi.org/10.14778/3303753.3303754\n- [23] Jianyang Gao and Cheng Long. 2024. RabitQ: Quantizing HighDimensional Vectors with a Theoretical Error Bound for Approximate Nearest Neighbor Search. In Proceedings of the ACM on Management of Data (SIGMOD '24) , Vol. 2. Article 167.\n- [25] Alexandra Henzinger, Emma Dauterman, Henry Corrigan-Gibbs, and Nickolai Zeldovich. 2023. Private Web Search with Tiptoe. Cryptology ePrint Archive, Paper 2023/1438. https://doi.org/10.1145/3600006.36 13134\n- [24] Yanzhang He, Tara N. Sainath, Rohit Prabhavalkar, Ian McGraw, Raziel Alvarez, Ding Zhao, et al. 2019. Streaming End-to-End Speech Recognition for Mobile Devices. In Proc. IEEE ICASSP . 6381-6385.",
"metadata": {}
},
{
- "text": "- [14] Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, and Ellen M Voorhees. 2020. Overview of the TREC 2019 deep learning track. arXiv preprint arXiv:2003.07820 (2020).\n- [16] Matthijs Douze. 2020. Indexing 1T Vectors. https://github.com/faceb ookresearch/faiss/wiki/Indexing-1T-vectors .\n- [17] Matthijs Douze, Alexandr Guzhva, Chengqi Deng, Jeff Johnson, Gergely Szilvasy, Pierre-Emmanuel Mazar\u00e9, Maria Lomeli, Lucas Hosseini, and Herv\u00e9 J\u00e9gou. 2025. The Faiss library. arXiv:2401.08281 [cs.LG] https://arxiv.org/abs/2401.08281",
+ "text": "- [26] Piotr Indyk and Rajeev Motwani. 1998. Approximate nearest neighbors: towards removing the curse of dimensionality. In Proceedings of the Thirtieth Annual ACM Symposium on Theory of Computing (Dallas, Texas, USA) (STOC '98) . Association for Computing Machinery, New York, NY, USA, 604-613. https://doi.org/10.1145/276698.276876\n- [28] Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large scale distantly supervised challenge dataset for reading comprehension. arXiv preprint arXiv:1705.03551 (2017).\n- [27] Gautier Izacard, Mathilde Caron, Lucas Hosseini, Sebastian Riedel, Piotr Bojanowski, Armand Joulin, and Edouard Grave. 2021. Unsupervised dense information retrieval with contrastive learning. arXiv preprint arXiv:2112.09118 (2021).\n- [29] Herve J\u00e9gou, Matthijs Douze, and Cordelia Schmid. 2011. Product Quantization for Nearest Neighbor Search. IEEE Transactions on Pattern Analysis and Machine Intelligence 33, 1 (2011), 117-128. https://doi.or g/10.1109/TPAMI.2010.57",
"metadata": {}
},
{
- "text": "- [19] Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024).\n- [18] Matthijs Douze, Alexandre Sablayrolles, and Herv\u00e9 J\u00e9gou. 2018. Link and code: Fast indexing with graphs and compact regression codes. In Proceedings of the IEEE conference on computer vision and pattern recognition . 3646-3654.\n- [20] Cong Fu, Changxu Wang, and Deng Cai. 2021. High Dimensional Similarity Search with Satellite System Graph: Efficiency, Scalability, and Unindexed Query Compatibility. arXiv:1907.06146 [cs.IR] https: //arxiv.org/abs/1907.06146",
+ "text": "- [31] Tom Kwiatkowski, Jennimaria Palomaki, Olivia Redfield, Michael Collins, Ankur Parikh, Chris Alberti, Danielle Epstein, Illia Polosukhin, Jacob Devlin, Kenton Lee, Kristina Toutanova, Llion Jones, Matthew Kelcey, Ming-Wei Chang, Andrew M. Dai, Jakob Uszkoreit, Quoc Le, and Slav Petrov. 2019. Natural Questions: A Benchmark for Question Answering Research. Transactions of the Association for Computational Linguistics 7 (2019), 452-466. https://doi.org/10.1162/tacl_a_00276\n- [30] Vladimir Karpukhin, Barlas Oguz, Sewon Min, Patrick SH Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 2020. Dense Passage Retrieval for Open-Domain Question Answering.. In EMNLP (1) . 6769-6781.\n- [32] Chanhee Lee, Deeksha Prahlad, Dongha Kim, and Hokeun Kim. 2024. Work-in-Progress: On-device Retrieval Augmented Generation with",
"metadata": {}
},
{
- "text": "- [22] Jianyang Gao and Cheng Long. 2023. High-Dimensional Approximate Nearest Neighbor Search: with Reliable and Efficient Distance Comparison Operations. Proc. ACM Manag. Data 1, 2, Article 137 (June 2023), 27 pages. https://doi.org/10.1145/3589282\n- [21] Cong Fu, Chao Xiang, Changxu Wang, and Deng Cai. 2019. Fast approximate nearest neighbor search with the navigating spreadingout graph. Proc. VLDB Endow. 12, 5 (Jan. 2019), 461-474. https: //doi.org/10.14778/3303753.3303754",
+ "text": "- Knowledge Graphs for Personalized Large Language Models. In 2024 International Conference on Embedded Software (EMSOFT) . 1-1. https: //doi.org/10.1109/EMSOFT60242.2024.00006\n- [34] Muyang Li, Yujun Lin, Zhekai Zhang, Tianle Cai, Xiuyu Li, Junxian Guo, Enze Xie, Chenlin Meng, Jun-Yan Zhu, and Song Han. 2024. Svdqunat: Absorbing outliers by low-rank components for 4-bit diffusion models. arXiv preprint arXiv:2411.05007 (2024).\n- [33] Victor Lempitsky. 2012. The inverted multi-index. In Proceedings of the 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (CVPR '12) . IEEE Computer Society, USA, 3069-3076.\n- [35] Wen Li, Ying Zhang, Yifang Sun, Wei Wang, Mingjie Li, Wenjie Zhang, and Xuemin Lin. 2019. Approximate nearest neighbor search on high dimensional data-experiments, analyses, and improvement. IEEE Transactions on Knowledge and Data Engineering 32, 8 (2019), 14751488.\n- [37] Jimmy Lin, Rodrigo Nogueira, and Andrew Yates. 2022. Pretrained transformers for text ranking: Bert and beyond . Springer Nature.\n- [36] Zehan Li, Xin Zhang, Yanzhao Zhang, Dingkun Long, Pengjun Xie, and Meishan Zhang. 2023. Towards general text embeddings with multistage contrastive learning. arXiv preprint arXiv:2308.03281 (2023).\n- [38] Yu A Malkov and Dmitry A Yashunin. 2018. Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE transactions on pattern analysis and machine intelligence 42, 4 (2018), 824-836.",
"metadata": {}
},
{
- "text": "- [23] Jianyang Gao and Cheng Long. 2024. RabitQ: Quantizing HighDimensional Vectors with a Theoretical Error Bound for Approximate Nearest Neighbor Search. In Proceedings of the ACM on Management of Data (SIGMOD '24) , Vol. 2. Article 167.\n- [25] Alexandra Henzinger, Emma Dauterman, Henry Corrigan-Gibbs, and Nickolai Zeldovich. 2023. Private Web Search with Tiptoe. Cryptology ePrint Archive, Paper 2023/1438. https://doi.org/10.1145/3600006.36 13134\n- [24] Yanzhang He, Tara N. Sainath, Rohit Prabhavalkar, Ian McGraw, Raziel Alvarez, Ding Zhao, et al. 2019. Streaming End-to-End Speech Recognition for Mobile Devices. In Proc. IEEE ICASSP . 6381-6385.",
+ "text": "- [40] Microsoft Learn. 2025. Vector index size and staying under limits . https: //learn.microsoft.com/en-us/azure/search/vector-search-indexsize?utm_source=chatgpt.com&tabs=portal-vector-quota\n- [39] Magdalen Dobson Manohar, Zheqi Shen, Guy Blelloch, Laxman Dhulipala, Yan Gu, Harsha Vardhan Simhadri, and Yihan Sun. 2024. Parlayann: Scalable and deterministic parallel graph-based approximate nearest neighbor search algorithms. In Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming . 270-285.\n- [41] Javier Vargas Munoz, Marcos A Gon\u00e7alves, Zanoni Dias, and Ricardo da S Torres. 2019. Hierarchical clustering-based graphs for large scale approximate nearest neighbor search. Pattern Recognition 96 (2019), 106970.\n- [43] NVIDIA. n.d.. NVIDIA A10 Tensor Core GPU. https://www.nvidia.c om/en-us/data-center/products/a10-gpu/ . Accessed: 2025-05-10.\n- [42] Blaise Munyampirwa, Vihan Lakshman, and Benjamin Coleman. 2024. Down with the Hierarchy: The'H'in HNSW Stands for\" Hubs\". arXiv preprint arXiv:2412.01940 (2024).\n- [44] NVIDIA Corporation. 2024. NVIDIA RTX Blackwell GPU Architecture. https://images.nvidia.com/aem-dam/Solutions/geforce/blackwell/nv idia-rtx-blackwell-gpu-architecture.pdf . Whitepaper.\n- [46] Yu Pan, Jianxin Sun, and Hongfeng Yu. 2023. LM-DiskANN: Low Memory Footprint in Disk-Native Dynamic Graph-Based ANN Indexing. In 2023 IEEE International Conference on Big Data (BigData) . 5987-5996. https://doi.org/10.1109/BigData59044.2023.10386517",
"metadata": {}
},
{
- "text": "- [26] Piotr Indyk and Rajeev Motwani. 1998. Approximate nearest neighbors: towards removing the curse of dimensionality. In Proceedings of the Thirtieth Annual ACM Symposium on Theory of Computing (Dallas, Texas, USA) (STOC '98) . Association for Computing Machinery, New York, NY, USA, 604-613. https://doi.org/10.1145/276698.276876\n- [28] Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large scale distantly supervised challenge dataset for reading comprehension. arXiv preprint arXiv:1705.03551 (2017).",
+ "text": "- [45] ObjectBox Ltd. 2024. Edge AI: The era of on-device AI. https://obje ctbox.io/on-device-vector-databases-and-edge-ai/ . Accessed May 2025.\n- [47] Pinecone. n.d.. Vector Search: Hierarchical Navigable Small Worlds. https://www.pinecone.io/learn/series/faiss/hnsw/ . Accessed: 2025-05-10.\n- [49] Navid Rekabsaz, Oleg Lesota, Markus Schedl, Jon Brassey, and Carsten Eickhoff. 2021. TripClick: the log files of a large health web search engine. In Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval . 2507-2513.\n- [48] David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, and Samuel R Bowman. 2024. Gpqa: A graduate-level google-proof q&a benchmark. In First Conference on Language Modeling .",
"metadata": {}
},
{
- "text": "- [27] Gautier Izacard, Mathilde Caron, Lucas Hosseini, Sebastian Riedel, Piotr Bojanowski, Armand Joulin, and Edouard Grave. 2021. Unsupervised dense information retrieval with contrastive learning. arXiv preprint arXiv:2112.09118 (2021).\n- [29] Herve J\u00e9gou, Matthijs Douze, and Cordelia Schmid. 2011. Product Quantization for Nearest Neighbor Search. IEEE Transactions on Pattern Analysis and Machine Intelligence 33, 1 (2011), 117-128. https://doi.or g/10.1109/TPAMI.2010.57",
+ "text": ", 1 = Humphrey Shi. 2023. Efficient Neural Networks: From Algorithm Design to Practical Mobile Deployments. CVPR 2023 Tutorial. https: //snap-research.github.io/efficient-nn-tutorial/ .. [51], 1 = Jie Ren, Minjia Zhang, and Dong Li. 2020. HM-ANN: efficient billion- point nearest neighbor search on heterogeneous memory. In Proceed- ings of the 34th International Conference on Neural Information Process- ing Systems (Vancouver, BC, Canada) (NIPS '20) . Curran Associates Inc., Red Hook, NY, USA, Article 895, 13 pages.. [52], 1 = Facebook AI Research. n.d.. Guidelines to Choose an Index. https: //github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an- index/28074dc0ddc733f84b06fa4d99b3f6e2ef65613d#if-below-1m- vectors-ivfx . Accessed: 2025-05-10.. [53], 1 = Michael J. Ryan, Danmei Xu, Chris Nivera, and Daniel Campos. 2024. EnronQA: Towards Personalized RAG over Private Documents. arXiv preprint arXiv:2505.00263 (2024).. [54], 1 = Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip- filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021).. [55], 1 = Korakit Seemakhupt, Sihang Liu, and Samira Khan. 2024. EdgeRAG: Online-Indexed RAG for Edge Devices. arXiv preprint arXiv:2412.21023 (2024).. [56],",
"metadata": {}
},
{
- "text": "- [31] Tom Kwiatkowski, Jennimaria Palomaki, Olivia Redfield, Michael Collins, Ankur Parikh, Chris Alberti, Danielle Epstein, Illia Polosukhin, Jacob Devlin, Kenton Lee, Kristina Toutanova, Llion Jones, Matthew Kelcey, Ming-Wei Chang, Andrew M. Dai, Jakob Uszkoreit, Quoc Le, and Slav Petrov. 2019. Natural Questions: A Benchmark for Question Answering Research. Transactions of the Association for Computational Linguistics 7 (2019), 452-466. https://doi.org/10.1162/tacl_a_00276\n- [30] Vladimir Karpukhin, Barlas Oguz, Sewon Min, Patrick SH Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 2020. Dense Passage Retrieval for Open-Domain Question Answering.. In EMNLP (1) . 6769-6781.",
+ "text": "1 = Daniel Severo, Giuseppe Ottaviano, Matthew Muckley, Karen Ullrich, and Matthijs Douze. 2025. Lossless Compression of Vector IDs for Approximate Nearest Neighbor Search. arXiv preprint arXiv:2501.10479 (2025).. [57], 1 = Rulin Shao, Jacqueline He, Akari Asai, Weijia Shi, TimDettmers,Sewon Min, Luke Zettlemoyer, and Pang Wei WKoh. 2024. Scaling retrieval- based language models with a trillion-token datastore. Advances in Neural Information Processing Systems 37 (2024), 91260-91299.. [58], 1 = Michael Shen, Muhammad Umar, Kiwan Maeng, G. Edward Suh, and Udit Gupta. 2024. Towards Understanding Systems Trade-offs in Retrieval-Augmented Generation Model Inference. arXiv:2412.11854 [cs.AR] https://arxiv.org/abs/2412.11854. [59], 1 = Suhas Jayaram Subramanya, Devvrit, Rohan Kadekodi, Ravishankar Krishaswamy, and Harsha Vardhan Simhadri. 2019. DiskANN: fast accurate billion-point nearest neighbor search on a single node . Curran Associates Inc., Red Hook, NY, USA.. [60], 1 = Kento Tatsuno, Daisuke Miyashita, Taiga Ikeda, Kiyoshi Ishiyama, Kazunari Sumiyoshi, and Jun Deguchi. 2024. AiSAQ: All-in-Storage ANNS with Product Quantization for DRAM-free Information Re- trieval. arXiv preprint arXiv:2404.06004 (2024). arXiv:2404.06004 https://arxiv.org/abs/2404.06004. [61], 1 = Bing Tian, Haikun Liu, Yuhang Tang, Shihai Xiao, Zhuohui Duan, Xiaofei Liao, Hai Jin, Xuecang Zhang,",
"metadata": {}
},
{
- "text": "- [32] Chanhee Lee, Deeksha Prahlad, Dongha Kim, and Hokeun Kim. 2024. Work-in-Progress: On-device Retrieval Augmented Generation with",
+ "text": "Junhua Zhu, and Yu Zhang. 2025. Towards High-throughput and Low-latency Billion-scale Vector Search via CPU/GPU Collaborative Filtering and Re-ranking. In 23rd USENIX Conference on File and Storage Technologies (FAST 25) . USENIX Association, Santa Clara, CA, 171-185. https://www.usenix.org/con. [62], 1 = ference/fast25/presentation/tian-bing Vincent Totino. 2025. Phone Storage: How Much Do You Really Need? https://www.optimum.com/articles/mobile/choosing-phone- storage-amount-needs-guide. [63], 1 = Vincent Totino. 2025. Phone Storage: How Much Do You Really Need? https://www.optimum.com/articles/mobile/choosing-phone- storage-amount-needs-guide Accessed May 15, 2025.. [64], 1 = Mengzhao Wang, Weizhi Xu, Xiaomeng Yi, Songlin Wu, Zhangyang Peng, Xiangyu Ke, Yunjun Gao, Xiaoliang Xu, Rentong Guo, and Charles Xie. 2024. Starling: AnI/O-Efficient Disk-Resident Graph Index Framework for High-Dimensional Vector Similarity Search on Data Segment. In Proceedings of the ACM on Management of Data (SIGMOD",
"metadata": {}
},
{
- "text": "- Knowledge Graphs for Personalized Large Language Models. In 2024 International Conference on Embedded Software (EMSOFT) . 1-1. https: //doi.org/10.1109/EMSOFT60242.2024.00006\n- [34] Muyang Li, Yujun Lin, Zhekai Zhang, Tianle Cai, Xiuyu Li, Junxian Guo, Enze Xie, Chenlin Meng, Jun-Yan Zhu, and Song Han. 2024. Svdqunat: Absorbing outliers by low-rank components for 4-bit diffusion models. arXiv preprint arXiv:2411.05007 (2024).\n- [33] Victor Lempitsky. 2012. The inverted multi-index. In Proceedings of the 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (CVPR '12) . IEEE Computer Society, USA, 3069-3076.",
+ "text": "- [65] Peng Wang, Chen Wang, Xiaofang Lin, Wenjie Zhang, and Qing He. 2021. A Comprehensive Survey and Experimental Comparison of Graph-Based Approximate Nearest Neighbor Search. Proc. VLDB Endow. 14, 11 (2021), 1964-1978. https://doi.org/10.14778/3476249.347 6258\n- [67] Zhenliang Xue, Yixin Song, et al. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv preprint arXiv:2406.06282 (2024).\n- [66] Zijie J Wang and Duen Horng Chau. 2024. MeMemo: On-device Retrieval Augmentation for Private and Personalized Text Generation. In Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval . 2765-2770.\n- [68] Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William W Cohen, Ruslan Salakhutdinov, and Christopher D Manning. 2018. HotpotQA: A dataset for diverse, explainable multi-hop question answering. arXiv preprint arXiv:1809.09600 (2018).\n- [70] Weiping Yu, Ningyi Liao, Siqiang Luo, and Junfeng Liu. 2025. RAGDoll: Efficient Offloading-based Online RAG System on a Single GPU. arXiv preprint arXiv:2504.15302 (2025).\n- [69] Hongzhi Yin, Tong Chen, Liang Qu, and Bin Cui. 2024. On-Device Recommender Systems: A Comprehensive Survey. arXiv preprint arXiv:2401.11441 (2024).",
"metadata": {}
},
{
- "text": "- [35] Wen Li, Ying Zhang, Yifang Sun, Wei Wang, Mingjie Li, Wenjie Zhang, and Xuemin Lin. 2019. Approximate nearest neighbor search on high dimensional data-experiments, analyses, and improvement. IEEE Transactions on Knowledge and Data Engineering 32, 8 (2019), 14751488.\n- [37] Jimmy Lin, Rodrigo Nogueira, and Andrew Yates. 2022. Pretrained transformers for text ranking: Bert and beyond . Springer Nature.\n- [36] Zehan Li, Xin Zhang, Yanzhao Zhang, Dingkun Long, Pengjun Xie, and Meishan Zhang. 2023. Towards general text embeddings with multistage contrastive learning. arXiv preprint arXiv:2308.03281 (2023).",
- "metadata": {}
- },
- {
- "text": "- [38] Yu A Malkov and Dmitry A Yashunin. 2018. Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE transactions on pattern analysis and machine intelligence 42, 4 (2018), 824-836.\n- [40] Microsoft Learn. 2025. Vector index size and staying under limits . https: //learn.microsoft.com/en-us/azure/search/vector-search-indexsize?utm_source=chatgpt.com&tabs=portal-vector-quota\n- [39] Magdalen Dobson Manohar, Zheqi Shen, Guy Blelloch, Laxman Dhulipala, Yan Gu, Harsha Vardhan Simhadri, and Yihan Sun. 2024. Parlayann: Scalable and deterministic parallel graph-based approximate nearest neighbor search algorithms. In Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming . 270-285.",
- "metadata": {}
- },
- {
- "text": "- [41] Javier Vargas Munoz, Marcos A Gon\u00e7alves, Zanoni Dias, and Ricardo da S Torres. 2019. Hierarchical clustering-based graphs for large scale approximate nearest neighbor search. Pattern Recognition 96 (2019), 106970.\n- [43] NVIDIA. n.d.. NVIDIA A10 Tensor Core GPU. https://www.nvidia.c om/en-us/data-center/products/a10-gpu/ . Accessed: 2025-05-10.\n- [42] Blaise Munyampirwa, Vihan Lakshman, and Benjamin Coleman. 2024. Down with the Hierarchy: The'H'in HNSW Stands for\" Hubs\". arXiv preprint arXiv:2412.01940 (2024).\n- [44] NVIDIA Corporation. 2024. NVIDIA RTX Blackwell GPU Architecture. https://images.nvidia.com/aem-dam/Solutions/geforce/blackwell/nv idia-rtx-blackwell-gpu-architecture.pdf . Whitepaper.",
- "metadata": {}
- },
- {
- "text": "- [46] Yu Pan, Jianxin Sun, and Hongfeng Yu. 2023. LM-DiskANN: Low Memory Footprint in Disk-Native Dynamic Graph-Based ANN Indexing. In 2023 IEEE International Conference on Big Data (BigData) . 5987-5996. https://doi.org/10.1109/BigData59044.2023.10386517\n- [45] ObjectBox Ltd. 2024. Edge AI: The era of on-device AI. https://obje ctbox.io/on-device-vector-databases-and-edge-ai/ . Accessed May 2025.\n- [47] Pinecone. n.d.. Vector Search: Hierarchical Navigable Small Worlds. https://www.pinecone.io/learn/series/faiss/hnsw/ . Accessed: 2025-05-10.",
- "metadata": {}
- },
- {
- "text": "- [49] Navid Rekabsaz, Oleg Lesota, Markus Schedl, Jon Brassey, and Carsten Eickhoff. 2021. TripClick: the log files of a large health web search engine. In Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval . 2507-2513.\n- [48] David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, and Samuel R Bowman. 2024. Gpqa: A graduate-level google-proof q&a benchmark. In First Conference on Language Modeling .",
- "metadata": {}
- },
- {
- "text": ", 1 = Humphrey Shi. 2023. Efficient Neural Networks: From Algorithm Design to Practical Mobile Deployments. CVPR 2023 Tutorial. https: //snap-research.github.io/efficient-nn-tutorial/ .. [51], 1 = Jie Ren, Minjia Zhang, and Dong Li. 2020. HM-ANN: efficient billion- point nearest neighbor search on heterogeneous memory. In Proceed- ings of the 34th International Conference on Neural Information Process- ing Systems (Vancouver, BC, Canada) (NIPS '20) . Curran Associates Inc., Red Hook, NY, USA, Article 895, 13 pages.. [52], 1 = Facebook AI Research. n.d.. Guidelines to Choose an Index. https: //github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an- index/28074dc0ddc733f84b06fa4d99b3f6e2ef65613d#if-below-1m- vectors-ivfx . Accessed: 2025-05-10..",
- "metadata": {}
- },
- {
- "text": "[53], 1 = Michael J. Ryan, Danmei Xu, Chris Nivera, and Daniel Campos. 2024. EnronQA: Towards Personalized RAG over Private Documents. arXiv preprint arXiv:2505.00263 (2024).. [54], 1 = Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip- filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021).. [55], 1 = Korakit Seemakhupt, Sihang Liu, and Samira Khan. 2024. EdgeRAG: Online-Indexed RAG for Edge Devices. arXiv preprint arXiv:2412.21023 (2024)..",
- "metadata": {}
- },
- {
- "text": "[56], 1 = Daniel Severo, Giuseppe Ottaviano, Matthew Muckley, Karen Ullrich, and Matthijs Douze. 2025. Lossless Compression of Vector IDs for Approximate Nearest Neighbor Search. arXiv preprint arXiv:2501.10479 (2025).. [57], 1 = Rulin Shao, Jacqueline He, Akari Asai, Weijia Shi, TimDettmers,Sewon Min, Luke Zettlemoyer, and Pang Wei WKoh. 2024. Scaling retrieval- based language models with a trillion-token datastore. Advances in Neural Information Processing Systems 37 (2024), 91260-91299.. [58], 1 = Michael Shen, Muhammad Umar, Kiwan Maeng, G. Edward Suh, and Udit Gupta. 2024. Towards Understanding Systems Trade-offs in Retrieval-Augmented Generation Model Inference. arXiv:2412.11854 [cs.AR]",
- "metadata": {}
- },
- {
- "text": "https://arxiv.org/abs/2412.11854. [59], 1 = Suhas Jayaram Subramanya, Devvrit, Rohan Kadekodi, Ravishankar Krishaswamy, and Harsha Vardhan Simhadri. 2019. DiskANN: fast accurate billion-point nearest neighbor search on a single node . Curran Associates Inc., Red Hook, NY, USA.. [60], 1 = Kento Tatsuno, Daisuke Miyashita, Taiga Ikeda, Kiyoshi Ishiyama, Kazunari Sumiyoshi, and Jun Deguchi. 2024. AiSAQ: All-in-Storage ANNS with Product Quantization for DRAM-free Information Re- trieval. arXiv preprint arXiv:2404.06004 (2024). arXiv:2404.06004 https://arxiv.org/abs/2404.06004. [61], 1 = Bing Tian, Haikun Liu, Yuhang Tang, Shihai Xiao,",
- "metadata": {}
- },
- {
- "text": "Zhuohui Duan, Xiaofei Liao, Hai Jin, Xuecang Zhang, Junhua Zhu, and Yu Zhang. 2025. Towards High-throughput and Low-latency Billion-scale Vector Search via CPU/GPU Collaborative Filtering and Re-ranking. In 23rd USENIX Conference on File and Storage Technologies (FAST 25) . USENIX Association, Santa Clara, CA, 171-185. https://www.usenix.org/con. [62], 1 = ference/fast25/presentation/tian-bing Vincent Totino. 2025. Phone Storage: How Much Do You Really Need? https://www.optimum.com/articles/mobile/choosing-phone- storage-amount-needs-guide. [63], 1 = Vincent Totino. 2025. Phone Storage: How Much Do You Really Need? https://www.optimum.com/articles/mobile/choosing-phone- storage-amount-needs-guide Accessed May 15, 2025.. [64], 1 = Mengzhao Wang, Weizhi Xu, Xiaomeng Yi, Songlin Wu, Zhangyang Peng,",
- "metadata": {}
- },
- {
- "text": "Xiangyu Ke, Yunjun Gao, Xiaoliang Xu, Rentong Guo, and Charles Xie. 2024. Starling: AnI/O-Efficient Disk-Resident Graph Index Framework for High-Dimensional Vector Similarity Search on Data Segment. In Proceedings of the ACM on Management of Data (SIGMOD",
- "metadata": {}
- },
- {
- "text": "- [65] Peng Wang, Chen Wang, Xiaofang Lin, Wenjie Zhang, and Qing He. 2021. A Comprehensive Survey and Experimental Comparison of Graph-Based Approximate Nearest Neighbor Search. Proc. VLDB Endow. 14, 11 (2021), 1964-1978. https://doi.org/10.14778/3476249.347 6258\n- [67] Zhenliang Xue, Yixin Song, et al. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv preprint arXiv:2406.06282 (2024).\n- [66] Zijie J Wang and Duen Horng Chau. 2024. MeMemo: On-device Retrieval Augmentation for Private and Personalized Text Generation. In Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval . 2765-2770.",
- "metadata": {}
- },
- {
- "text": "- [68] Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William W Cohen, Ruslan Salakhutdinov, and Christopher D Manning. 2018. HotpotQA: A dataset for diverse, explainable multi-hop question answering. arXiv preprint arXiv:1809.09600 (2018).\n- [70] Weiping Yu, Ningyi Liao, Siqiang Luo, and Junfeng Liu. 2025. RAGDoll: Efficient Offloading-based Online RAG System on a Single GPU. arXiv preprint arXiv:2504.15302 (2025).\n- [69] Hongzhi Yin, Tong Chen, Liang Qu, and Bin Cui. 2024. On-Device Recommender Systems: A Comprehensive Survey. arXiv preprint arXiv:2401.11441 (2024).",
- "metadata": {}
- },
- {
- "text": "- [71] Hamed Zamani, Johanne R Trippas, Jeff Dalton, Filip Radlinski, et al. 2023. Conversational information seeking. Foundations and Trends\u00ae in Information Retrieval 17, 3-4 (2023), 244-456.\n- [73] Minjia Zhang, Wenhan Wang, and Yuxiong He. 2020. Learning to Anneal and Prune Proximity Graphs for Similarity Search. In International Conference on Learning Representations (ICLR) . Available at https://openreview.net/forum?id=HJlXC3EtwB .\n- [72] Saber Zerhoudi and Michael Granitzer. 2024. PersonaRAG: Enhancing Retrieval-Augmented Generation Systems with User-Centric Agents. arXiv preprint arXiv:2407.09394 (2024).",
- "metadata": {}
- },
- {
- "text": "- [74] Yanhao Zhang, Pan Pan, Yun Zheng, Kang Zhao, Yingya Zhang, Xiaofeng Ren, and Rong Jin. 2018. Visual search at alibaba. In Proceedings of the 24th ACM SIGKDD international conference on knowledge discovery & data mining . 993-1001.\n- [76] Kan Zhu, Yilong Zhao, Liangyu Zhao, Gefei Zuo, Yile Gu, Dedong Xie, Yufei Gao, Qinyu Xu, Tian Tang, Zihao Ye, et al. 2024. Nanoflow: Towards optimal large language model serving throughput. arXiv preprint arXiv:2408.12757 (2024).\n- [75] Jinhao Zhu, Liana Patel, Matei Zaharia, and Raluca Ada Popa. 2024. Compass: Encrypted Semantic Search with High Accuracy. Cryptology ePrint Archive, Paper 2024/1255. https://eprint.iacr.org/2024/1255",
- "metadata": {}
- },
- {
- "text": "- [77] Zilliz AI FAQ. 2025. How much memory overhead is typically introduced by indexes like HNSW or IVF? Accessed May 2025.",
+ "text": "- [71] Hamed Zamani, Johanne R Trippas, Jeff Dalton, Filip Radlinski, et al. 2023. Conversational information seeking. Foundations and Trends\u00ae in Information Retrieval 17, 3-4 (2023), 244-456.\n- [73] Minjia Zhang, Wenhan Wang, and Yuxiong He. 2020. Learning to Anneal and Prune Proximity Graphs for Similarity Search. In International Conference on Learning Representations (ICLR) . Available at https://openreview.net/forum?id=HJlXC3EtwB .\n- [72] Saber Zerhoudi and Michael Granitzer. 2024. PersonaRAG: Enhancing Retrieval-Augmented Generation Systems with User-Centric Agents. arXiv preprint arXiv:2407.09394 (2024).\n- [74] Yanhao Zhang, Pan Pan, Yun Zheng, Kang Zhao, Yingya Zhang, Xiaofeng Ren, and Rong Jin. 2018. Visual search at alibaba. In Proceedings of the 24th ACM SIGKDD international conference on knowledge discovery & data mining . 993-1001.\n- [76] Kan Zhu, Yilong Zhao, Liangyu Zhao, Gefei Zuo, Yile Gu, Dedong Xie, Yufei Gao, Qinyu Xu, Tian Tang, Zihao Ye, et al. 2024. Nanoflow: Towards optimal large language model serving throughput. arXiv preprint arXiv:2408.12757 (2024).\n- [75] Jinhao Zhu, Liana Patel, Matei Zaharia, and Raluca Ada Popa. 2024. Compass: Encrypted Semantic Search with High Accuracy. Cryptology ePrint Archive, Paper 2024/1255. https://eprint.iacr.org/2024/1255\n- [77] Zilliz AI FAQ. 2025. How much memory overhead is typically introduced by indexes like HNSW or IVF? Accessed May 2025.",
"metadata": {}
}
]