diff --git a/demo.ipynb b/demo.ipynb index 2346957..0aacafe 100644 --- a/demo.ipynb +++ b/demo.ipynb @@ -34,7 +34,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 2.82it/s]\n" + "Batches: 100%|██████████| 1/1 [00:00<00:00, 2.91it/s]\n" ] }, { @@ -48,7 +48,7 @@ "Writing bin: knowledge_disk.index_max_base_norm.bin\n", "bin: #pts = 1, #dims = 1, size = 12B\n", "Finished writing bin.\n", - "Time for preprocessing data for inner product: 0.000163 seconds\n", + "Time for preprocessing data for inner product: 0.000172 seconds\n", "Reading max_norm_of_base from knowledge_disk.index_max_base_norm.bin\n", "Reading bin file knowledge_disk.index_max_base_norm.bin ...\n", "Opening bin file knowledge_disk.index_max_base_norm.bin... \n", @@ -58,7 +58,7 @@ "! Using prepped_base file at knowledge_prepped_base.bin\n", "Starting index build: R=32 L=64 Query RAM budget: 4.02653e+09 Indexing ram budget: 8 T: 8\n", "getting bin metadata\n", - "Time for getting bin metadata: 0.000020 seconds\n", + "Time for getting bin metadata: 0.000019 seconds\n", "Compressing 769-dimensional data into 512 bytes per vector.\n", "Opened: knowledge_prepped_base.bin, size: 18464, cache_size: 18464\n", "Training data with 6 samples loaded.\n", @@ -86,17 +86,17 @@ "done.\n", "Loaded PQ pivot information\n", "Processing points [0, 6)...done.\n", - "Time for generating quantized data: 0.051329 seconds\n", + "Time for generating quantized data: 0.055587 seconds\n", "Full index fits in RAM budget, should consume at most 2.03973e-05GiBs, so building in one shot\n", "L2: Using AVX2 distance computation DistanceL2Float\n", "Passed, empty search_params while creating index config\n", "Using only first 6 from file.. \n", "Starting index build with 6 points... \n", - "0% of index build completed.Starting final cleanup..done. Link time: 0.00016s\n", + "0% of index build completed.Starting final cleanup..done. Link time: 0.00011s\n", "Index built with degree: max:5 avg:5 min:5 count(deg<2):0\n", "Not saving tags as they are not enabled.\n", - "Time taken for save: 0.00015s.\n", - "Time for building merged vamana index: 0.000982 seconds\n", + "Time taken for save: 0.000148s.\n", + "Time for building merged vamana index: 0.000836 seconds\n", "Opened: knowledge_prepped_base.bin, size: 18464, cache_size: 18464\n", "Vamana index file size=168\n", "Opened: knowledge_disk.index, cache_size: 67108864\n", @@ -111,11 +111,11 @@ "Finished writing bin.\n", "Output disk index file written to knowledge_disk.index\n", "Finished writing 28672B\n", - "Time for generating disk layout: 0.044798 seconds\n", + "Time for generating disk layout: 0.040268 seconds\n", "Opened: knowledge_prepped_base.bin, size: 18464, cache_size: 18464\n", "Loading base knowledge_prepped_base.bin. #points: 6. 
#dim: 769.\n", "Wrote 1 points to sample file: knowledge_sample_data.bin\n", - "Indexing time: 0.0974831\n", + "Indexing time: 0.0970594\n", "INFO: Leann metadata saved to knowledge.leann.meta.json\n" ] }, @@ -163,14 +163,14 @@ "Disk-Index File Meta-data: # nodes per sector: 1, max node len (bytes): 3100, max node degree: 5\n", "Disk-Index Meta: nodes per sector: 1, max node len: 3100, max node degree: 5\n", "Setting up thread-specific contexts for nthreads: 8\n", - "allocating ctx: 0x77203100b000 to thread-id:130971745651648\n", - "allocating ctx: 0x77202fc8c000 to thread-id:130971756137280\n", - "allocating ctx: 0x77202fc7b000 to thread-id:130971735166016\n", - "allocating ctx: 0x77202fc6a000 to thread-id:130983600146240\n", - "allocating ctx: 0x77202fc59000 to thread-id:130971766622912\n", - "allocating ctx: 0x77202fc48000 to thread-id:130971703709120\n", - "allocating ctx: 0x77202fc37000 to thread-id:130971714194752\n", - "allocating ctx: 0x77202fc26000 to thread-id:130971724680384\n", + "allocating ctx: 0x7a33f7204000 to thread-id:134367072315200\n", + "allocating ctx: 0x7a33f6805000 to thread-id:134355206802368\n", + "allocating ctx: 0x7a33f5e72000 to thread-id:134355217288000\n", + "allocating ctx: 0x7a33f5e61000 to thread-id:134355227773632\n", + "allocating ctx: 0x7a33f5e50000 to thread-id:134355196316736\n", + "allocating ctx: 0x7a33f5e3f000 to thread-id:134355164859840\n", + "allocating ctx: 0x7a33f5e2e000 to thread-id:134355175345472\n", + "allocating ctx: 0x7a33f5e1d000 to thread-id:134355185831104\n", "Loading centroid data from medoids vector data of 1 medoid(s)\n", "Reading bin file knowledge_disk.index_max_base_norm.bin ...\n", "Opening bin file knowledge_disk.index_max_base_norm.bin... \n", @@ -190,7 +190,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Batches: 100%|██████████| 1/1 [00:00<00:00, 56.33it/s]" + "Batches: 100%|██████████| 1/1 [00:00<00:00, 60.54it/s]" ] }, { @@ -200,7 +200,7 @@ "INFO: DiskANN ZMQ mode enabled - ensuring embedding server is running\n", "INFO: Starting session-level embedding server as a background process...\n", "INFO: Running command from project root: /home/ubuntu/LEANN_clean/leann\n", - "INFO: Server process started with PID: 313817\n" + "INFO: Server process started with PID: 424761\n" ] }, { @@ -224,100 +224,100 @@ "[EmbeddingServer LOG]: INFO: Loaded 6 demo documents\n", "[EmbeddingServer LOG]: INFO: ZMQ ROUTER server listening on port 5555\n", "[EmbeddingServer LOG]: INFO: Embedding server ready to serve requests\n", - "reserve ratio: [EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 3 bytes\n", - "1\n", - "Graph traversal completed, hops: 3\n", + "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 3 bytes\n", "[EmbeddingServer LOG]: INFO: Request for 1 node embeddings: [0]\n", "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 0\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000025 seconds\n", + "[EmbeddingServer LOG]: Time taken for text lookup: 0.000028 seconds\n", "[EmbeddingServer LOG]: INFO: Total batch size: 1, max_batch_size: 128\n", "[EmbeddingServer LOG]: INFO: Processing batch of size 1\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.019455 seconds\n", + "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.019294 seconds\n", "[EmbeddingServer LOG]: Batch size: 1, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000176 seconds\n", - "[EmbeddingServer LOG]: Time taken for 
embedding (batch): 3.062818 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.052912 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000228 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 3.136761 seconds\n", + "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000210 seconds\n", + "[EmbeddingServer LOG]: Time taken for embedding (batch): 3.065444 seconds\n", + "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.041810 seconds\n", + "[EmbeddingServer LOG]: INFO: Serialize time: 0.000194 seconds\n", + "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 3.128073 seconds\n", "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n", "[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [1, 2, 3, 4, 5]\n", "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 5\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000045 seconds\n", + "[EmbeddingServer LOG]: Time taken for text lookup: 0.000042 seconds\n", "[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n", "[EmbeddingServer LOG]: INFO: Processing batch of size 5\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001596 seconds\n", + "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001791 seconds\n", "[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000094 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 3.517292 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000360 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000155 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 3.520236 seconds\n", + "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000112 seconds\n", + "[EmbeddingServer LOG]: Time taken for embedding (batch): 3.674183 seconds\n", + "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000372 seconds\n", + "[EmbeddingServer LOG]: INFO: Serialize time: 0.000177 seconds\n", + "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 3.677425 seconds\n", "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n", "[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [3, 4, 2, 1, 0]\n", "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 4\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000031 seconds\n", + "[EmbeddingServer LOG]: Time taken for text lookup: 0.000030 seconds\n", "[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n", "[EmbeddingServer LOG]: INFO: Processing batch of size 5\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001254 seconds\n", + "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001550 seconds\n", "[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000076 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.009231 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000189 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000087 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.011405 seconds\n", + "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000097 seconds\n", + "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.009335 seconds\n", + "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 
0.000154 seconds\n", + "[EmbeddingServer LOG]: INFO: Serialize time: 0.000073 seconds\n", + "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.011773 seconds\n", "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n", "[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [0, 1, 2, 4, 5]\n", "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 5\n", "[EmbeddingServer LOG]: Time taken for text lookup: 0.000020 seconds\n", "[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n", "[EmbeddingServer LOG]: INFO: Processing batch of size 5\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000968 seconds\n", + "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001041 seconds\n", "[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000071 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008908 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000148 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000055 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010627 seconds\n", + "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000125 seconds\n", + "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008972 seconds\n", + "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000151 seconds\n", + "[EmbeddingServer LOG]: INFO: Serialize time: 0.000048 seconds\n", + "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010853 seconds\n", "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n", "[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [3, 1, 0, 2, 5]\n", "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 5\n", "[EmbeddingServer LOG]: Time taken for text lookup: 0.000020 seconds\n", "[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n", "[EmbeddingServer LOG]: INFO: Processing batch of size 5\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000959 seconds\n", + "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001350 seconds\n", "[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000071 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008822 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000151 seconds\n", + "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", + "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008869 seconds\n", + "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000146 seconds\n", "[EmbeddingServer LOG]: INFO: Serialize time: 0.000063 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010516 seconds\n", + "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.011054 seconds\n", "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n", "[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [0, 2, 3, 4, 5]\n", "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 5\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000019 seconds\n", + "[EmbeddingServer LOG]: Time taken for text lookup: 0.000022 seconds\n", "[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n", "[EmbeddingServer LOG]: INFO: Processing batch of size 5\n", - "[EmbeddingServer LOG]: Time taken for 
tokenization (batch): 0.001046 seconds\n", + "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001195 seconds\n", "[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000074 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008821 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000146 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000059 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010625 seconds\n", + "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000087 seconds\n", + "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008903 seconds\n", + "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000145 seconds\n", + "[EmbeddingServer LOG]: INFO: Serialize time: 0.000060 seconds\n", + "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010921 seconds\n", "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 7 bytes\n", "[EmbeddingServer LOG]: INFO: Request for 5 node embeddings: [1, 0, 3, 4, 5]\n", "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 5\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000019 seconds\n", + "[EmbeddingServer LOG]: Time taken for text lookup: 0.000020 seconds\n", "[EmbeddingServer LOG]: INFO: Total batch size: 5, max_batch_size: 128\n", "[EmbeddingServer LOG]: INFO: Processing batch of size 5\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000947 seconds\n", + "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.001188 seconds\n", "[EmbeddingServer LOG]: Batch size: 5, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000071 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008823 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000143 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000047 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010491 seconds\n", - "Score: -0.481 - C++ is a powerful programming language\n", + "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000087 seconds\n", + "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.008858 seconds\n", + "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000153 seconds\n", + "[EmbeddingServer LOG]: INFO: Serialize time: 0.000052 seconds\n", + "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.010886 seconds\n", + "reserve ratio: Score: -0.481 - C++ is a powerful programming language1\n", + "Graph traversal completed, hops: 3\n", + "\n", "Score: -1.049 - Java is a powerful programming language\n" ] }, @@ -325,847 +325,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: 
INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 3 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 1 node embeddings: [1]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 1\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000020 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 1, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 1\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.000475 seconds\n", - "[EmbeddingServer LOG]: Batch size: 1, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000245 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.005035 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean 
pooling (batch): 0.000223 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000076 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.006687 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [41, 3, 40, 35, 27, 11, 2, 34, 7, 17, 53, 36, 10, 5, 6, 23, 39, 38, 15, 29, 25, 20, 4, 13, 26, 37, 33, 28, 21, 32, 12, 50]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 2 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.006198 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000091 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043852 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000362 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000303 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.051755 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [35, 40, 25, 12, 1, 2, 20, 24, 53, 9, 3, 6, 36, 11, 27, 41, 34, 26, 29, 33, 28, 5, 21, 17, 13, 7, 32, 30, 15, 31, 39, 4]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000036 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.006460 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000094 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043777 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000364 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000125 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.051677 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [41, 14, 30, 27, 11, 7, 31, 15, 53, 0, 4, 5, 1, 36, 13, 37, 39, 40, 38, 3, 22, 29, 34, 32, 19, 33, 25, 26, 35, 28, 8, 50]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000035 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005619 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000119 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043553 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000359 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000145 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050564 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, 
size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [39, 4, 37, 34, 26, 41, 13, 15, 35, 32, 29, 50, 30, 1, 5, 27, 14, 3, 36, 23, 40, 12, 28, 7, 11, 22, 31, 2, 19, 33, 8, 17]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 50\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.006054 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043605 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000074 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.051038 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [37, 7, 41, 22, 50, 0, 36, 23, 39, 8, 11, 35, 53, 16, 30, 14, 1, 4, 15, 40, 38, 27, 13, 6, 19, 10, 3, 25, 34, 45, 29, 48]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000040 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005921 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000086 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043591 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000359 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000081 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050846 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [55, 54, 44, 45, 49, 46, 51, 8, 42, 52, 43, 50, 53, 47, 0, 2, 24, 5, 15, 36, 37, 41, 1, 6, 19, 9, 35, 16, 7, 17, 22, 26]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005599 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000090 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043636 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000369 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000118 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050578 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [27, 34, 20, 32, 3, 26, 2, 24, 22, 51, 31, 35, 1, 25, 12, 21, 41, 38, 33, 29, 40, 13, 10, 15, 4, 30, 14, 6, 17, 11, 37, 39]\n", - "[EmbeddingServer LOG]: DEBUG: Node 
ID range: 1 to 51\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000036 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005486 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000090 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043729 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000371 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000092 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050599 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [31, 3, 8, 32, 34, 12, 41, 21, 27, 2, 10, 24, 45, 13, 40, 1, 35, 29, 20, 28, 15, 4, 17, 14, 26, 25, 36, 38, 23, 37, 11, 30]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 45\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005491 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000090 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043680 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000072 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050465 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [26, 6, 30, 32, 40, 27, 3, 7, 10, 20, 15, 24, 55, 1, 28, 35, 8, 21, 29, 31, 11, 17, 33, 9, 5, 34, 23, 19, 2, 36, 22, 37]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000032 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005579 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000086 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043559 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000372 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000078 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050408 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [42, 53, 50, 49, 19, 55, 45, 47, 44, 54, 46, 51, 0, 41, 24, 17, 48, 52, 16, 1, 15, 5, 40, 2, 36, 35, 3, 6, 9, 37, 14, 34]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - 
"[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005751 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043583 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000354 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000072 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050691 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [48, 24, 52, 45, 44, 42, 51, 46, 43, 53, 47, 2, 55, 49, 50, 1, 6, 36, 19, 26, 35, 41, 9, 16, 25, 5, 40, 0, 3, 37, 8, 17]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005239 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000086 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043486 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000374 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000132 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050063 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [50, 46, 52, 0, 43, 55, 45, 44, 37, 26, 48, 42, 53, 54, 49, 5, 51, 19, 16, 1, 22, 36, 8, 41, 7, 14, 15, 11, 38, 4, 40, 25]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005200 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000087 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043580 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000364 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000136 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050169 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [48, 5, 17, 55, 49, 50, 43, 53, 46, 42, 47, 52, 24, 54, 44, 51, 15, 16, 0, 19, 37, 35, 8, 1, 22, 41, 6, 2, 9, 7, 36, 27]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005379 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - 
"[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043595 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000364 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000079 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050286 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [1, 10, 6, 26, 55, 18, 2, 7, 34, 38, 28, 21, 17, 53, 24, 19, 3, 41, 36, 11, 13, 40, 33, 5, 25, 27, 20, 32, 4, 31, 12, 15]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005289 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000087 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043688 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000358 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000075 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050273 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [15, 9, 5, 22, 13, 25, 29, 48, 18, 19, 7, 37, 16, 6, 21, 17, 41, 39, 14, 26, 27, 1, 32, 20, 38, 4, 34, 33, 36, 35, 11, 31]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 48\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000032 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005197 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000107 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043549 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000364 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000073 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050126 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [48, 53, 42, 47, 25, 43, 45, 50, 54, 52, 44, 49, 46, 35, 0, 19, 51, 1, 5, 6, 41, 37, 26, 7, 8, 11, 36, 2, 3, 22, 9, 16]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 54\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000035 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004973 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000098 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043565 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000362 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000070 seconds\n", - 
"[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050157 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [55, 44, 43, 24, 48, 50, 49, 45, 52, 46, 19, 2, 54, 47, 51, 53, 6, 1, 41, 5, 16, 37, 0, 9, 35, 17, 15, 7, 26, 25, 27, 8]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000035 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005029 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000096 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043512 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000359 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000127 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050025 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [1, 29, 37, 33, 49, 36, 34, 15, 23, 13, 25, 53, 41, 3, 5, 4, 10, 27, 14, 11, 35, 2, 7, 39, 38, 31, 28, 22, 6, 32, 12, 30]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000042 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005203 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000095 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043577 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000069 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050252 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [39, 41, 34, 32, 20, 8, 38, 12, 23, 15, 3, 27, 21, 0, 44, 4, 1, 22, 36, 33, 35, 31, 40, 5, 14, 19, 11, 7, 28, 37, 26, 29]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 44\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000057 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005057 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000095 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043559 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000358 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000071 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050047 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [3, 42, 26, 17, 16, 25, 27, 
28, 33, 32, 35, 1, 10, 31, 30, 6, 34, 2, 21, 29, 12, 20, 13, 38, 43, 15, 52, 4, 11, 54, 49, 44]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 54\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000035 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004832 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000093 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043604 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000365 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000072 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049907 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [21, 12, 28, 6, 18, 3, 15, 13, 34, 32, 2, 10, 29, 52, 17, 1, 33, 31, 25, 35, 27, 26, 9, 4, 40, 8, 19, 38, 39, 22, 36, 23]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 52\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004736 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000095 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043551 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000075 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049720 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [55, 50, 1, 0, 43, 45, 49, 54, 5, 44, 19, 48, 47, 52, 35, 41, 36, 42, 46, 40, 37, 11, 51, 16, 14, 7, 3, 10, 6, 27, 23, 39]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000052 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005280 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000097 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043612 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000069 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050364 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [20, 34, 2, 12, 21, 31, 17, 33, 15, 3, 35, 28, 13, 29, 26, 10, 27, 1, 8, 9, 32, 6, 4, 25, 19, 24, 30, 14, 22, 23, 7, 11]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 35\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - 
"[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004933 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000093 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043592 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000362 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000087 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049905 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [31, 12, 13, 1, 28, 16, 33, 38, 14, 29, 15, 26, 19, 18, 30, 24, 49, 3, 27, 41, 32, 35, 40, 4, 2, 20, 39, 21, 37, 10, 23, 11]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 49\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005100 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043713 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000358 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000132 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050184 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [8, 22, 17, 21, 41, 25, 45, 27, 14, 19, 7, 32, 6, 34, 2, 18, 0, 13, 1, 5, 20, 39, 38, 37, 40, 4, 9, 23, 31, 3, 29, 36]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 45\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005211 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000086 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043583 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000369 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000082 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050108 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [47, 43, 45, 36, 53, 55, 42, 49, 46, 37, 51, 19, 26, 48, 5, 44, 41, 54, 1, 7, 40, 52, 11, 35, 4, 0, 14, 22, 38, 23, 8, 6]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005326 seconds\n", - "[EmbeddingServer 
LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043689 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000359 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000077 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050398 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [51, 44, 54, 6, 0, 48, 49, 53, 47, 24, 55, 42, 45, 46, 43, 50, 2, 35, 1, 25, 16, 26, 9, 17, 36, 10, 15, 20, 19, 34, 3, 41]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005019 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000086 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043549 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000354 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000076 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049855 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [1, 33, 12, 26, 28, 24, 30, 35, 27, 4, 32, 21, 13, 34, 2, 29, 7, 19, 53, 41, 40, 20, 31, 10, 25, 36, 38, 6, 23, 15, 11, 39]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004843 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043592 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000077 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049748 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [41, 31, 23, 38, 3, 13, 27, 15, 7, 50, 0, 1, 39, 36, 5, 40, 34, 37, 14, 29, 32, 11, 22, 12, 35, 33, 26, 2, 28, 20, 19, 30]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 50\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000046 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004848 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000086 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043578 seconds\n", - "[EmbeddingServer LOG]: 
Time taken for mean pooling (batch): 0.000355 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000079 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049762 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [48, 35, 43, 45, 50, 42, 53, 52, 19, 55, 44, 54, 51, 46, 47, 40, 6, 1, 3, 36, 34, 26, 7, 2, 5, 41, 38, 27, 15, 25, 9, 10]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005006 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000087 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043567 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000356 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000067 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049898 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [27, 37, 25, 11, 31, 7, 17, 24, 46, 3, 23, 1, 14, 32, 5, 15, 28, 26, 40, 22, 33, 4, 34, 21, 8, 13, 10, 38, 6, 29, 20, 19]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 46\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000035 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004698 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043511 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000079 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049511 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [19, 0, 45, 9, 15, 5, 53, 24, 8, 22, 37, 6, 7, 14, 43, 50, 39, 25, 47, 34, 1, 23, 48, 13, 36, 26, 27, 32, 42, 55, 21, 11]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005009 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000087 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043560 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000356 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000067 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049864 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request 
from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [5, 35, 27, 9, 23, 1, 11, 8, 39, 31, 50, 37, 14, 41, 15, 22, 6, 36, 25, 4, 40, 3, 26, 38, 30, 13, 32, 29, 10, 19, 2, 34]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 50\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004949 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000099 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043564 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000354 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000087 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049865 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [32, 34, 33, 14, 4, 30, 27, 3, 12, 28, 13, 15, 18, 49, 21, 23, 29, 7, 41, 20, 40, 25, 26, 1, 35, 38, 2, 37, 17, 22, 11, 8]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 49\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000036 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004950 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000087 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043601 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000355 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000073 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049895 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [41, 40, 50, 0, 5, 23, 35, 11, 29, 34, 19, 1, 4, 39, 37, 13, 10, 3, 7, 38, 27, 2, 22, 15, 14, 53, 6, 33, 26, 8, 25, 31]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005066 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000090 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043597 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000358 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000077 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.050019 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [48, 42, 52, 46, 43, 49, 50, 6, 24, 55, 54, 45, 51, 53, 47, 35, 5, 1, 19, 41, 8, 16, 37, 36, 3, 9, 26, 0, 15, 2, 34, 40]\n", - "[EmbeddingServer 
LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000035 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.005022 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000114 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043570 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000364 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000069 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049988 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [1, 23, 30, 0, 5, 39, 25, 32, 53, 41, 7, 35, 37, 14, 36, 27, 40, 10, 13, 4, 38, 3, 22, 15, 6, 34, 29, 26, 33, 28, 17, 50]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000036 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004957 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000085 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043560 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000078 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049927 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [5, 15, 19, 14, 13, 21, 0, 30, 28, 29, 45, 37, 8, 41, 7, 39, 23, 36, 4, 1, 11, 32, 40, 16, 38, 27, 34, 31, 9, 3, 35, 25]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 45\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000035 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004693 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000095 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043676 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000356 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000077 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049752 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [25, 27, 35, 9, 24, 47, 6, 3, 28, 38, 7, 17, 50, 18, 1, 32, 29, 34, 10, 41, 21, 31, 13, 8, 15, 20, 33, 4, 19, 30, 36, 12]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 50\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 
32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004860 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000106 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043520 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000085 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049804 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [37, 23, 31, 15, 22, 7, 11, 40, 38, 19, 30, 0, 53, 5, 41, 34, 4, 13, 27, 1, 39, 36, 8, 33, 32, 28, 29, 35, 3, 16, 50, 25]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004716 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000087 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043595 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000356 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000070 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049714 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [52, 50, 48, 6, 24, 54, 44, 55, 49, 42, 45, 43, 46, 53, 47, 26, 9, 25, 35, 16, 2, 36, 1, 10, 3, 19, 5, 41, 40, 27, 7, 8]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004821 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000091 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043538 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000073 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049735 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [48, 44, 45, 47, 5, 24, 9, 54, 42, 55, 50, 49, 52, 43, 53, 51, 37, 1, 16, 19, 15, 6, 8, 41, 7, 36, 0, 40, 22, 35, 17, 11]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004873 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - 
"[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043572 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000353 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000075 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049889 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [20, 15, 32, 2, 12, 3, 17, 13, 6, 28, 26, 18, 49, 33, 22, 31, 34, 35, 1, 8, 25, 9, 41, 10, 27, 19, 40, 29, 30, 38, 7, 4]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 49\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004954 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043505 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000355 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000120 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049897 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [1, 4, 36, 5, 13, 23, 38, 15, 33, 43, 39, 40, 27, 11, 34, 53, 19, 37, 3, 35, 14, 22, 7, 32, 29, 2, 10, 31, 12, 28, 8, 26]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000041 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004878 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000087 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043611 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000364 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000075 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049876 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [35, 25, 9, 20, 52, 2, 7, 15, 40, 53, 24, 1, 26, 10, 21, 17, 3, 8, 5, 41, 19, 11, 36, 13, 12, 28, 34, 27, 16, 32, 29, 38]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000032 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004828 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000089 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043607 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000354 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000077 seconds\n", - 
"[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049797 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [31, 27, 21, 13, 33, 3, 28, 15, 19, 42, 34, 41, 12, 20, 25, 26, 17, 1, 38, 4, 35, 22, 29, 30, 23, 11, 8, 7, 37, 40, 14, 39]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 42\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004633 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043628 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000354 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000068 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049742 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [5, 14, 38, 47, 41, 22, 11, 34, 0, 50, 30, 16, 7, 23, 39, 36, 40, 1, 4, 15, 8, 19, 13, 27, 35, 32, 31, 33, 53, 3, 28, 29]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004749 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000089 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043532 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000355 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000076 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049615 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [20, 34, 3, 13, 33, 2, 18, 28, 10, 17, 21, 31, 32, 41, 1, 38, 15, 4, 27, 35, 40, 6, 39, 26, 29, 8, 25, 24, 23, 36, 14, 9]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 41\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000035 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004750 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043570 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000353 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000073 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049821 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [19, 8, 6, 17, 16, 21, 
26, 48, 15, 20, 25, 7, 22, 35, 1, 2, 10, 32, 3, 5, 13, 12, 27, 29, 33, 43, 28, 31, 55, 45, 42, 18]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004903 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043492 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000357 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000080 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049751 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [1, 30, 26, 28, 23, 29, 32, 4, 15, 34, 13, 7, 20, 24, 53, 41, 3, 25, 31, 11, 33, 40, 38, 5, 10, 35, 39, 14, 36, 12, 8, 50]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000043 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004987 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000089 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043620 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000353 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000070 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049956 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [13, 38, 5, 41, 15, 34, 43, 1, 4, 36, 37, 23, 7, 11, 22, 40, 27, 14, 8, 3, 35, 12, 29, 32, 19, 28, 20, 31, 33, 6, 53, 26]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004892 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000089 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043572 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000355 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000077 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049816 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [15, 2, 24, 21, 9, 1, 32, 25, 18, 45, 20, 6, 35, 8, 3, 33, 12, 34, 26, 27, 10, 40, 28, 19, 31, 30, 11, 38, 13, 22, 4, 29]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 45\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000034 seconds\n", - 
"[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004802 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043615 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000354 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000071 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049759 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [27, 40, 18, 3, 34, 15, 10, 11, 1, 36, 41, 33, 31, 23, 26, 4, 28, 25, 32, 20, 13, 38, 7, 35, 39, 2, 14, 6, 12, 37, 22, 19]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 41\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000032 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004797 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000086 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043515 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000362 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000073 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049675 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [16, 9, 22, 34, 43, 8, 13, 26, 0, 30, 15, 14, 37, 5, 6, 41, 32, 21, 38, 23, 1, 4, 20, 39, 7, 17, 25, 53, 3, 36, 27, 29]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 0 to 53\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004681 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043572 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000356 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000072 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049589 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [16, 5, 43, 53, 37, 22, 19, 14, 47, 36, 50, 7, 11, 8, 15, 55, 45, 48, 39, 23, 27, 42, 13, 1, 4, 9, 52, 30, 46, 44, 25, 32]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 55\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004735 seconds\n", - "[EmbeddingServer 
LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000088 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043556 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000356 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000068 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049612 seconds\n", - "[EmbeddingServer LOG]: INFO: Received ZMQ request from client 006b8b45, size 34 bytes\n", - "[EmbeddingServer LOG]: INFO: Request for 32 node embeddings: [1, 12, 52, 6, 34, 17, 18, 24, 43, 35, 10, 3, 21, 20, 40, 41, 36, 15, 33, 28, 31, 38, 4, 29, 7, 27, 32, 13, 25, 9, 26, 11]\n", - "[EmbeddingServer LOG]: DEBUG: Node ID range: 1 to 52\n", - "[EmbeddingServer LOG]: Time taken for text lookup: 0.000033 seconds\n", - "[EmbeddingServer LOG]: INFO: Total batch size: 32, max_batch_size: 128\n", - "[EmbeddingServer LOG]: INFO: Processing batch of size 32\n", - "[EmbeddingServer LOG]: Time taken for tokenization (batch): 0.004743 seconds\n", - "[EmbeddingServer LOG]: Batch size: 32, Sequence length: 256\n", - "[EmbeddingServer LOG]: Time taken for transfer to device (batch): 0.000087 seconds\n", - "[EmbeddingServer LOG]: Time taken for embedding (batch): 0.043615 seconds\n", - "[EmbeddingServer LOG]: Time taken for mean pooling (batch): 0.000354 seconds\n", - "[EmbeddingServer LOG]: INFO: Serialize time: 0.000074 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ E2E time: 0.049773 seconds\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - 
"[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", - "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", "[EmbeddingServer LOG]: INFO: ZMQ socket timeout, continuing to listen\n", diff --git a/examples/data/FairTree__OSDI_25_ (1).pdf b/examples/data/FairTree__OSDI_25_ (1).pdf new file mode 100644 index 0000000..bc10f4e Binary files /dev/null and b/examples/data/FairTree__OSDI_25_ (1).pdf differ diff --git a/examples/main_cli_example.py b/examples/main_cli_example.py index 8e452ae..be0cf96 100644 --- a/examples/main_cli_example.py +++ b/examples/main_cli_example.py @@ -21,7 +21,7 @@ file_extractor: dict[str, BaseReader] = { ".xlsx": reader, } node_parser = DoclingNodeParser( - chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=512) + chunker=HybridChunker(tokenizer="Qwen/Qwen3-Embedding-4B", max_tokens=64) ) documents = SimpleDirectoryReader( @@ -67,7 +67,7 @@ async def main(): print(f"\n[PHASE 2] Starting Leann chat session...") chat = LeannChat(index_path=INDEX_PATH) - query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead?" + query = "Based on the paper, what are the main techniques LEANN and DLPM explores to reduce the storage overhead?" 
print(f"You: {query}") chat_response = chat.ask(query, top_k=10, recompute_beighbor_embeddings=True) print(f"Leann: {chat_response}") diff --git a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py index 53e7e91..740678c 100644 --- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py +++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py @@ -192,6 +192,7 @@ class HNSWBuilder(LeannBackendBuilderInterface): build_kwargs = {**self.build_params, **kwargs} metric_str = build_kwargs.get("distance_metric", "mips").lower() metric_enum = get_metric_map().get(metric_str) + print('metric_enum', metric_enum,' metric_str', metric_str) if metric_enum is None: raise ValueError(f"Unsupported distance_metric '{metric_str}'.") @@ -217,7 +218,9 @@ class HNSWBuilder(LeannBackendBuilderInterface): faiss.normalize_L2(data) # Add vectors to index + print('starting to add vectors to index') index.add(data.shape[0], faiss.swig_ptr(data)) + print('vectors added to index') # Save index index_file = index_dir / f"{index_prefix}.index" diff --git a/test_pdf_index/pdf_documents.leann.meta.json b/test_pdf_index/pdf_documents.leann.meta.json index 3247879..38b478b 100644 --- a/test_pdf_index/pdf_documents.leann.meta.json +++ b/test_pdf_index/pdf_documents.leann.meta.json @@ -2,26 +2,130 @@ "version": "0.1.0", "backend_name": "diskann", "embedding_model": "facebook/contriever", - "num_chunks": 56, + "num_chunks": 826, "chunks": [ { - "text": "Yichuan Wang \u2020 , 1 , Shu Liu 1 , Zhifei Li 1 , Yongji Wu \u2020 , 1 , Ziming Mao 1 , Yilong Zhao 1 , Xiao Yan 2 , Zhiying Xu \u2217 , 3 , Yang Zhou 1 , 4 , Ion Stoica 1 , Sewon Min 1 , Matei Zaharia 1 , Joseph E. Gonzalez 1 1 UC Berkeley 2 CUHK 3 Amazon Web Services 4 UC Davis", + "text": "Yichuan Wang \u2020 , 1 , Shu Liu 1 , Zhifei Li 1 , Yongji Wu \u2020 , 1 , Ziming Mao 1 , Yilong Zhao 1 , Xiao Yan 2 , Zhiying Xu", "metadata": {} }, { - "text": "Embedding-based search is widely used in applications such as recommendation and retrieval-augmented generation (RAG). Recently, there is a growing demand to support these capabilities over personal data stored locally on devices. However, maintaining the necessary data structure associated with the embedding-based search is often infeasible due to its high storage overhead. For example, indexing 100 GB of raw data requires 150 to 700 GB of storage, making local deployment impractical. Reducing this overhead while maintaining search quality and latency becomes a critical challenge.\nsearch viable in these settings, we seek to reduce storage overhead to under 5% of the original data size. At the same time, any such reduction must preserve high search accuracy while maintaining reasonable search latency to ensure responsive, real-time search experiences.\nIn this paper, we present LEANN, a storage-efficient approximate nearest neighbor (ANN) search index optimized for resource-constrained personal devices. LEANN combines a compact graph-based structure with an efficient on-the-fly recomputation strategy to enable fast and accurate retrieval with minimal storage overhead. Our evaluation shows that LEANN reduces index size to under 5% of the original raw data, achieving up to 50 times smaller storage than standard indexes, while maintaining 90% top-3 recall in under 2 seconds on real-world question answering benchmarks.\nExisting solutions, however, fall short of this goal. 
Most ANN indices store full embeddings and index metadata on disk [65], requiring terabytes of storage to index hundreds of gigabytes of documents, far exceeding the capacity of edge devices. While compression techniques such as product quantization (PQ) [29] can reduce storage, they often come at the cost of degraded search accuracy or require increased search latency to achieve comparable results.", + "text": "\u2217 , 3 , Yang Zhou 1 , 4 , Ion Stoica 1 , Sewon Min 1 , Matei Zaharia 1 , Joseph E. Gonzalez 1 1 UC Berkeley 2 CUHK 3 Amazon Web Services 4 UC", "metadata": {} }, { - "text": "With the recent advances in AI [27, 37], embedding-based search now significantly outperforms traditional keywordbased search methods [30, 71] across many domains such as question answering, recommendation, and large-scale web applications such as search engines [14, 74]. These systems rely on dense vector representations to capture semantic similarity and use approximate nearest neighbor (ANN) search to retrieve relevant results efficiently. Recently, there has been growing interest in enabling such capabilities on edge devices like laptops or phones, enabling applications like personalized search, on-device assistants, and privacypreserving retrieval over local data [24, 32, 66, 69].\nHowever, ANN data structures introduce substantial storage overheads, often 1.5 to 7 \u00d7 the size of the original raw data [57]. While such overheads are acceptable in large-scale web application deployments, they pose a significant bottleneck when deploying ANN search on personal devices or when using large datasets. For example, a 2 \u00d7 storage overhead on a personal laptop is impractical. To make ANN\n*This work does not relate to the position at Amazon. \u2020Corresponding authors. Email: , .\nThe first insight is that in graph-based indexes like HNSW, a single query typically explores only a small subset of the embedding vectors to identify its nearest neighbors. As such, instead of storing these embeddings on disk, we can recompute them on the fly at search time. However, naive recomputation can still incur a high latency overhead. To address this challenge, LEANN introduces a two-level traversal algorithm that interleaves an approximate and an exact distance queue, while prioritizing the most promising candidates in the search process, thus reducing the number of recomputations. Additionally, LEANN also incorporates a dynamic batching mechanism that aggregates embedding computations across search hops, improving GPU utilization and thus minimizing recomputation latency.", + "text": "Davis", "metadata": {} }, { - "text": "In this paper, we tackle the challenge of reducing ANN storage overhead and present LEANN, a novel graph-based vector index designed for storage-constrained environments. Built on top of Hierarchical Navigable Small World (HNSW) [38], a widely adopted, state-of-the-art graph-based ANN index, LEANN introduces system and algorithm optimizations that reduce total index storage to under 5% of the original data size, while preserving low query latency and high retrieval accuracy. At its core, LEANN is driven by two key insights.\nHowever, even without storing embeddings, the index metadata (e.g., graph structure) itself can lead to non-trivial storage overhead relative to the original data size. For example, a typical HNSW index uses a node degree of 64, meaning each node stores 64 neighbor links. 
With 4 bytes per link, this results in 256 bytes of metadata per node, which normally accounts for more than 25% storage overhead of a common 256-token document chunk [57].\nThe second insight is that much of the graph index metadata is redundant: not all nodes and edges contribute equally\nto search accuracy. Based on this observation, LEANN introduces a high-degree preserving graph pruning strategy that removes low-utility edges while preserving high-degree 'hub' nodes that are essential for maintaining effective search paths. By retaining only structurally important components of the graph, LEANN significantly reduces the size of the index without sacrificing the quality of the retrieval.\n- \u00b7 We conduct the first study on enabling low-latency, highaccuracy search over personal data with minimal storage overhead on edge devices.\nWe implement LEANN on top of FAISS [17] and evaluate it on four popular information retrieval (IR) benchmarks: NQ [31], HotpotQA [68], TriviaQA [28], and GPQA [48]. These benchmarks have been widely used in evaluations of information retrieval systems. Our experiments span both an NVIDIA A10 workstation [43] and an M1-based Mac [3]. The results show that LEANN reduces storage consumption by more than 50 \u00d7 compared to state-of-the-art indexes while achieving competitive latency to achieve high accuracy. In summary, we make the following contributions:", + "text": "Embedding-based search is widely used in applications such as recommendation and retrieval-augmented generation (RAG). Recently, there is a growing demand to support these capabilities over personal data stored locally on devices. However, maintaining the necessary data structure associated with the embedding-based search is often infeasible due to its", "metadata": {} }, { - "text": "- \u00b7 We present LEANN, a compact graph-based ANN index that prunes redundant graph metadata by prioritizing preserving high degree nodes, and avoids storing embeddings by recomputing them on the fly. To minimize recomputation latency, LEANN also introduces a two-level search strategy with dynamic batching.\n- \u00b7 We show that LEANN can deliver 90% top-3 recall using less than 5% storage overhead relative to the raw data size, while the end-to-end search time is still less than 2 seconds on four benchmarks and various hardware platforms.", + "text": "high storage overhead. For example, indexing 100 GB of raw data requires 150 to 700 GB of storage, making local deployment impractical. Reducing this overhead while maintaining search quality and latency becomes a critical challenge.", + "metadata": {} + }, + { + "text": "search viable in these settings, we seek to reduce storage overhead to under 5% of the original data size. At the same time, any such reduction must preserve high search accuracy while maintaining reasonable search latency to ensure responsive, real-time search experiences.", + "metadata": {} + }, + { + "text": "In this paper, we present LEANN, a storage-efficient approximate nearest neighbor (ANN) search index optimized for resource-constrained personal devices. LEANN combines a compact graph-based structure with an efficient on-the-fly recomputation strategy to enable fast and accurate retrieval with minimal storage overhead. 
Our evaluation shows that LEANN", + "metadata": {} + }, + { + "text": "reduces index size to under 5% of the original raw data, achieving up to 50 times smaller storage than standard indexes, while maintaining 90% top-3 recall in under 2 seconds on real-world question answering benchmarks.", + "metadata": {} + }, + { + "text": "Existing solutions, however, fall short of this goal. Most ANN indices store full embeddings and index metadata on disk [65], requiring terabytes of storage to index hundreds of gigabytes of documents, far exceeding the capacity of edge devices. While compression techniques such as product quantization (PQ)", + "metadata": {} + }, + { + "text": "[29] can reduce storage, they often come at the cost of degraded search accuracy or require increased search latency to achieve comparable results.", + "metadata": {} + }, + { + "text": "With the recent advances in AI [27, 37], embedding-based search now significantly outperforms traditional keywordbased search methods [30, 71] across many domains such as question answering, recommendation, and large-scale web applications such as search engines [14,", + "metadata": {} + }, + { + "text": "74]. These systems rely on dense vector representations to capture semantic similarity and use approximate nearest neighbor (ANN) search to retrieve relevant results efficiently. Recently, there has been growing interest in enabling such capabilities on edge devices like laptops or phones, enabling applications like personalized search, on-device assistants, and", + "metadata": {} + }, + { + "text": "privacypreserving retrieval over local data [24, 32, 66, 69].", + "metadata": {} + }, + { + "text": "However, ANN data structures introduce substantial storage overheads, often 1.5 to 7 \u00d7 the size of the original raw data [57]. While such overheads are acceptable in large-scale web application deployments, they pose a significant bottleneck when deploying ANN search on personal devices or when using large", + "metadata": {} + }, + { + "text": "datasets. For example, a 2 \u00d7 storage overhead on a personal laptop is impractical. To make ANN", + "metadata": {} + }, + { + "text": "*This work does not relate to the position at Amazon. \u2020Corresponding authors. Email: , .", + "metadata": {} + }, + { + "text": "The first insight is that in graph-based indexes like HNSW, a single query typically explores only a small subset of the embedding vectors to identify its nearest neighbors. As such, instead of storing these embeddings on disk, we can recompute them on the fly at search time. However, naive recomputation", + "metadata": {} + }, + { + "text": "can still incur a high latency overhead. To address this challenge, LEANN introduces a two-level traversal algorithm that interleaves an approximate and an exact distance queue, while prioritizing the most promising candidates in the search process, thus reducing the number of recomputations. Additionally, LEANN also incorporates a dynamic", + "metadata": {} + }, + { + "text": "batching mechanism that aggregates embedding computations across search hops, improving GPU utilization and thus minimizing recomputation latency.", + "metadata": {} + }, + { + "text": "In this paper, we tackle the challenge of reducing ANN storage overhead and present LEANN, a novel graph-based vector index designed for storage-constrained environments. 
Built on top of Hierarchical Navigable Small World (HNSW) [38], a widely adopted, state-of-the-art graph-based", + "metadata": {} + }, + { + "text": "ANN index, LEANN introduces system and algorithm optimizations that reduce total index storage to under 5% of the original data size, while preserving low query latency and high retrieval accuracy. At its core, LEANN is driven by two key insights.", + "metadata": {} + }, + { + "text": "However, even without storing embeddings, the index metadata (e.g., graph structure) itself can lead to non-trivial storage overhead relative to the original data size. For example, a typical HNSW index uses a node degree of 64, meaning each node stores 64 neighbor links.", + "metadata": {} + }, + { + "text": "With 4 bytes per link, this results in 256 bytes of metadata per node, which normally accounts for more than 25% storage overhead of a common 256-token document chunk [57].", + "metadata": {} + }, + { + "text": "The second insight is that much of the graph index metadata is redundant: not all nodes and edges contribute equally", + "metadata": {} + }, + { + "text": "to search accuracy. Based on this observation, LEANN introduces a high-degree preserving graph pruning strategy that removes low-utility edges while preserving high-degree 'hub' nodes that are essential for maintaining effective search paths. By retaining only structurally important components of the graph, LEANN significantly reduces the size of the", + "metadata": {} + }, + { + "text": "index without sacrificing the quality of the retrieval.\n- \u00b7 We conduct the first study on enabling low-latency, highaccuracy search over personal data with minimal storage overhead on edge devices.", + "metadata": {} + }, + { + "text": "We implement LEANN on top of FAISS [17] and evaluate it on four popular information retrieval (IR) benchmarks: NQ [31], HotpotQA [68], TriviaQA [28], and GPQA [48]. These benchmarks have been widely used in evaluations", + "metadata": {} + }, + { + "text": "of information retrieval systems. Our experiments span both an NVIDIA A10 workstation [43] and an M1-based Mac [3]. The results show that LEANN reduces storage consumption by more than 50 \u00d7 compared to state-of-the-art indexes while achieving competitive latency to achieve high accuracy. In", + "metadata": {} + }, + { + "text": "summary, we make the following contributions:", + "metadata": {} + }, + { + "text": "- \u00b7 We present LEANN, a compact graph-based ANN index that prunes redundant graph metadata by prioritizing preserving high degree nodes, and avoids storing embeddings by recomputing them on the fly. To minimize recomputation latency, LEANN also introduces a two-level search strategy with dynamic batching.", + "metadata": {} + }, + { + "text": "- \u00b7 We show that LEANN can deliver 90% top-3 recall using less than 5% storage overhead relative to the raw data size, while the end-to-end search time is still less than 2 seconds on four benchmarks and various hardware platforms.", "metadata": {} }, { @@ -29,35 +133,239 @@ "metadata": {} }, { - "text": "Vector search systems rely on high-dimensional embeddings to enable semantic search across unstructured data. A core operation in such systems is the top\ud835\udc58 nearest neighbor search, where the goal is to find the \ud835\udc58 most similar vectors in a dataset to a given query vector. Formally, given a set of vectors \ud835\udc4b = { \ud835\udc65 1 , \ud835\udc65 2 , . . . 
, \ud835\udc65 \ud835\udc5b } \u2282 R \ud835\udc5a and a query vector \ud835\udc5e \u2208 R \ud835\udc5a , a top\ud835\udc58 nearest neighbor search aims to retrieve a set S \u2282 \ud835\udc4b of \ud835\udc58 vectors such that:\n\nwhere Dist (\u00b7 , \u00b7) denotes a distance or similarity metric (e.g., Euclidean distance or cosine similarity).\nWhile exact search guarantees retrieval of the true nearest neighbors, it becomes computationally prohibitive at scale. Approximate nearest neighbor (ANN) methods [33, 38] offer a trade-off by allowing minor inaccuracies in exchange for substantially lower query latency. The effectiveness of an ANN algorithm is typically measured by Recall@k, defined as:\n\nwhere S is the set of true top\ud835\udc58 neighbors returned by exact search, and S \u2032 is the set returned by the ANN method. This metric quantifies the fraction of relevant neighbors successfully retrieved. Applications such as retrieval-augmented generation (RAG) typically require high recall (e.g., \u2265 0 . 9) to preserve downstream task quality [58].\nTo accelerate ANN search, vector indexes organize embeddings using data structures that reduce the number of comparisons required. Generally, a vector index consists of two primary components: (1) the stored embedding vectors themselves, representing the data, and (2) the index structure (such as graph connections or cluster assignments) built upon these vectors to expedite the search. Both components contribute to the overall storage footprint. Two widely used classes of ANN indices are described below:\nCluster-based Index. Methods such as IVF [33] partition the dataset into clusters (or 'cells') using algorithms like K-means [9], grouping semantically similar vectors together. At query time, only the most relevant clusters are searched, reducing the overall number of comparisons.", + "text": "Vector search systems rely on high-dimensional embeddings to enable semantic search across unstructured data. A core operation in such systems is the top\ud835\udc58 nearest neighbor search, where the goal is to find the \ud835\udc58 most similar vectors in a dataset to a given query vector. Formally, given a", "metadata": {} }, { - "text": "Graph-based Index. Methods such as HNSW [38] and other proximity-graph-based approaches [21, 26, 59] construct a graph by linking each vector to its nearest neighbors. These indices are among the most effective for ANN search, typically requiring fewer distance computations to reach a target recall compared to cluster-based alternatives like IVF. Despite differences in graph construction, most methods rely on a best-first search (BFS) strategy to process ANN queries. We describe this algorithm in \u00a72.2.", + "text": "set of vectors \ud835\udc4b = { \ud835\udc65 1 , \ud835\udc65 2 , . . . , \ud835\udc65 \ud835\udc5b } \u2282 R \ud835\udc5a and a query vector \ud835\udc5e \u2208 R \ud835\udc5a , a top\ud835\udc58 nearest neighbor search aims to", "metadata": {} }, { - "text": "In Algorithm 1, we illustrate how BFS operates on a graphbased index. The search begins by placing the entry node \ud835\udc5d into a min-priority queue \ud835\udc36 , referred to as the candidate queue , which prioritizes nodes closer to the query vector \ud835\udc65 \ud835\udc5e . In each iteration (lines 4-9), the algorithm selects the closest node \ud835\udc50 from \ud835\udc36 and explores its neighbors. 
For each unvisited neighbor \ud835\udc5b , we extract its embedding, compute its distance to the query \ud835\udc65 \ud835\udc5e , and insert \ud835\udc5b into the visited set \ud835\udc49 , the candidate queue \ud835\udc36 , and the result set \ud835\udc45 .\nThe search terminates when the candidate queue \ud835\udc36 becomes empty or when the closest node in \ud835\udc36 is farther from the query than the farthest node in the result set \ud835\udc45 , indicating that further exploration is unlikely to improve the result. The parameter \ud835\udc52\ud835\udc53 controls how many candidates the algorithm considers during the search. It acts as a quality", + "text": "retrieve a set S \u2282 \ud835\udc4b of \ud835\udc58 vectors such that:\n\nwhere Dist (\u00b7 , \u00b7) denotes a distance or similarity metric (e.g., Euclidean distance or cosine similarity).", "metadata": {} }, { - "text": "- 1: Input: Graph \ud835\udc3a with entry node \ud835\udc5d , query \ud835\udc65 \ud835\udc5e , result size \ud835\udc58 , queue size \ud835\udc52\ud835\udc53 ( \ud835\udc58 \u2264 \ud835\udc52\ud835\udc53 )\n- 2: Output: Top- \ud835\udc58 approximate neighbors \ud835\udc45\n- 3: Initialize \ud835\udc36 \u2190{ \ud835\udc5d } , \ud835\udc45 \u2190{ \ud835\udc5d } , \ud835\udc49 \u2190{ \ud835\udc5d }\n- 4: while \ud835\udc36 \u2260 \u2205 and min ( \ud835\udc36. dist ) \u2264 max ( \ud835\udc45. dist ) do\n- 5: \ud835\udc50 \u2190 node in \ud835\udc36 with smallest distance to \ud835\udc65 \ud835\udc5e\n- 6: Remove \ud835\udc50 from \ud835\udc36\n- 7: for each neighbor \ud835\udc5b of \ud835\udc50 do\n8:\nif \ud835\udc5b \u2209 \ud835\udc49 then\n9:\nExtract Embedding \ud835\udc65 \ud835\udc5b\n10:\nCompute \ud835\udc51 = \ud835\udc37\ud835\udc56\ud835\udc60\ud835\udc61 ( \ud835\udc65 \ud835\udc5e , \ud835\udc65 \ud835\udc5b )\n11:\nAdd \ud835\udc5b to \ud835\udc49 , add \ud835\udc5b to \ud835\udc36 and \ud835\udc45 with distance \ud835\udc51\n12:\nif | \ud835\udc45 | > \ud835\udc52\ud835\udc53 then\n13:\nKeep only the \ud835\udc52\ud835\udc53 closest nodes in \ud835\udc45\n- 14: return top \ud835\udc58 closest nodes in \ud835\udc45\nknob : increasing \ud835\udc52\ud835\udc53 allows the algorithm to examine more candidates, improving recall at the expense of higher latency.\nGraph-based indexes converge quickly to the nearest neighbors for two main reasons: (1) During index construction, each vector is connected to a set of approximate neighbors, typically selected to be semantically similar. If a vector is close to the query, its neighbors are also likely to be close, allowing the search to rapidly move toward high-quality results. (2) The graph implicitly yields a much finer-grained partitioning of the vector space compared to IVF, enabling the search to examine significantly fewer candidates from the entire database [22, 26, 35, 38]. An illustrative example of this traversal process is shown in Fig. 1.", + "text": "While exact search guarantees retrieval of the true nearest neighbors, it becomes computationally prohibitive at scale. Approximate nearest neighbor (ANN) methods [33, 38] offer a trade-off by allowing minor inaccuracies in exchange for substantially lower query latency. The effectiveness of an ANN", "metadata": {} }, { - "text": "Local Vector Index System Requirement. Consumer devices, such as smart home appliances and personal workstations [32, 55, 66, 70], are heavily constrained in storage capacity [45, 62, 67]. 
At the same time, many downstream generative AI tasks rely heavily on similarity search over dense embeddings. However, embeddings are often significantly larger than the original raw data, typically causing up to a 7 \u00d7 storage blowup [40, 57, 77]. Unlike datacenter servers, which can dedicate hundreds of gigabytes or even terabytes to store uncompressed vector indexes [7, 16], consumer devices typically share a limited storage capacity with many colocating applications and media content [63]. This tight storage constraint makes it infeasible to store large-scale, uncompressed indexes and embeddings.\nAt the same time, these devices often support user-facing tasks such as large-scale document retrieval [32, 66] or offline semantic recall [6], where second-level latency (i.e., under 10 seconds) is generally acceptable. Usability typically degrades only when response times exceed this threshold.\nFigure 1. Best-First Search in graph-based index\nThis combination of stringent storage constraints (e.g., using less than 5% of the original data size) and moderately relaxed latency requirements opens up a distinct design space for on-device vector search: a highly storage-efficient index that exploits on-device compute resources (e.g., GPU) to achieve high recall within seconds.\nExisting System Limitations on Consumer Devices. Most vector search indexes, such as HNSW and IVF, are designed to optimize retrieval accuracy and latency under the assumption that the entire index, including full-precision embeddings, fits in DRAM. As a result, they are not suitable for DRAM-constrained environments such as consumer devices. Some recent methods [59, 64] reduce memory usage by storing compressed embeddings in DRAM for initial traversal. However, they still require accessing full-precision embeddings from disk for reranking, which incurs substantial storage overhead at query time.\nTo our knowledge, there is no prior system for vector index that has explicitly targeted consumer devices where storage footprint is a first-class objective. Our goal in this work is to design a vector search system that significantly reduces storage overhead, both for embeddings and index structures, while meeting the latency and recall requirements.", + "text": "algorithm is typically measured by Recall@k, defined as:\n", "metadata": {} }, { - "text": "Quantization-based methods, such as PQ [29], are the main approach for reducing storage by approximating embeddings using compact codebooks. While these techniques can shrink the embedding size dramatically, the inherent information loss from this lossy compression often degrades retrieval accuracy. This degradation means that critical vector distinctions can be permanently lost during quantization, making it impossible to achieve high target recall using only the compressed data, a limitation we experimentally demonstrate in \u00a76 and which is documented in the literature [59]. As a result, they struggle to balance storage efficiency with the high accuracy needed for quality retrieval.", + "text": "where S is the set of true top\ud835\udc58 neighbors returned by exact search, and S \u2032 is the set returned by the ANN method. This metric quantifies the fraction of relevant neighbors successfully retrieved. 
Applications such as retrieval-augmented generation (RAG) typically require high recall", "metadata": {} }, { - "text": "In this section, we provide an overview of the core techniques and show how LEANN incorporates them into its architecture.\nGraph-based Recomputation. In the HNSW structure that LEANN builds upon, each query requires embeddings for\nFigure 2. LEANN System Diagram. The system combines high-degree preserving graph pruning for minimal storage footprint with graph-based recomputation and two-level search with dynamic batching for efficient query processing (Steps 1-4).\nonly a small subset of nodes, specifically those in the candidate set \ud835\udc36 defined in Algorithm 1. This observation motivates LEANN to compute these embeddings at query time rather than storing all of them beforehand. Concretely, instead of loading precomputed embeddings as in line 9, we modify the system to recompute them during query execution without changing any algorithm.\nthe embedding server (an on-device component utilizing the original embedding model for recomputation, as illustrated in Fig. 2) to obtain their corresponding embeddings. To further improve GPU utilization and reduce latency, LEANN employs a dynamic batching strategy to schedule embedding computation tasks on the GPU (\u00a74.2).\nMain Techniques. This paradigm introduces two key challenges. First, naive on-demand recomputation of embeddings at query time can lead to high search latency. Second, although LEANN removes the need to store dense embeddings, the remaining graph metadata, particularly node connectivity information, can still account for a significant portion of total storage (for example, over 10 percent).\nLEANN offers two main techniques to address the challenges mentioned before. First, LEANN uses a two-level graph traversal algorithm and a dynamic batching mechanism to reduce recomputation latency (\u00a74). Second, LEANN deploys a high degree of preserving graph pruning technique to greatly reduce the storage needed for graph metadata (\u00a75).\nSystem Workflow. The end-to-end workflow incorporating the optimizations discussed above is shown in Fig. 2. Given a dataset of items, LEANN first computes the embeddings of all items to build a vector index for the dataset using an off-shelf graph-based index. While LEANN design is agnostic to any particular graph index, we focus on the commonly used HNSW. We discuss how LEANN can be applied to other graph indices in \u00a78.1.", + "text": "(e.g., \u2265 0 . 9) to preserve downstream task quality [58].", "metadata": {} }, { - "text": "After the index is built, LEANN discards the embeddings (dense vectors) of the items, while pruning the graph for offline storage with our high degree preserving graph pruning algorithm (\u00a75). The pruning algorithm aims to preserve important high-degree nodes, as we observe that node access patterns are highly skewed in practice: a small subset of nodes, often 'hub' nodes of high degree, are frequently visited, while many others contribute little to search quality. To serve a user query at runtime, LEANN applies a two-level search algorithm (described in \u00a74.1) to traverse the pruned graph, identifying and prioritizing promising nodes for efficient exploration. These selected nodes are then sent to\nFurthermore, when additional disk space is available, LEANN uses it to cache 'hub' nodes by prioritizing the embeddings of high-degree nodes. 
At runtime, LEANN recomputes embeddings only for nodes not in the cache and loads cached embeddings directly from disk.", + "text": "To accelerate ANN search, vector indexes organize embeddings using data structures that reduce the number of comparisons required. Generally, a vector index consists of two primary components: (1) the stored embedding vectors themselves, representing the data, and (2) the index structure (such as graph connections or cluster", + "metadata": {} + }, + { + "text": "assignments) built upon these vectors to expedite the search. Both components contribute to the overall storage footprint. Two widely used classes of ANN indices are described below:", + "metadata": {} + }, + { + "text": "Cluster-based Index. Methods such as IVF [33] partition the dataset into clusters (or 'cells') using algorithms like K-means [9], grouping semantically similar vectors together. At query time, only the most relevant clusters are searched, reducing the overall number of comparisons.", + "metadata": {} + }, + { + "text": "Graph-based Index. Methods such as HNSW [38] and other proximity-graph-based approaches [21, 26, 59] construct a graph by linking each vector to its nearest neighbors. These indices are among the most effective for ANN search, typically requiring fewer", + "metadata": {} + }, + { + "text": "distance computations to reach a target recall compared to cluster-based alternatives like IVF. Despite differences in graph construction, most methods rely on a best-first search (BFS) strategy to process ANN queries. We describe this algorithm in \u00a72.2.", + "metadata": {} + }, + { + "text": "In Algorithm 1, we illustrate how BFS operates on a graphbased index. The search begins by placing the entry node \ud835\udc5d into a min-priority queue \ud835\udc36 , referred to as the candidate queue , which prioritizes nodes closer", + "metadata": {} + }, + { + "text": "to the query vector \ud835\udc65 \ud835\udc5e . In each iteration (lines 4-9), the algorithm selects the closest node \ud835\udc50 from \ud835\udc36 and explores its neighbors. For each unvisited neighbor \ud835\udc5b , we extract", + "metadata": {} + }, + { + "text": "its embedding, compute its distance to the query \ud835\udc65 \ud835\udc5e , and insert \ud835\udc5b into the visited set \ud835\udc49 , the candidate queue \ud835\udc36 , and the result set \ud835\udc45 .", + "metadata": {} + }, + { + "text": "The search terminates when the candidate queue \ud835\udc36 becomes empty or when the closest node in \ud835\udc36 is farther from the query than the farthest node in the result set \ud835\udc45 , indicating that further exploration is unlikely to improve the", + "metadata": {} + }, + { + "text": "result. The parameter \ud835\udc52\ud835\udc53 controls how many candidates the algorithm considers during the search. It acts as a quality", + "metadata": {} + }, + { + "text": "- 1: Input: Graph \ud835\udc3a with entry node \ud835\udc5d , query \ud835\udc65 \ud835\udc5e , result size \ud835\udc58 , queue size \ud835\udc52\ud835\udc53 ( \ud835\udc58 \u2264 \ud835\udc52\ud835\udc53 )", + "metadata": {} + }, + { + "text": "- 2: Output: Top- \ud835\udc58 approximate neighbors \ud835\udc45\n- 3: Initialize \ud835\udc36 \u2190{ \ud835\udc5d } , \ud835\udc45 \u2190{ \ud835\udc5d } , \ud835\udc49 \u2190{ \ud835\udc5d }", + "metadata": {} + }, + { + "text": "- 4: while \ud835\udc36 \u2260 \u2205 and min ( \ud835\udc36. dist ) \u2264 max ( \ud835\udc45. 
dist ) do", + "metadata": {} + }, + { + "text": "- 5: \ud835\udc50 \u2190 node in \ud835\udc36 with smallest distance to \ud835\udc65 \ud835\udc5e\n- 6: Remove \ud835\udc50 from \ud835\udc36\n- 7: for each neighbor \ud835\udc5b of \ud835\udc50 do\n8:", + "metadata": {} + }, + { + "text": "if \ud835\udc5b \u2209 \ud835\udc49 then\n9:\nExtract Embedding \ud835\udc65 \ud835\udc5b\n10:\nCompute \ud835\udc51 = \ud835\udc37\ud835\udc56\ud835\udc60\ud835\udc61 ( \ud835\udc65 \ud835\udc5e , \ud835\udc65 \ud835\udc5b )", + "metadata": {} + }, + { + "text": "11:\nAdd \ud835\udc5b to \ud835\udc49 , add \ud835\udc5b to \ud835\udc36 and \ud835\udc45 with distance \ud835\udc51\n12:\nif | \ud835\udc45 | > \ud835\udc52\ud835\udc53 then\n13:", + "metadata": {} + }, + { + "text": "Keep only the \ud835\udc52\ud835\udc53 closest nodes in \ud835\udc45\n- 14: return top \ud835\udc58 closest nodes in \ud835\udc45", + "metadata": {} + }, + { + "text": "knob : increasing \ud835\udc52\ud835\udc53 allows the algorithm to examine more candidates, improving recall at the expense of higher latency.", + "metadata": {} + }, + { + "text": "Graph-based indexes converge quickly to the nearest neighbors for two main reasons: (1) During index construction, each vector is connected to a set of approximate neighbors, typically selected to be semantically similar. If a vector is close to the query, its neighbors are", + "metadata": {} + }, + { + "text": "also likely to be close, allowing the search to rapidly move toward high-quality results. (2) The graph implicitly yields a much finer-grained partitioning of the vector space compared to IVF, enabling the search to examine significantly fewer candidates from the entire database", + "metadata": {} + }, + { + "text": "[22, 26, 35, 38]. An illustrative example of this traversal process is shown in Fig. 1.", + "metadata": {} + }, + { + "text": "Local Vector Index System Requirement. Consumer devices, such as smart home appliances and personal workstations [32, 55, 66, 70], are heavily constrained in storage capacity [45, 62, 67]. At", + "metadata": {} + }, + { + "text": "the same time, many downstream generative AI tasks rely heavily on similarity search over dense embeddings. However, embeddings are often significantly larger than the original raw data, typically causing up to a 7 \u00d7 storage blowup [40, 57,", + "metadata": {} + }, + { + "text": "77]. Unlike datacenter servers, which can dedicate hundreds of gigabytes or even terabytes to store uncompressed vector indexes [7, 16], consumer devices typically share a limited storage capacity with many colocating applications and media content [63]. This tight", + "metadata": {} + }, + { + "text": "storage constraint makes it infeasible to store large-scale, uncompressed indexes and embeddings.", + "metadata": {} + }, + { + "text": "At the same time, these devices often support user-facing tasks such as large-scale document retrieval [32, 66] or offline semantic recall [6], where second-level latency (i.e., under 10 seconds) is generally acceptable. Usability", + "metadata": {} + }, + { + "text": "typically degrades only when response times exceed this threshold.\nFigure 1. 
Best-First Search in graph-based index", + "metadata": {} + }, + { + "text": "This combination of stringent storage constraints (e.g., using less than 5% of the original data size) and moderately relaxed latency requirements opens up a distinct design space for on-device vector search: a highly storage-efficient index that exploits on-device compute resources", + "metadata": {} + }, + { + "text": "(e.g., GPU) to achieve high recall within seconds.", + "metadata": {} + }, + { + "text": "Existing System Limitations on Consumer Devices. Most vector search indexes, such as HNSW and IVF, are designed to optimize retrieval accuracy and latency under the assumption that the entire index, including full-precision embeddings, fits in DRAM. As a result,", + "metadata": {} + }, + { + "text": "they are not suitable for DRAM-constrained environments such as consumer devices. Some recent methods [59, 64] reduce memory usage by storing compressed embeddings in DRAM for initial traversal. However, they still require accessing full-precision embeddings from disk for", + "metadata": {} + }, + { + "text": "reranking, which incurs substantial storage overhead at query time.", + "metadata": {} + }, + { + "text": "To our knowledge, there is no prior system for vector index that has explicitly targeted consumer devices where storage footprint is a first-class objective. Our goal in this work is to design a vector search system that significantly reduces storage overhead, both for embeddings and index structures, while", + "metadata": {} + }, + { + "text": "meeting the latency and recall requirements.", + "metadata": {} + }, + { + "text": "Quantization-based methods, such as PQ [29], are the main approach for reducing storage by approximating embeddings using compact codebooks. While these techniques can shrink the embedding size dramatically, the inherent information loss from this lossy compression often degrades retrieval accuracy.", + "metadata": {} + }, + { + "text": "This degradation means that critical vector distinctions can be permanently lost during quantization, making it impossible to achieve high target recall using only the compressed data, a limitation we experimentally demonstrate in \u00a76 and which is documented in the literature [59]. As a result,", + "metadata": {} + }, + { + "text": "they struggle to balance storage efficiency with the high accuracy needed for quality retrieval.", + "metadata": {} + }, + { + "text": "In this section, we provide an overview of the core techniques and show how LEANN incorporates them into its architecture.\nGraph-based Recomputation. In the HNSW structure that LEANN builds upon, each query requires embeddings for", + "metadata": {} + }, + { + "text": "Figure 2. LEANN System Diagram. The system combines high-degree preserving graph pruning for minimal storage footprint with graph-based recomputation and two-level search with dynamic batching for efficient query processing (Steps 1-4).", + "metadata": {} + }, + { + "text": "only a small subset of nodes, specifically those in the candidate set \ud835\udc36 defined in Algorithm 1. This observation motivates LEANN to compute these embeddings at query time rather than storing all of them beforehand. 
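A condensed Python sketch of the best-first search loop in Algorithm 1 (illustrative, not LEANN's actual implementation) makes the relevant hook visible: the only place the traversal touches vectors is the embedding lookup of line 9, here the `get_embedding` callback.

```python
import heapq

def best_first_search(graph, entry, query, k, ef, get_embedding, dist):
    """Algorithm 1 sketch. `get_embedding(node)` is the line-9 hook: it may read a
    stored vector or recompute one from the node's raw text chunk."""
    d0 = dist(query, get_embedding(entry))
    visited = {entry}
    candidates = [(d0, entry)]        # min-heap C: closest candidate first
    results = [(-d0, entry)]          # max-heap R (negated): farthest result first
    while candidates:
        d_c, c = heapq.heappop(candidates)
        if d_c > -results[0][0]:      # min(C.dist) > max(R.dist): stop exploring
            break
        for n in graph[c]:
            if n in visited:
                continue
            visited.add(n)
            d_n = dist(query, get_embedding(n))   # line 9: the only embedding access
            heapq.heappush(candidates, (d_n, n))
            heapq.heappush(results, (-d_n, n))
            if len(results) > ef:
                heapq.heappop(results)            # keep only the ef closest in R
    return [n for _, n in sorted((-d, n) for d, n in results)[:k]]
```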
Concretely, instead of loading precomputed embeddings as in line 9, we modify", + "metadata": {} + }, + { + "text": "the system to recompute them during query execution without changing any algorithm.", + "metadata": {} + }, + { + "text": "the embedding server (an on-device component utilizing the original embedding model for recomputation, as illustrated in Fig. 2) to obtain their corresponding embeddings. To further improve GPU utilization and reduce latency, LEANN employs a dynamic batching strategy to schedule embedding computation tasks on the GPU (\u00a74.2).", + "metadata": {} + }, + { + "text": "Main Techniques. This paradigm introduces two key challenges. First, naive on-demand recomputation of embeddings at query time can lead to high search latency. Second, although LEANN removes the need to store dense embeddings, the remaining graph metadata, particularly node connectivity information, can still account for a significant portion of total", + "metadata": {} + }, + { + "text": "storage (for example, over 10 percent).", + "metadata": {} + }, + { + "text": "LEANN offers two main techniques to address the challenges mentioned before. First, LEANN uses a two-level graph traversal algorithm and a dynamic batching mechanism to reduce recomputation latency (\u00a74). Second, LEANN deploys a high degree of preserving graph pruning technique to greatly reduce the storage needed for graph metadata", + "metadata": {} + }, + { + "text": "(\u00a75).", + "metadata": {} + }, + { + "text": "System Workflow. The end-to-end workflow incorporating the optimizations discussed above is shown in Fig. 2. Given a dataset of items, LEANN first computes the embeddings of all items to build a vector index for the dataset using an off-shelf graph-based index. While LEANN design is agnostic to", + "metadata": {} + }, + { + "text": "any particular graph index, we focus on the commonly used HNSW. We discuss how LEANN can be applied to other graph indices in \u00a78.1.", + "metadata": {} + }, + { + "text": "After the index is built, LEANN discards the embeddings (dense vectors) of the items, while pruning the graph for offline storage with our high degree preserving graph pruning algorithm (\u00a75). The pruning algorithm aims to preserve important high-degree nodes, as we observe that node access patterns are highly skewed in", + "metadata": {} + }, + { + "text": "practice: a small subset of nodes, often 'hub' nodes of high degree, are frequently visited, while many others contribute little to search quality. To serve a user query at runtime, LEANN applies a two-level search algorithm (described in \u00a74.1) to traverse the pruned", + "metadata": {} + }, + { + "text": "graph, identifying and prioritizing promising nodes for efficient exploration. These selected nodes are then sent to", + "metadata": {} + }, + { + "text": "Furthermore, when additional disk space is available, LEANN uses it to cache 'hub' nodes by prioritizing the embeddings of high-degree nodes. At runtime, LEANN recomputes embeddings only for nodes not in the cache and loads cached embeddings directly from disk.", "metadata": {} }, { @@ -65,91 +373,687 @@ "metadata": {} }, { - "text": "As shown in eq. (1), the overall latency of a single query in LEANN is largely determined by the number of nodes whose embeddings must be recomputed. To reduce this overhead, we introduce the Two-Level Search, a multi-fidelity distance computation framework that strategically varies computational intensity across different stages of the search. 
By using lightweight approximate computations to broadly evaluate candidates and applying exact computations only to the most promising ones, our approach reduces search latency while preserving search quality.\nAlgorithm 2 presents the complete algorithm. At each expansion step, we first compute approximate distances for all neighboring nodes using a lightweight method (line 12). We maintain an approximate queue ( \ud835\udc34\ud835\udc44 ), a priority queue that stores approximate distances for all nodes encountered throughout the search. Rather than computing exact distances for all neighbors of the current expansion node \ud835\udc63 , we define a re-ranking ratio \ud835\udc4e and extract the top \ud835\udc4e % of nodes from \ud835\udc34\ud835\udc44 . To avoid redundant computation, we exclude nodes that are already present in the exact queue ( \ud835\udc38\ud835\udc44 ). The resulting subset is denoted as \ud835\udc40 (line 14), for which we then compute exact distances.", + "text": "As shown in eq. (1), the overall latency of a single query in LEANN is largely determined by the number of nodes whose embeddings must be recomputed. To reduce this overhead, we introduce the Two-Level Search, a multi-fidelity distance computation framework that strategically", "metadata": {} }, { - "text": "1:, 1 = Input: query \ud835\udc5e , entry point \ud835\udc5d , re-ranking ratio \ud835\udc4e , result size \ud835\udc58 , search queue length \ud835\udc52\ud835\udc53. 2:, 1 = Output: \ud835\udc58 closest neighbors to \ud835\udc5e. 3:, 1 = \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u2190{ \ud835\udc5d } ; \ud835\udc34\ud835\udc44 \u2190\u2205 ; \ud835\udc38\ud835\udc44 \u2190{ \ud835\udc5d } ; \ud835\udc45 \u2190{ \ud835\udc5d }. 4:, 1 = while \ud835\udc38\ud835\udc44 \u2260 \u2205 do. 5:, 1 = \ud835\udc63 \u2190 extract closest element from \ud835\udc38\ud835\udc44 to \ud835\udc5e. 6:, 1 = \ud835\udc53 \u2190 get furthest element from \ud835\udc45 to \ud835\udc5e. 7:, 1 = if \ud835\udc51\ud835\udc56\ud835\udc60\ud835\udc61\ud835\udc4e\ud835\udc5b\ud835\udc50\ud835\udc52 ( \ud835\udc63,\ud835\udc5e ) > \ud835\udc51\ud835\udc56\ud835\udc60\ud835\udc61\ud835\udc4e\ud835\udc5b\ud835\udc50\ud835\udc52 ( \ud835\udc53 ,\ud835\udc5e ) then. 8:, 1 = break. 9:, 1 = for each \ud835\udc5b \u2208 neighbors( \ud835\udc63 ) do. 10:, 1 = if \ud835\udc5b \u2209 \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 then. 11:, 1 = \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u2190 \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u222a { \ud835\udc5b }. 12:, 1 = Calculate approximate distance \ud835\udc51 \ud835\udc4e\ud835\udc5d\ud835\udc5d\ud835\udc5f\ud835\udc5c\ud835\udc65 ( \ud835\udc5b,\ud835\udc5e ). 13:, 1 = \ud835\udc34\ud835\udc44 \u2190 \ud835\udc34\ud835\udc44 \u222a { \ud835\udc5b }. 14:, 1 = \ud835\udc40 \u2190 extract top \ud835\udc4e % from \ud835\udc34\ud835\udc44 that are not in \ud835\udc38\ud835\udc44. 15:, 1 = for each \ud835\udc5a \u2208 \ud835\udc40 do. 16:, 1 = Compute exact distance \ud835\udc51 \ud835\udc52\ud835\udc65\ud835\udc4e\ud835\udc50\ud835\udc61 ( \ud835\udc5a,\ud835\udc5e ). 17:, 1 = \ud835\udc38\ud835\udc44 \u2190 \ud835\udc38\ud835\udc44 \u222a { \ud835\udc5a } ; \ud835\udc45 \u2190 \ud835\udc45 \u222a { \ud835\udc5a }. 18:, 1 = if | \ud835\udc45 | > \ud835\udc52\ud835\udc53 then. 
19:, 1 = Remove furthest element from \ud835\udc45 to \ud835\udc5e.", + "text": "varies computational intensity across different stages of the search. By using lightweight approximate computations to broadly evaluate candidates and applying exact computations only to the most promising ones, our approach reduces search latency while preserving search quality.", "metadata": {} }, { - "text": "20:, 1 = return top \ud835\udc58 elements from \ud835\udc45\nBecause \ud835\udc34\ud835\udc44 globally tracks all previously encountered nodes, the algorithm can revisit earlier neighbors that become more promising as the search progresses. As a result, even when all immediate neighbors in the current iteration are far from the query, the algorithm can still select previously seen but unexplored nodes that are now ranked higher.\nThe core insight of this design is to combine the complementary strengths of approximate and exact distance computations. Approximate distances, though not fully accurate, are often sufficient to surface the most relevant candidates near the top, enabling early pruning of unpromising directions. We exploit this by using approximate distances to evaluate neighbors during traversal, and exact distances to re-rank only the most promising candidates. This approach achieves high recall while substantially reducing computational cost, thereby lowering overall latency.\nAt the end of each iteration, nodes in \ud835\udc40 with computed exact distances are inserted into \ud835\udc38\ud835\udc44 , which serves as the candidate pool for subsequent expansions. We repeat this process iteratively, and in each iteration, the number of nodes requiring recomputation is further reduced.\nFor efficient approximate distance calculation, we employ PQ, a widely used technique that compresses the embedding space by several orders of magnitude. In our setting, we use only 2GB of PQ-compressed embeddings to represent the original 200GB of full-precision data, resulting in minimal storage overhead. Although PQ introduces some accuracy loss, our framework compensates by applying exact computations to a small subset of high-ranking candidates, thereby preserving end-to-end search quality.\nFinally, our method is flexible and generalizable. It can incorporate alternative lightweight approximation techniques beyond quantization. For instance, distillation-based embeddings or link-and-code representations [18] can be used, provided they offer sufficient efficiency. This adaptability makes the Two-Level Search paradigm applicable across diverse computational budgets and deployment scenarios.", + "text": "Algorithm 2 presents the complete algorithm. At each expansion step, we first compute approximate distances for all neighboring nodes using a lightweight method (line 12). We maintain an approximate queue ( \ud835\udc34\ud835\udc44 ), a priority queue that stores approximate distances for all nodes encountered", "metadata": {} }, { - "text": "During the search process, GPU resources are often underutilized because each expansion step only triggers recomputation for a small number of nodes, typically equal to the degree of the current node \ud835\udc63 . This problem is further exacerbated when using the Two Level Search algorithm (see line 16), where the candidate set is even more selective, resulting in smaller batch sizes. 
As a result, LEANN frequently fails to meet the minimum batch size required to saturate GPU throughput, leading to inefficient use of hardware resources at runtime.\nTo address this, LEANN introduces a dynamic batching strategy that slightly relaxes the strict data dependency in best-first search in Algorithm 1. While this introduces minor staleness in the expansion order, it significantly increases the batch size for the embedding model, thereby reducing the end-to-end latency per query.\nThis leads to a key challenge: how can we design an algorithm that fully utilizes GPU compute capacity and takes advantage of batch processing [15, 76] without sacrificing search efficiency?\nSpecifically, LEANN breaks the strict data dependency in best-first search, where the current node to be expanded depends on the immediate results of the previous expansion, by dynamically collecting a group of the closest candidates from the priority queue. The algorithm accumulates neighbors, that is, nodes requiring recomputation, until a target batch size is reached (for example, 64 for the A10 GPU), which can be efficiently determined through lightweight offline profiling. This dynamic batching mechanism integrates naturally with the Two-Level Search described in \u00a74.1. We accumulate nodes in the set \ud835\udc40 across iterations until the predefined batch size threshold is reached, at which point we perform embedding recomputation for all nodes in \ud835\udc40 .\nThis idea shares a similar insight with the beam search strategy used in DiskANN [59], where a fixed number of round-trip node accesses are batched together to amortize disk access latency. However, unlike DiskANN's fixed beam width, LEANN uses dynamic batching based on the degrees of current candidates, reducing staleness and offering greater flexibility for our setting. Furthermore, while DiskANN aims to reduce I/O latency, our dynamic batching strategy focuses on maximizing GPU utilization. As a result, LEANN adopts a", + "text": "throughout the search. Rather than computing exact distances for all neighbors of the current expansion node \ud835\udc63 , we define a re-ranking ratio \ud835\udc4e and extract the top \ud835\udc4e % of nodes from \ud835\udc34\ud835\udc44 . 
To avoid redundant computation, we exclude", "metadata": {} }, { - "text": "- 1: Input: Original graph \ud835\udc3a with the set of vertices \ud835\udc49 , candidate list size \ud835\udc52\ud835\udc53 , connection number threshold \ud835\udc40 for high degree nodes and \ud835\udc5a for other nodes, where \ud835\udc5a < \ud835\udc40 , percentage of high degree nodes \ud835\udc4e\n- 3: \u2200 \ud835\udc63 \u2208 \ud835\udc49 : \ud835\udc37 [ \ud835\udc63 ] \u2190 degree of \ud835\udc63 of \ud835\udc3a , \ud835\udc3a 1 \u2190 empty graph\n- 2: Output: Pruned graph \ud835\udc3a 1\n- 4: \ud835\udc49 \u2217 \u2190 nodes with the top \ud835\udc4e % highest (out) degree in \ud835\udc37\n- 5: for \ud835\udc63 \u2208 \ud835\udc49 do\n- 6: \ud835\udc4a \u2190 search( \ud835\udc63 , \ud835\udc52\ud835\udc53 )\n- \u22b2 Refer to Algorithm 1\n- 7: if \ud835\udc63 \u2208 \ud835\udc49 \u2217 then\n- 8: \ud835\udc40 0 \u2190 \ud835\udc40\n- 9: else\n- 10: \ud835\udc40 0 \u2190 \ud835\udc5a\n- 11: Select \ud835\udc40 0 neighbors from \ud835\udc4a using original heuristic 12: Add bidirectional edges between \ud835\udc63 and neighbors to\n\ud835\udc3a\n- 13:\n1\nShrink edges if \u2203 \ud835\udc5e \u2208 neighbor and \ud835\udc37 \ud835\udc5c\ud835\udc62\ud835\udc61 ( \ud835\udc5e ) > \ud835\udc40\ndifferent optimization objective: rather than minimizing disk access, it prioritizes efficient GPU usage to reduce end-to-end latency.", + "text": "nodes that are already present in the exact queue ( \ud835\udc38\ud835\udc44 ). The resulting subset is denoted as \ud835\udc40 (line 14), for which we then compute exact distances.", "metadata": {} }, { - "text": "With the Two-Level Search and dynamic batching mechanisms in place to optimize recomputation latency, we now examine how LEANN reduces the storage costs associated with graph metadata through a high degree preserving graph pruning algorithm.\nIn datacenter environments, this overhead is typically acceptable: storage is relatively inexpensive, and the operational costs of index maintenance (e.g., updates, rebuilds, and monitoring) are manageable. In contrast, consumer devices are often storage-constrained, making even the metadata footprint of the index structure a significant concern.\nAs discussed in \u00a73, while LEANN avoids storing exact embeddings by recomputing them at query time, the graph metadata used to guide the search process can still introduce substantial overhead. For example, in the datastore described by [56], the index structure alone accounts for over 30% of the total storage footprint.\nTo address this, LEANN allows users to specify a disk usage constraint \ud835\udc36 . When the metadata size exceeds this threshold, LEANN invokes a graph pruning algorithm that reduces the number of edges while preserving high-degree nodes. This design maintains retrieval accuracy and avoids significant increases in query-time latency, while substantially reducing the metadata footprint.\nThe graph, stored in a compressed sparse row (CSR) format, for example, consumes space proportional to the total\nFigure 3. Node access probability per query\nnumber of edges, i.e., the number of nodes times their average degree 2 . Since each node corresponds to a fixed chunk of text, the number of nodes is fixed given the text. The key challenge, then, is to reduce the average node degree without significantly compromising search latency. 
We formalize this optimization problem as follows: given a storage budget, construct a graph that maximizes search efficiency.\n\nHere, \ud835\udc5b denotes the number of nodes in the graph, corresponding to the number of text or image chunks. \ud835\udc37 \ud835\udc56 represents the degree of node \ud835\udc56 , and ef is a parameter that controls the length of the search queue, as described in Algorithm 1. During Best-First Search, each time a node \ud835\udc56 is selected for expansion, all of its \ud835\udc37 \ud835\udc56 neighbors must be recomputed 3 .", + "text": "1:, 1 = Input: query \ud835\udc5e , entry point \ud835\udc5d , re-ranking ratio \ud835\udc4e , result size \ud835\udc58 , search queue length \ud835\udc52\ud835\udc53. 2:, 1 = Output: \ud835\udc58 closest neighbors to \ud835\udc5e.", "metadata": {} }, { - "text": "While indiscriminate edge reduction in graph-based indexes often degrades search quality, as shown in \u00a76.4, our key insight is that selectively retaining hub nodes is sufficient to preserve performance. This strategy is motivated by the skewed node access pattern observed in Fig. 3, where highdegree nodes are accessed more frequently during search. Accordingly, we aim to preserve these high-degree nodes, which serve as the backbone of the graph's connectivity, even as we reduce the overall number of edges. To implement this idea, we introduce Algorithm 3. At a high level, our hubpreservation strategy incorporates two key modifications to the original graph construction process.\nThroughput denotes the number of chunks the embedding server can process per second. Since LEANN's performance bottleneck lies in recomputation as shown in Fig. 11, this formulation serves as a reasonable approximation of the search time. Finally, Dtype indicates the size of the data type used to store node connections in the graph, which is typically int32 (4 bytes).\n2 Here we refer to average out-degree.\n3 In the actual search trajectory, there may be slight differences between ef and the exact number of hops made by the query. For simplicity, we use \u02dd ef \ud835\udc56 = 1 | \ud835\udc37 \ud835\udc56 | as an approximation for the number of nodes requiring recomputation along the search path.\nOn the one hand, we apply differentiated degree thresholds to nodes based on their estimated importance. Specifically, we reduce the number of connections for most nodes to a lower threshold \ud835\udc5a (line 10), while allowing a small fraction (i.e., \ud835\udc4e %) of important nodes to retain a higher degree up to a threshold \ud835\udc40 (line 8). Given a storage budget \ud835\udc36 , LEANN automatically tunes the values of \ud835\udc5a and \ud835\udc40 through offline profiling across multiple datasets. To identify important nodes, we follow prior work [42, 51] and use node degree as a proxy for influence, selecting the top \ud835\udc4e % of nodes by degree (line 4). Empirically, we find that preserving only the top 2% of highdegree nodes significantly reduces the total number of edges while maintaining high retrieval accuracy.\nNote that this algorithm does not require knowledge about the query distribution. 
Hence, it can scale efficiently to large datasets, providing a simple yet effective mechanism to balance graph size and search performance.", + "text": "3:, 1 = \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u2190{ \ud835\udc5d } ; \ud835\udc34\ud835\udc44 \u2190\u2205 ; \ud835\udc38\ud835\udc44 \u2190{ \ud835\udc5d } ; \ud835\udc45 \u2190{ \ud835\udc5d }. 4:, 1 = while", "metadata": {} }, { - "text": "On the other hand, while we restrict the number of outgoing connections during node insertion, as shown in line 10, weallow all nodes to establish bidirectional edges with newly inserted nodes, up to the maximum threshold \ud835\udc40 (as shown in line 13, not \ud835\udc5a ). This design choice ensures that each node retains the opportunity to connect with high-degree hub nodes, thereby preserving the navigability of the graph with minimal impact on search quality.", + "text": "\ud835\udc38\ud835\udc44 \u2260 \u2205 do. 5:, 1 = \ud835\udc63 \u2190 extract closest element from \ud835\udc38\ud835\udc44 to \ud835\udc5e. 6:, 1 = \ud835\udc53 \u2190 get furthest element from \ud835\udc45 to \ud835\udc5e. 7:,", "metadata": {} }, { - "text": "In our evaluation, we answer the following important questions:\n- 2. How does LEANN impact downstream task accuracy, particularly when compared to low-storage alternatives such as index compression techniques and keyword-based search? (\u00a76.3)\n- 1. How does LEANN perform compared to state-of-the-art baselines in terms of latency and storage, across different search accuracy (recall) levels? (\u00a76.2)\n- 3. Howdoes each individual technique of LEANN contribute to its overall efficiency and effectiveness? (\u00a76.4)", + "text": "1 = if \ud835\udc51\ud835\udc56\ud835\udc60\ud835\udc61\ud835\udc4e\ud835\udc5b\ud835\udc50\ud835\udc52 ( \ud835\udc63,\ud835\udc5e ) > \ud835\udc51\ud835\udc56\ud835\udc60\ud835\udc61\ud835\udc4e\ud835\udc5b\ud835\udc50\ud835\udc52 ( \ud835\udc53 ,\ud835\udc5e ) then. 8:, 1 = break. 9:, 1 = for each", "metadata": {} }, { - "text": "Table 1. Summary of our dataset and index setup.\n\nDataset, Value = rpj_wiki [10]. Raw text size, Value = 76G. Chunk size, Value = 256 token. # of chunks, Value = 60 million. Embed model, Value = Contriever [27]. Embed dimension, Value = 768. Embedding size, Value = 171G. Index type, Value = FLAT. Distance metric, Value = Inner Product\nWorkloads We construct a datastore for retrieval based on the RPJ-Wiki dataset [10], a widely used corpus containing 76 GB of raw Wikipedia text. The indexing configuration is summarized in Tab. 1. Following prior work [57], we segment the text into passages of 256 tokens and generate an embedding for each chunk using Contriever [27], an unsupervised contrastive learning based dense retriever. Each embedding has a dimensionality of 768.\nFor evaluation, we adopt four standard benchmarks widely used in RAG and open-domain retrieval: NQ [31], TriviaQA [28], GPQA [48], and HotpotQA [68].\nBesides retrieval itself, we also consider the predominant downstream task of RAG. We adopt the widely deployed LLaMA model family for generation and report downstream task accuracy with the Llama-3.2-1B-Instruct model [19].\nTestbed. We evaluate our system and baselines on two hardware platforms. The first is an NVIDIA A10 server hosted on an AWS g5.48xlarge instance [4], equipped with a 96-core CPU, 2 \u00d7 3.8TB AWS NVMe SSD, and an NVIDIA A10G GPU with 24 GB of memory. 
The second is a Mac environment, provided via an AWS EC2 M1 Mac instance [3], featuring an Apple M1 Ultra processor (Arm64), macOS, and utilizes a 512GB Amazon EBS volume for its main storage.\nMetrics. We compare LEANN against alternative baselines in three main dimensions: storage, latency, and accuracy. For accuracy, we evaluate both the search (retrieval) accuracy and downstream task accuracy.", + "text": "\ud835\udc5b \u2208 neighbors( \ud835\udc63 ) do. 10:, 1 = if \ud835\udc5b \u2209 \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 then. 11:, 1 = \ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u2190", "metadata": {} }, { - "text": "To evaluate downstream task (RAG) accuracy, we use the exact match (EM) and the F1 score as metrics. EM measures the proportion of predictions that match the ground-truth answers provided by the query dataset. The F1 score captures the harmonic mean of precision and recall, typically calculated at the token level. It assigns partial credit by considering the overlap in tokens between the predicted answer and the ground-truth answer, even if they are not an exact match.\nTo evaluate retrieval accuracy, we report Recall@k as defined in \u00a72. In open-domain settings, ground-truth labels for retrieved passages are typically unavailable. Following standard practice [29, 54, 75], we use the results from exact search as a proxy for ground truth. In our experiments, we set \ud835\udc58 = 3 following prior work standard setup [1, 57], and report Recall@3. The exact search is implemented with faiss.IndexFlatIP over our datastore for each query set.\nFor the retrieval latency evaluation, we measure the time required to reach different target recall levels. Specifically, we perform a binary search to identify the minimal search queue length \ud835\udc52\ud835\udc53 (as defined in Algorithm 1) that meets the\nFigure 4. [Main Result]: Latency-storage trade-offs in RAG applications across four datasets and two hardware configurations. The y-axis shows the storage overhead, defined as the size of the ANN index relative to the raw data size (as detailed in Tab. 1). We vary the target recall to evaluate latency under different retrieval accuracy levels. Since recall is not applicable to BM25, it appears as a single data point in each figure. Additionally, we omit the PQ-compressed method, as it fails to reach the target recall threshold despite being a vector-based approach. As shown in Fig. 5, both BM25 and PQ result in poor downstream accuracy.\n- \u00b7 IVF(in-memory) : The Inverted File (IVF) index is a widely used cluster-based vector index. We adopt the faiss.IndexIVFFlat implementation. Following best practices from Faiss [52] and prior work [25], we set the number of centroids to \u221a \ud835\udc41 , where \ud835\udc41 is the size of the datastore. In our setup, we use a 60 \ud835\udc40 datastore, which corresponds to \ud835\udc5b\ud835\udc59\ud835\udc56\ud835\udc60\ud835\udc61 = 8192.", + "text": "\ud835\udc63\ud835\udc56\ud835\udc60\ud835\udc56\ud835\udc61\ud835\udc52\ud835\udc51 \u222a { \ud835\udc5b }. 12:, 1 = Calculate approximate distance \ud835\udc51 \ud835\udc4e\ud835\udc5d\ud835\udc5d\ud835\udc5f\ud835\udc5c\ud835\udc65 ( \ud835\udc5b,\ud835\udc5e ). 13:, 1 = \ud835\udc34\ud835\udc44 \u2190", "metadata": {} }, { - "text": "Figure 5. 
[Main Result]: Comparison of Exact Match and F1 scores for downstream RAG tasks across three methods: keyword search (BM25), PQ-compressed vector search, and our proposed vector search system. Our method is configured to achieve a target recall of 90%, while the PQ baseline is given extended search time to reach its highest possible recall. Here we use Llama-3.2-1B as the generation model.\n- \u00b7 DiskANN [59]: DiskANN is a graph-based vector search system optimized for memory efficiency. It keeps only a PQ table in memory and loads full embeddings from disk on demand. We configure it with \ud835\udc40 = 60 and \ud835\udc52\ud835\udc53 \ud835\udc36\ud835\udc5c\ud835\udc5b\ud835\udc60\ud835\udc61\ud835\udc5f\ud835\udc62\ud835\udc50\ud835\udc61\ud835\udc56\ud835\udc5c\ud835\udc5b = 128, following recommended settings [59].\ntarget recall, and report the average latency of 20 queries using the resulting \ud835\udc52\ud835\udc53 value.\nBaselines We compare LEANN against the following baseline methods and systems:\n- \u00b7 IVF-based recomputation : We adopt the idea of IVFbased recomputation from Edge-RAG [55], where we use online recomputation to avoid storing the full set of embeddings, while using the same construction parameters as IVF (in-memory).\n- \u00b7 IVF-Disk : IVF-Disk reduces memory usage by employing memory-mapped files ( mmap ) instead of loading the entire index into memory. We implement it using Faiss's faiss.contrib.ondisk module and adopt the same configuration as in IVF (in-memory).\n- \u00b7 PQ Compression [29]: We apply PQ to compress embeddings to match our storage footprint while preserving the graph structure.\n- \u00b7 BM25 [13, 49]: A classical lexical ranking algorithm widely used in keyword-based search.\n- \u00b7 HNSW (in-memory) [38]: HNSW is a widely-used stateof-the-art vector index [2, 47]. We use the faiss.IndexHNSWFlat implementation with construction parameters recommended by Faiss: \ud835\udc40 = 30 and \ud835\udc52\ud835\udc53 \ud835\udc36\ud835\udc5c\ud835\udc5b\ud835\udc60\ud835\udc61\ud835\udc5f\ud835\udc62\ud835\udc50\ud835\udc61\ud835\udc56\ud835\udc5c\ud835\udc5b = 128, distinct from the search-time parameter \ud835\udc52\ud835\udc53 .", + "text": "\ud835\udc34\ud835\udc44 \u222a { \ud835\udc5b }. 14:, 1 = \ud835\udc40 \u2190 extract top \ud835\udc4e % from \ud835\udc34\ud835\udc44 that are not in \ud835\udc38\ud835\udc44. 15:, 1 = for each \ud835\udc5a \u2208 \ud835\udc40", "metadata": {} }, { - "text": "Fig. 4 presents the storage consumption and end-to-end RAG query latency across all baseline systems and LEANN. The results show that LEANN is the only system that reduces storage to less than 5% of the original raw text size while maintaining reasonable latency, which we discussed in \u00a72.3, such as achieving 90% recall on GPQA in under 2 seconds.\nFor latency evaluation, we measure per-query latency under different target recall levels across all combinations of query datasets and hardware platforms. For BM25, we report a single number for its latency value using the default keyword search configuration. Unlike embedding-based search methods, BM25 is a lexical search technique and does not operate over dense embeddings. As a result, recall is not applicable for evaluating its effectiveness because it is defined based on approximate nearest neighbor retrieval. We omit results for HNSW and IVF on the Mac platform, as both methods require loading the full dense embedding matrix into memory, which leads to out-of-memory (OOM) errors. 
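The back-of-the-envelope arithmetic from the Tab. 1 setup already makes this clear:

```python
n_chunks, dim, bytes_per_float = 60_000_000, 768, 4   # Tab. 1: 60M chunks, 768-dim float32
embedding_bytes = n_chunks * dim * bytes_per_float     # 184,320,000,000 bytes
print(embedding_bytes / 2**30)                         # ~171.7 GiB before any graph metadata
```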
Specifically, the Mac system has 128GB of RAM, while the index size exceeds 171GB, as shown in Tab. 1. We also exclude the PQ-compressed baseline, as it fails to achieve the target recall even with an arbitrarily long search time.", + "text": "do. 16:, 1 = Compute exact distance \ud835\udc51 \ud835\udc52\ud835\udc65\ud835\udc4e\ud835\udc50\ud835\udc61 ( \ud835\udc5a,\ud835\udc5e ). 17:, 1 = \ud835\udc38\ud835\udc44 \u2190 \ud835\udc38\ud835\udc44 \u222a { \ud835\udc5a } ; \ud835\udc45 \u2190", "metadata": {} }, { - "text": "We report storage consumption as a proportion of the raw text size (76 GB), referred to as proportional size in Fig. 4. Since all methods operate on the same fixed datastore based on the RPJ-Wiki dataset, their storage consumption remains constant across hardware platforms and query datasets. The figure shows that HNSW stores all dense embeddings along with the graph structure, leading to substantial storage overhead. DiskANN incurs even higher overhead due to its sectoraligned design. Each node's data, including its embedding (768 \u00d7 4 bytes) and edge list (60 neighbors, 60 \u00d7 4 bytes), is padded to a 4 KB SSD sector, resulting in the largest storage footprint among all methods. IVF and IVF-Disk exhibit similar storage overheads, both dominated by the embedding file. The additional metadata required by IVF (e.g., centroids) is relatively small, typically amounting to only about 1 / \u221a \ud835\udc41 of the total embedding size, and thus contributes little overhead. For BM25, storage is determined by the vocabulary size and the associated posting lists (i.e., the frequency of each token). In our setting, the size of the BM25 index is comparable to that of the original corpus. LEANN stores only a compact graph structure, resulting in less than 5% additional storage. Among the baselines, IVF-based recomputation achieves the lowest storage footprint, as it only stores the IVF centroids on disk, which adds little overhead.\nFig. 4 shows that LEANN consistently outperforms EdgeRAG, an IVF-based recomputation method, achieving significantly lower latency, ranging from 21 . 17 \u00d7 to 200 . 60 \u00d7 , across all the datasets and hardware platforms. This advantage is partly due to the asymptotic difference in recomputation complexity: the number of recomputed chunks in LEANN\n\u221a", + "text": "\ud835\udc45 \u222a { \ud835\udc5a }. 18:, 1 = if | \ud835\udc45 | > \ud835\udc52\ud835\udc53 then. 19:, 1 = Remove furthest element from \ud835\udc45 to \ud835\udc5e. 20:, 1 = return top", "metadata": {} }, { - "text": "grows polylogarithmically with \ud835\udc41 , while it grows as \ud835\udc41 in Edge-RAG[65]. Graph-based baselines such as HNSW and DiskANN represent upper bounds on latency performance, as they store all embeddings in RAM or on disk. While LEANN trades some latency for substantial storage savings, its performance remains well within an acceptable range. This latency degradation is acceptable for two main reasons as we discussed in \u00a72.3: (1) second-level latency is acceptable for large-scale local document or image retrieval tasks, and (2) many downstream tasks on local devices, such as image or text generation, typically take over tens of seconds to complete [11, 34], making the additional latency introduced by LEANN reasonable in practice. Comparing across hardware platforms, A10 achieves a 2 . 28 \u00d7 to 3 . 
01 \u00d7 speedup over the Mac, which aligns with their theoretical TFLOPS specifications [12, 43].", + "text": "\ud835\udc58 elements from \ud835\udc45", "metadata": {} }, { - "text": "We evaluate downstream task accuracy across four query datasets, as shown in Fig. 5. For all methods, we retrieve the top-3 most relevant documents. Our method is configured to achieve a target recall of 90%, while BM25 operates with its default keyword matching configuration. Although the PQ-compressed method fails to meet the target recall defined in \u00a76.2, it still achieves approximately 20% recall across all datasets. We include its downstream performance using these lower-quality retrieved results.\nFinally, we note that when a target recall level (e.g., 90%) is enforced, the downstream accuracy of our method aligns with that of other lossless ANN approaches, confirming that our system does not sacrifice accuracy for storage efficiency.\nAs illustrated in Fig. 5, our method consistently achieves higher downstream accuracy across all datasets except GPQA. Our ANN method shows limited gains on GPQA due to a distributional mismatch: the RPJ-Wiki datastore is somewhat out-of-distribution for GPQA, which consists of graduatelevel questions that are poorly supported by the retrieved content from Wikipedia. The accuracy improvement on HotpotQA is also more modest compared to the first two datasets, as HotpotQA requires multi-hop reasoning, while our current setup performs only single-hop retrieval, limiting its effectiveness for this task.", + "text": "Because \ud835\udc34\ud835\udc44 globally tracks all previously encountered nodes, the algorithm can revisit earlier neighbors that become more promising as the search progresses. As a result, even when all immediate neighbors in the current iteration are far from the query, the algorithm can still select previously seen but unexplored", "metadata": {} }, { - "text": "We conduct comprehensive and detailed ablation studies to analyze the impacts of each methodology we use in LEANN.\nAblation study on latency optimization technique. To evaluate LEANN's latency optimization techniques, we incrementally enable the components introduced in \u00a74, using a fixed target recall across multiple datasets. We begin with a naive graph-based recomputation baseline. Incorporating\nFigure 6. [Ablation Study]: Speedup achieved by different optimization techniques described in \u00a74 when evaluated on four datasets to reach the same recall level on the A10 GPU. Two-level refers to the optimization in \u00a74.1, while Batch corresponds to \u00a74.2.\nFigure 7. [Ablation Study]: Comparison of pruned graph quality against two heuristic methods and the upper bound using the datastore in Tab. 1. We vary the target recall and measure the number of nodes each method needs to recompute. The dashed gray line represents the original HNSW graph, which serves as the upper bound, with twice the storage (i.e., average degree) of the others.\nthe two-level hybrid distance computation strategy from \u00a74.1 yields an average speedup of 1 . 40 \u00d7 , reaching up to 1 . 64 \u00d7 , by reducing the number of nodes requiring recomputation and enabling lightweight distance estimation without querying the embedding server. Adding the dynamic batching technique further improves GPU utilization during recomputation, increasing the overall speedup to 1 . 76 \u00d7 , with a maximum of 2 . 02 \u00d7 . 
Among all datasets, HotpotQA benefits the most from batching, as its longer search queue required to achieve the target recall allows more effective grouping of multi hop requests.", + "text": "nodes that are now ranked higher.", "metadata": {} }, { - "text": "Wecompare our graph pruning algorithm with two heuristic baselines and evaluate graph quality by measuring the number of embeddings that must be fetched to achieve a given recall target, as shown in Fig. 7. In LEANN, the end-to-end latency scales linearly with the number of embeddings that\nFigure 8. [Ablation Study]: Comparison of (out-)degree distributions between the original graph, our pruning method, and two heuristic baselines. Similar to Fig. 7, the gray curve represents the original HNSW graph, which has twice the size of the others. Only our pruning method successfully preserves the high degree nodes.\nrequire recomputation, making this metric a strong proxy for retrieval latency.\nThe original graph, constructed on the datastore described in Tab. 1, has an average degree of 18. All three pruning methods, ours and the two baselines, are applied to reduce the total number of edges by half, thereby halving the graph's storage overhead.\nThe two heuristic baselines are as follows: (1) Random Prune , which randomly removes 50% of the existing edges from the original graph; and (2) Small M , which directly constrains the maximum out-degree during graph construction, resulting in an average degree that is half that of the original graph.\nWe evaluate the performance of different graph structures on the NQ dataset by varying the search queue length \ud835\udc52\ud835\udc53 , aiming to determine the minimum number of embeddings that must be fetched to achieve various recall targets. As shown in Fig. 7, our pruning method introduced in \u00a75 achieves performance comparable to the original unpruned graph, despite using only half the edges. It outperforms the Random Prune baseline by up to 1 . 18 \u00d7 and the Small M baseline by up to 5 . 76 \u00d7 . We omit the Small M data points at 94% and 96% recall targets due to their poor performance.", + "text": "The core insight of this design is to combine the complementary strengths of approximate and exact distance computations. Approximate distances, though not fully accurate, are often sufficient to surface the most relevant candidates near the top, enabling early pruning of unpromising directions. We exploit this by using approximate distances", "metadata": {} }, { - "text": "Degree Distribution in Pruned Graphs. To better understand the effectiveness of our pruning strategy, we analyze the out-degree distributions of the original graph, our approach, Random Prune, and Small M. As discussed in \u00a75, our design explicitly aims to preserve high-degree 'hub' nodes. As shown in Fig. 8, it successfully retains a substantial number of such nodes, whereas the other two baselines fail to do so. This underscores the critical role of hub nodes in supporting efficient graph-based vector search, a finding that aligns with insights from prior work [39, 42, 51].\nFigure 9. [Ablation Study]: Latency on the A10 GPU and accuracy of a smaller embedding model evaluated on a 2Mchunk datastore, using a fixed search queue length of ef=50 . The smaller embedding model significantly reduces latency without causing a substantial drop in downstream accuracy.\nUsing different embedding model sizes. Since the primary bottleneck of our system lies in the recomputation process, as shown in Fig. 
11 later, we further explore the potential for latency reduction by adopting a smaller embedding model. Specifically, we replace the original contriever model (110M parameters) used in \u00a76.2 with the lightweight GTE-small model [36], which has only 34M parameters. We evaluate performance on a smaller 2M document datastore using a fixed search queue length of ef=50 , as shown in Fig. 9. The results show that GTE-small achieves a 2 . 3 \u00d7 speedup while maintaining downstream task accuracy within 2% of the Contriever baseline. This demonstrates the potential of LEANN to further reduce search latency by leveraging a lightweight embedding model.", + "text": "to evaluate neighbors during traversal, and exact distances to re-rank only the most promising candidates. This approach achieves high recall while substantially reducing computational cost, thereby lowering overall latency.", "metadata": {} }, { - "text": "Relaxing disk constraint. As discussed in \u00a73, when disk storage constraints are relaxed, LEANN can materialize the embeddings of high-degree nodes to reduce recomputation overhead. This effectively builds an on-disk embedding cache, reducing the number of nodes that need to be recomputed at query time. For instance, storing just 10% of the original embeddings yields a 1 . 47 \u00d7 speedup, with a cache hit rate of up to 41.9%. This high cache hit rate arises from the skewed access pattern characteristic of graph-based traversal. However, the observed speedup does not fully align with the hit rate due to the non-negligible loading overhead introduced by SSDs with limited bandwidth.\nGraph-based recomputation breakdown. Fig. 11 breaks down the time cost of a single batch in graph-based recomputation into three stages, categorized by the primary system resource used. Each batch aggregates multiple hops of recomputation, as described in \u00a74.2. First, LEANN performs PQ lookups to select promising nodes, then retrieves and tokenizes the corresponding raw text. The tokenized inputs are sent to the embedding server. Finally, LEANN performs embedding recomputation and distance calculation.\nFigure 10. [Ablation Study]: Latency and cache hit rate comparison under varying storage constraints across four datasets. The x-axis indicates total storage size (graph size + cached embeddings on disk) and the corresponding percentage of cached embeddings.\nFigure 11. [Ablation Study]: Latency breakdown of a batch of requests in graph-based recomputation.\nAlthough embedding recomputation is the primary bottleneck in LEANN, accounting for 76% of total latency, the three stages-spanning I/O, CPU, and GPU resources-can potentially be overlapped to improve overall efficiency. We leave this optimization for future work.", + "text": "At the end of each iteration, nodes in \ud835\udc40 with computed exact distances are inserted into \ud835\udc38\ud835\udc44 , which serves as the candidate pool for subsequent expansions. We repeat this process iteratively, and in each iteration, the number of nodes requiring recomputation is further reduced.", "metadata": {} }, { - "text": "General Vector Search. Vector search primarily follows two paradigms: IVF [33] and proximity graphs [38]. IVF clusters vectors and probes relevant subsets during search, while graph-based methods such as HNSW [38], NSG [21], Vamana [59], and others [8, 20, 41] connect similar vectors to enable efficient traversal. 
Graph-based approaches are widely regarded as state of the art due to their favorable trade-offs between accuracy and efficiency [65]. Prior work has explored reducing graph size through learned neighbor selection [5, 73], but these methods are often impractical due to the high training cost and the need for labeled data.\nResource-Constrained Vector Search. Numerous efforts have aimed to reduce the memory footprint of vector search. Disk-based approaches such as DiskANN [59] store both vectors and graph structures on disk, leveraging in-memory compressed embeddings for navigation. Starling [64] improves I/O efficiency for disk-resident graphs, while FusionANNS [61] enables cost-effective search through coordinated use of SSD, CPU, and GPU resources. AiSAQ [60], LM-DiskANN [46] further minimizes DRAM usage by storing compressed embeddings directly on disk. EdgeRAG [55] alleviates memory pressure by generating embeddings online using an IVF-based index. However, it still incurs substantial storage overhead due to the need to maintain large clusters on disk as dictated by its design, and its performance degrades at scale owing to the high recomputation cost introduced by an inefficient index structure. An alternative approach is embedding compression, such as PQ [29], or more recent methods like RabitQ [23], which offers quantization with theoretical error bounds. Yet, these methods struggle to maintain high search accuracy under tight storage budgets. In contrast, LEANN integrates on-the-fly embedding recomputation with a graph-based index, incorporating highdegree preserving graph pruning and a specialized traversal algorithm optimized for edge devices.", + "text": "For efficient approximate distance calculation, we employ PQ, a widely used technique that compresses the embedding space by several orders of magnitude. In our setting, we use only 2GB of PQ-compressed embeddings to represent the original 200GB of full-precision data, resulting", "metadata": {} }, { - "text": "Vector Search Applications on Edge Devices. On-device vector search enables privacy-preserving, low-latency, and offline capabilities across diverse applications. On-device RAGsystems ground language models in personal document collections while maintaining data privacy [32, 53, 66, 72]. Personalized recommendation systems [69] match user profiles with item embeddings directly on the device, while content-based search over large collections of locally stored images and videos employs efficient vision embedding models [50] to generate vector representations for fast retrieval. These applications motivate the design of LEANN to enable efficient, low-overhead vector search on edge devices.", + "text": "in minimal storage overhead. Although PQ introduces some accuracy loss, our framework compensates by applying exact computations to a small subset of high-ranking candidates, thereby preserving end-to-end search quality.", + "metadata": {} + }, + { + "text": "Finally, our method is flexible and generalizable. It can incorporate alternative lightweight approximation techniques beyond quantization. For instance, distillation-based embeddings or link-and-code representations [18] can be used, provided they offer sufficient efficiency. 
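To make this pluggability concrete, here is a minimal sketch of one expansion step of the two-level search (Algorithm 2, lines 9-17), with the approximate scorer passed in as a parameter; all names are illustrative rather than LEANN's actual API:

```python
import heapq, math

def expand_two_level(v, graph, query, visited, AQ, EQ, R, a, approx_dist, exact_dist):
    """One expansion step. AQ is a min-heap of (approx distance, node) over all nodes
    seen so far; EQ and R map node -> exact distance. `approx_dist` can be PQ,
    a distilled encoder, or any other cheap estimate."""
    for n in graph[v]:                                   # lines 9-13: cheap pass over neighbors
        if n not in visited:
            visited.add(n)
            heapq.heappush(AQ, (approx_dist(query, n), n))

    top = math.ceil(len(AQ) * a / 100)                   # line 14: re-rank only the top a%
    M = [n for _, n in heapq.nsmallest(top, AQ) if n not in EQ]

    for m in M:                                          # lines 15-17: exact pass on M only
        d = exact_dist(query, m)                         # this is what triggers recomputation
        EQ[m] = d
        R[m] = d
    return M
```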
This adaptability makes the Two-Level Search paradigm applicable", + "metadata": {} + }, + { + "text": "across diverse computational budgets and deployment scenarios.", + "metadata": {} + }, + { + "text": "During the search process, GPU resources are often underutilized because each expansion step only triggers recomputation for a small number of nodes, typically equal to the degree of the current node \ud835\udc63 . This problem is further exacerbated when using the Two Level Search algorithm", + "metadata": {} + }, + { + "text": "(see line 16), where the candidate set is even more selective, resulting in smaller batch sizes. As a result, LEANN frequently fails to meet the minimum batch size required to saturate GPU throughput, leading to inefficient use of hardware resources at", + "metadata": {} + }, + { + "text": "runtime.", + "metadata": {} + }, + { + "text": "To address this, LEANN introduces a dynamic batching strategy that slightly relaxes the strict data dependency in best-first search in Algorithm 1. While this introduces minor staleness in the expansion order, it significantly increases the batch size for the embedding model, thereby reducing", + "metadata": {} + }, + { + "text": "the end-to-end latency per query.\nThis leads to a key challenge: how can we design an algorithm that fully utilizes GPU compute capacity and takes advantage of batch processing [15, 76] without sacrificing search efficiency?", + "metadata": {} + }, + { + "text": "Specifically, LEANN breaks the strict data dependency in best-first search, where the current node to be expanded depends on the immediate results of the previous expansion, by dynamically collecting a group of the closest candidates from the priority queue. The algorithm accumulates neighbors,", + "metadata": {} + }, + { + "text": "that is, nodes requiring recomputation, until a target batch size is reached (for example, 64 for the A10 GPU), which can be efficiently determined through lightweight offline profiling. This dynamic batching mechanism integrates naturally with the Two-Level Search described in", + "metadata": {} + }, + { + "text": "\u00a74.1. We accumulate nodes in the set \ud835\udc40 across iterations until the predefined batch size threshold is reached, at which point we perform embedding recomputation for all nodes in \ud835\udc40 .", + "metadata": {} + }, + { + "text": "This idea shares a similar insight with the beam search strategy used in DiskANN [59], where a fixed number of round-trip node accesses are batched together to amortize disk access latency. However, unlike DiskANN's fixed beam width, LEANN", + "metadata": {} + }, + { + "text": "uses dynamic batching based on the degrees of current candidates, reducing staleness and offering greater flexibility for our setting. Furthermore, while DiskANN aims to reduce I/O latency, our dynamic batching strategy focuses on maximizing GPU utilization. 
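A rough sketch of the accumulation loop, assuming a profiled batch size of 64 as in the A10 example (helper names are hypothetical):

```python
def accumulate_and_recompute(pending, new_nodes, embed_batch, batch_size=64):
    """Defer recomputation until enough nodes have piled up to saturate the GPU."""
    pending.extend(new_nodes)             # nodes gathered across one or more expansions
    if len(pending) < batch_size:
        return {}                         # keep traversing; tolerate slight staleness
    batch = list(pending)
    pending.clear()                       # flush everything accumulated so far
    return dict(zip(batch, embed_batch(batch)))   # node -> freshly recomputed embedding
```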
As a result, LEANN adopts a", + "metadata": {} + }, + { + "text": "- 1: Input: Original graph \ud835\udc3a with the set of vertices \ud835\udc49 , candidate list size \ud835\udc52\ud835\udc53 , connection number threshold \ud835\udc40 for high degree nodes and \ud835\udc5a for other nodes, where \ud835\udc5a <", + "metadata": {} + }, + { + "text": "\ud835\udc40 , percentage of high degree nodes \ud835\udc4e\n- 3: \u2200 \ud835\udc63 \u2208 \ud835\udc49 : \ud835\udc37 [ \ud835\udc63 ] \u2190 degree of \ud835\udc63 of \ud835\udc3a , \ud835\udc3a 1 \u2190 empty graph", + "metadata": {} + }, + { + "text": "- 2: Output: Pruned graph \ud835\udc3a 1\n- 4: \ud835\udc49 \u2217 \u2190 nodes with the top \ud835\udc4e % highest (out) degree in \ud835\udc37", + "metadata": {} + }, + { + "text": "- 5: for \ud835\udc63 \u2208 \ud835\udc49 do\n- 6: \ud835\udc4a \u2190 search( \ud835\udc63 , \ud835\udc52\ud835\udc53 )\n- \u22b2 Refer to Algorithm 1", + "metadata": {} + }, + { + "text": "- 7: if \ud835\udc63 \u2208 \ud835\udc49 \u2217 then\n- 8: \ud835\udc40 0 \u2190 \ud835\udc40\n- 9: else\n- 10: \ud835\udc40 0 \u2190 \ud835\udc5a", + "metadata": {} + }, + { + "text": "- 11: Select \ud835\udc40 0 neighbors from \ud835\udc4a using original heuristic 12: Add bidirectional edges between \ud835\udc63 and neighbors to\n\ud835\udc3a\n- 13:\n1", + "metadata": {} + }, + { + "text": "Shrink edges if \u2203 \ud835\udc5e \u2208 neighbor and \ud835\udc37 \ud835\udc5c\ud835\udc62\ud835\udc61 ( \ud835\udc5e ) > \ud835\udc40\ndifferent optimization objective: rather than minimizing disk access, it prioritizes efficient GPU usage to reduce end-to-end latency.", + "metadata": {} + }, + { + "text": "With the Two-Level Search and dynamic batching mechanisms in place to optimize recomputation latency, we now examine how LEANN reduces the storage costs associated with graph metadata through a high degree preserving graph pruning algorithm.", + "metadata": {} + }, + { + "text": "In datacenter environments, this overhead is typically acceptable: storage is relatively inexpensive, and the operational costs of index maintenance (e.g., updates, rebuilds, and monitoring) are manageable. In contrast, consumer devices are often storage-constrained, making even the metadata footprint of the index", + "metadata": {} + }, + { + "text": "structure a significant concern.", + "metadata": {} + }, + { + "text": "As discussed in \u00a73, while LEANN avoids storing exact embeddings by recomputing them at query time, the graph metadata used to guide the search process can still introduce substantial overhead. For example, in the datastore described by [56], the index structure alone accounts for over", + "metadata": {} + }, + { + "text": "30% of the total storage footprint.", + "metadata": {} + }, + { + "text": "To address this, LEANN allows users to specify a disk usage constraint \ud835\udc36 . When the metadata size exceeds this threshold, LEANN invokes a graph pruning algorithm that reduces the number of edges while preserving high-degree nodes. This design maintains retrieval accuracy and avoids significant increases in query-time", + "metadata": {} + }, + { + "text": "latency, while substantially reducing the metadata footprint.\nThe graph, stored in a compressed sparse row (CSR) format, for example, consumes space proportional to the total\nFigure 3. Node access probability per query", + "metadata": {} + }, + { + "text": "number of edges, i.e., the number of nodes times their average degree 2 . 
Since each node corresponds to a fixed chunk of text, the number of nodes is fixed given the text. The key challenge, then, is to reduce the average node degree without significantly compromising search", + "metadata": {} + }, + { + "text": "latency. We formalize this optimization problem as follows: given a storage budget, construct a graph that maximizes search efficiency.\n", + "metadata": {} + }, + { + "text": "Here, \ud835\udc5b denotes the number of nodes in the graph, corresponding to the number of text or image chunks. \ud835\udc37 \ud835\udc56 represents the degree of node \ud835\udc56 , and ef is a parameter that controls the length of the search queue, as described in Algorithm", + "metadata": {} + }, + { + "text": "1. During Best-First Search, each time a node \ud835\udc56 is selected for expansion, all of its \ud835\udc37 \ud835\udc56 neighbors must be recomputed 3 .", + "metadata": {} + }, + { + "text": "While indiscriminate edge reduction in graph-based indexes often degrades search quality, as shown in \u00a76.4, our key insight is that selectively retaining hub nodes is sufficient to preserve performance. This strategy is motivated by the skewed node access pattern observed in Fig. 3, where", + "metadata": {} + }, + { + "text": "highdegree nodes are accessed more frequently during search. Accordingly, we aim to preserve these high-degree nodes, which serve as the backbone of the graph's connectivity, even as we reduce the overall number of edges. To implement this idea, we introduce Algorithm 3. At a high", + "metadata": {} + }, + { + "text": "level, our hubpreservation strategy incorporates two key modifications to the original graph construction process.", + "metadata": {} + }, + { + "text": "Throughput denotes the number of chunks the embedding server can process per second. Since LEANN's performance bottleneck lies in recomputation as shown in Fig. 11, this formulation serves as a reasonable approximation of the search time. Finally, Dtype indicates the size of the data type used", + "metadata": {} + }, + { + "text": "to store node connections in the graph, which is typically int32 (4 bytes).\n2 Here we refer to average out-degree.", + "metadata": {} + }, + { + "text": "3 In the actual search trajectory, there may be slight differences between ef and the exact number of hops made by the query. For simplicity, we use \u02dd ef \ud835\udc56 = 1 | \ud835\udc37 \ud835\udc56 | as an approximation for the number of nodes requiring", + "metadata": {} + }, + { + "text": "recomputation along the search path.", + "metadata": {} + }, + { + "text": "On the one hand, we apply differentiated degree thresholds to nodes based on their estimated importance. Specifically, we reduce the number of connections for most nodes to a lower threshold \ud835\udc5a (line 10), while allowing a small fraction (i.e., \ud835\udc4e %) of", + "metadata": {} + }, + { + "text": "important nodes to retain a higher degree up to a threshold \ud835\udc40 (line 8). Given a storage budget \ud835\udc36 , LEANN automatically tunes the values of \ud835\udc5a and \ud835\udc40 through offline profiling across multiple datasets. To identify important nodes, we follow prior work", + "metadata": {} + }, + { + "text": "[42, 51] and use node degree as a proxy for influence, selecting the top \ud835\udc4e % of nodes by degree (line 4). 
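A minimal sketch of the differentiated-threshold idea follows. It only captures the hub selection and the two degree caps; the full Algorithm 3 also re-selects neighbors with the original construction heuristic and adds bidirectional edges up to M, which this sketch omits. `adj`, `a_percent`, `M`, and `m` are illustrative placeholders.

```python
import numpy as np

def prune_preserving_hubs(adj, a_percent=2.0, M=30, m=8):
    """Keep more out-edges for the top a% highest-degree 'hub' nodes.

    adj: dict node -> list of neighbors, assumed ordered best-first by the
         construction heuristic. Returns a pruned adjacency dict.
    """
    degrees = {v: len(nbrs) for v, nbrs in adj.items()}
    cutoff = np.percentile(list(degrees.values()), 100 - a_percent)
    hubs = {v for v, d in degrees.items() if d >= cutoff}

    pruned = {}
    for v, nbrs in adj.items():
        limit = M if v in hubs else m          # hubs keep up to M edges, others up to m
        pruned[v] = list(nbrs[:limit])         # retain the highest-priority edges
    return pruned
```

Given a storage budget, `m` and `M` would be tuned (offline, as the text describes) so that the resulting average degree fits under the budget.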
Empirically, we find that preserving only the top 2% of highdegree nodes significantly reduces the total number", + "metadata": {} + }, + { + "text": "of edges while maintaining high retrieval accuracy.\nNote that this algorithm does not require knowledge about the query distribution. Hence, it can scale efficiently to large datasets, providing a simple yet effective mechanism to balance graph size and search performance.", + "metadata": {} + }, + { + "text": "On the other hand, while we restrict the number of outgoing connections during node insertion, as shown in line 10, weallow all nodes to establish bidirectional edges with newly inserted nodes, up to the maximum threshold \ud835\udc40 (as shown in line 13, not", + "metadata": {} + }, + { + "text": "\ud835\udc5a ). This design choice ensures that each node retains the opportunity to connect with high-degree hub nodes, thereby preserving the navigability of the graph with minimal impact on search quality.", + "metadata": {} + }, + { + "text": "In our evaluation, we answer the following important questions:\n- 2. How does LEANN impact downstream task accuracy, particularly when compared to low-storage alternatives such as index compression techniques and keyword-based search? (\u00a76.3)", + "metadata": {} + }, + { + "text": "- 1. How does LEANN perform compared to state-of-the-art baselines in terms of latency and storage, across different search accuracy (recall) levels? (\u00a76.2)", + "metadata": {} + }, + { + "text": "- 3. Howdoes each individual technique of LEANN contribute to its overall efficiency and effectiveness? (\u00a76.4)", + "metadata": {} + }, + { + "text": "\nTable 1. Summary of our dataset and index setup.", + "metadata": {} + }, + { + "text": "Dataset, Value = rpj_wiki [10]. Raw text size, Value = 76G. Chunk size, Value = 256 token. # of chunks, Value = 60 million. Embed model, Value = Contriever [27]. Embed", + "metadata": {} + }, + { + "text": "dimension, Value = 768. Embedding size, Value = 171G. Index type, Value = FLAT. Distance metric, Value = Inner Product", + "metadata": {} + }, + { + "text": "Workloads We construct a datastore for retrieval based on the RPJ-Wiki dataset [10], a widely used corpus containing 76 GB of raw Wikipedia text. The indexing configuration is summarized in Tab. 1. Following prior work [57], we segment the text into passages of", + "metadata": {} + }, + { + "text": "256 tokens and generate an embedding for each chunk using Contriever [27], an unsupervised contrastive learning based dense retriever. Each embedding has a dimensionality of 768.", + "metadata": {} + }, + { + "text": "For evaluation, we adopt four standard benchmarks widely used in RAG and open-domain retrieval: NQ [31], TriviaQA [28], GPQA [48], and HotpotQA [68].", + "metadata": {} + }, + { + "text": "Besides retrieval itself, we also consider the predominant downstream task of RAG. We adopt the widely deployed LLaMA model family for generation and report downstream task accuracy with the Llama-3.2-1B-Instruct model [19].", + "metadata": {} + }, + { + "text": "Testbed. We evaluate our system and baselines on two hardware platforms. The first is an NVIDIA A10 server hosted on an AWS g5.48xlarge instance [4], equipped with a 96-core CPU, 2 \u00d7 3.8TB AWS NVMe", + "metadata": {} + }, + { + "text": "SSD, and an NVIDIA A10G GPU with 24 GB of memory. 
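The sizes in Tab. 1 can be sanity-checked with quick arithmetic, assuming fp32 embeddings and int32 neighbor ids; the average degree used for the graph estimate is the value the text reports for the original graph and is only illustrative here.

```python
# Back-of-the-envelope check of the index sizes implied by Tab. 1.
n_chunks = 60_000_000          # number of 256-token chunks
dim = 768                      # Contriever embedding dimension
emb_bytes = n_chunks * dim * 4
print(f"flat embeddings: {emb_bytes / 2**30:.1f} GiB")    # ~171.7 GiB, matching Tab. 1

avg_degree = 18                # reported average degree of the unpruned graph
graph_bytes = n_chunks * avg_degree * 4
print(f"graph adjacency: {graph_bytes / 2**30:.1f} GiB")  # ~4 GiB, a few % of the 76 GB corpus
```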
The second is a Mac environment, provided via an AWS EC2 M1 Mac instance [3], featuring an Apple M1 Ultra processor (Arm64), macOS, and utilizes a", + "metadata": {} + }, + { + "text": "512GB Amazon EBS volume for its main storage.\nMetrics. We compare LEANN against alternative baselines in three main dimensions: storage, latency, and accuracy. For accuracy, we evaluate both the search (retrieval) accuracy and downstream task accuracy.", + "metadata": {} + }, + { + "text": "To evaluate downstream task (RAG) accuracy, we use the exact match (EM) and the F1 score as metrics. EM measures the proportion of predictions that match the ground-truth answers provided by the query dataset. The F1 score captures the harmonic mean of precision and recall, typically", + "metadata": {} + }, + { + "text": "calculated at the token level. It assigns partial credit by considering the overlap in tokens between the predicted answer and the ground-truth answer, even if they are not an exact match.", + "metadata": {} + }, + { + "text": "To evaluate retrieval accuracy, we report Recall@k as defined in \u00a72. In open-domain settings, ground-truth labels for retrieved passages are typically unavailable. Following standard practice [29, 54, 75], we use the results from exact search as a proxy for ground", + "metadata": {} + }, + { + "text": "truth. In our experiments, we set \ud835\udc58 = 3 following prior work standard setup [1, 57], and report Recall@3. The exact search is implemented with faiss.IndexFlatIP over our datastore for each query set.", + "metadata": {} + }, + { + "text": "For the retrieval latency evaluation, we measure the time required to reach different target recall levels. Specifically, we perform a binary search to identify the minimal search queue length \ud835\udc52\ud835\udc53 (as defined in Algorithm 1) that meets the", + "metadata": {} + }, + { + "text": "Figure 4. [Main Result]: Latency-storage trade-offs in RAG applications across four datasets and two hardware configurations. The y-axis shows the storage overhead, defined as the size of the ANN index relative to the raw data size (as detailed in Tab. 1). We vary the", + "metadata": {} + }, + { + "text": "target recall to evaluate latency under different retrieval accuracy levels. Since recall is not applicable to BM25, it appears as a single data point in each figure. Additionally, we omit the PQ-compressed method, as it fails to reach the target recall threshold despite being a vector-based approach. As", + "metadata": {} + }, + { + "text": "shown in Fig. 5, both BM25 and PQ result in poor downstream accuracy.", + "metadata": {} + }, + { + "text": "- \u00b7 IVF(in-memory) : The Inverted File (IVF) index is a widely used cluster-based vector index. We adopt the faiss.IndexIVFFlat implementation. Following best practices from Faiss [52] and prior work [25], we set the number of", + "metadata": {} + }, + { + "text": "centroids to \u221a \ud835\udc41 , where \ud835\udc41 is the size of the datastore. In our setup, we use a 60 \ud835\udc40 datastore, which corresponds to \ud835\udc5b\ud835\udc59\ud835\udc56\ud835\udc60\ud835\udc61 = 8192.", + "metadata": {} + }, + { + "text": "Figure 5. [Main Result]: Comparison of Exact Match and F1 scores for downstream RAG tasks across three methods: keyword search (BM25), PQ-compressed vector search, and our proposed vector search system. 
Our method is configured to achieve a target recall of 90%,", + "metadata": {} + }, + { + "text": "while the PQ baseline is given extended search time to reach its highest possible recall. Here we use Llama-3.2-1B as the generation model.", + "metadata": {} + }, + { + "text": "- \u00b7 DiskANN [59]: DiskANN is a graph-based vector search system optimized for memory efficiency. It keeps only a PQ table in memory and loads full embeddings from disk on demand. We configure it with \ud835\udc40 = 60 and \ud835\udc52\ud835\udc53", + "metadata": {} + }, + { + "text": "\ud835\udc36\ud835\udc5c\ud835\udc5b\ud835\udc60\ud835\udc61\ud835\udc5f\ud835\udc62\ud835\udc50\ud835\udc61\ud835\udc56\ud835\udc5c\ud835\udc5b = 128, following recommended settings [59].\ntarget recall, and report the average latency of 20 queries using the resulting \ud835\udc52\ud835\udc53 value.", + "metadata": {} + }, + { + "text": "Baselines We compare LEANN against the following baseline methods and systems:", + "metadata": {} + }, + { + "text": "- \u00b7 IVF-based recomputation : We adopt the idea of IVFbased recomputation from Edge-RAG [55], where we use online recomputation to avoid storing the full set of embeddings, while using the same construction parameters as IVF (in-memory).", + "metadata": {} + }, + { + "text": "- \u00b7 IVF-Disk : IVF-Disk reduces memory usage by employing memory-mapped files ( mmap ) instead of loading the entire index into memory. We implement it using Faiss's faiss.contrib.ondisk module and adopt the same configuration as in IVF (in-memory).", + "metadata": {} + }, + { + "text": "- \u00b7 PQ Compression [29]: We apply PQ to compress embeddings to match our storage footprint while preserving the graph structure.\n- \u00b7 BM25 [13, 49]: A classical lexical ranking algorithm widely used in keyword-based search.", + "metadata": {} + }, + { + "text": "- \u00b7 HNSW (in-memory) [38]: HNSW is a widely-used stateof-the-art vector index [2, 47]. We use the faiss.IndexHNSWFlat implementation with construction parameters recommended by Faiss: \ud835\udc40 = 30", + "metadata": {} + }, + { + "text": "and \ud835\udc52\ud835\udc53 \ud835\udc36\ud835\udc5c\ud835\udc5b\ud835\udc60\ud835\udc61\ud835\udc5f\ud835\udc62\ud835\udc50\ud835\udc61\ud835\udc56\ud835\udc5c\ud835\udc5b = 128, distinct from the search-time parameter \ud835\udc52\ud835\udc53 .", + "metadata": {} + }, + { + "text": "Fig. 4 presents the storage consumption and end-to-end RAG query latency across all baseline systems and LEANN. The results show that LEANN is the only system that reduces storage to less than 5% of the original raw text size while maintaining reasonable", + "metadata": {} + }, + { + "text": "latency, which we discussed in \u00a72.3, such as achieving 90% recall on GPQA in under 2 seconds.", + "metadata": {} + }, + { + "text": "For latency evaluation, we measure per-query latency under different target recall levels across all combinations of query datasets and hardware platforms. For BM25, we report a single number for its latency value using the default keyword search configuration. Unlike embedding-based search methods,", + "metadata": {} + }, + { + "text": "BM25 is a lexical search technique and does not operate over dense embeddings. As a result, recall is not applicable for evaluating its effectiveness because it is defined based on approximate nearest neighbor retrieval. 
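The evaluation protocol described above (exact search as the ground-truth proxy for Recall@k, plus EM and token-level F1 for downstream answers) can be expressed compactly; the sketch below uses `faiss.IndexFlatIP` for the exact inner-product baseline and treats the variable names as illustrative.

```python
import faiss
import numpy as np

def recall_at_k(xb, xq, ann_ids, k=3):
    """Recall@k with exact inner-product search as the ground-truth proxy.

    xb: (N, d) float32 base embeddings, xq: (Q, d) float32 query embeddings,
    ann_ids: (Q, k) ids returned by the approximate index under evaluation.
    """
    exact = faiss.IndexFlatIP(xb.shape[1])
    exact.add(xb)
    _, gt_ids = exact.search(xq, k)                     # exact top-k per query
    hits = [len(set(gt_ids[i]) & set(ann_ids[i])) for i in range(len(xq))]
    return float(np.mean(hits)) / k

def exact_match(pred, gold):
    """EM: 1 if the prediction matches the reference answer after normalization."""
    return int(pred.strip().lower() == gold.strip().lower())

def token_f1(pred, gold):
    """Token-level F1: partial credit for overlapping tokens."""
    p, g = pred.lower().split(), gold.lower().split()
    common = sum(min(p.count(t), g.count(t)) for t in set(p))
    if common == 0:
        return 0.0
    precision, recall = common / len(p), common / len(g)
    return 2 * precision * recall / (precision + recall)
```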
We omit results for HNSW and IVF on the Mac", + "metadata": {} + }, + { + "text": "platform, as both methods require loading the full dense embedding matrix into memory, which leads to out-of-memory (OOM) errors. Specifically, the Mac system has 128GB of RAM, while the index size exceeds 171GB, as shown", + "metadata": {} + }, + { + "text": "in Tab. 1. We also exclude the PQ-compressed baseline, as it fails to achieve the target recall even with an arbitrarily long search time.", + "metadata": {} + }, + { + "text": "We report storage consumption as a proportion of the raw text size (76 GB), referred to as proportional size in Fig. 4. Since all methods operate on the same fixed datastore based on the RPJ-Wiki dataset, their storage consumption remains constant across hardware", + "metadata": {} + }, + { + "text": "platforms and query datasets. The figure shows that HNSW stores all dense embeddings along with the graph structure, leading to substantial storage overhead. DiskANN incurs even higher overhead due to its sectoraligned design. Each node's data, including its embedding", + "metadata": {} + }, + { + "text": "(768 \u00d7 4 bytes) and edge list (60 neighbors, 60 \u00d7 4 bytes), is padded to a 4 KB SSD sector, resulting in the largest storage footprint among all methods. IVF and IVF-Disk exhibit", + "metadata": {} + }, + { + "text": "similar storage overheads, both dominated by the embedding file. The additional metadata required by IVF (e.g., centroids) is relatively small, typically amounting to only about 1 / \u221a \ud835\udc41 of the total embedding size, and thus contributes little", + "metadata": {} + }, + { + "text": "overhead. For BM25, storage is determined by the vocabulary size and the associated posting lists (i.e., the frequency of each token). In our setting, the size of the BM25 index is comparable to that of the original corpus. LEANN", + "metadata": {} + }, + { + "text": "stores only a compact graph structure, resulting in less than 5% additional storage. Among the baselines, IVF-based recomputation achieves the lowest storage footprint, as it only stores the IVF centroids on disk, which adds little overhead.", + "metadata": {} + }, + { + "text": "Fig. 4 shows that LEANN consistently outperforms EdgeRAG, an IVF-based recomputation method, achieving significantly lower latency, ranging from 21 . 17 \u00d7 to 200 . 60 \u00d7 , across all the", + "metadata": {} + }, + { + "text": "datasets and hardware platforms. This advantage is partly due to the asymptotic difference in recomputation complexity: the number of recomputed chunks in LEANN\n\u221a", + "metadata": {} + }, + { + "text": "grows polylogarithmically with \ud835\udc41 , while it grows as \ud835\udc41 in Edge-RAG[65]. Graph-based baselines such as HNSW and DiskANN represent upper bounds on latency performance, as they store all embeddings in RAM", + "metadata": {} + }, + { + "text": "or on disk. While LEANN trades some latency for substantial storage savings, its performance remains well within an acceptable range. 
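The storage gap discussed above follows from simple per-node arithmetic. The sketch below reproduces the sector-aligned DiskANN calculation using the figures stated in the text (768-dimensional fp32 embeddings, 60 int32 neighbors, 4 KB sectors, 60M chunks); it is an estimate, not a measurement.

```python
# Per-node storage for the sector-aligned DiskANN layout described above.
dim, degree, sector = 768, 60, 4096
node_bytes = dim * 4 + degree * 4                        # 3,312 B of payload per node
padded = ((node_bytes + sector - 1) // sector) * sector  # padded to one 4 KB sector
n_chunks = 60_000_000

print(f"DiskANN index:          ~{padded * n_chunks / 1e9:.0f} GB")  # ~246 GB
print(f"HNSW embeddings alone:  ~{n_chunks * dim * 4 / 1e9:.0f} GB") # ~184 GB, plus the graph
```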
This latency degradation is acceptable for two main reasons as we discussed in \u00a72.3: (1) second-level latency is acceptable for large-scale", + "metadata": {} + }, + { + "text": "local document or image retrieval tasks, and (2) many downstream tasks on local devices, such as image or text generation, typically take over tens of seconds to complete [11, 34], making the additional latency introduced by LEANN reasonable in practice.", + "metadata": {} + }, + { + "text": "Comparing across hardware platforms, A10 achieves a 2 . 28 \u00d7 to 3 . 01 \u00d7 speedup over the Mac, which aligns with their theoretical TFLOPS specifications [12, 43].", + "metadata": {} + }, + { + "text": "We evaluate downstream task accuracy across four query datasets, as shown in Fig. 5. For all methods, we retrieve the top-3 most relevant documents. Our method is configured to achieve a target recall of 90%, while BM25", + "metadata": {} + }, + { + "text": "operates with its default keyword matching configuration. Although the PQ-compressed method fails to meet the target recall defined in \u00a76.2, it still achieves approximately 20% recall across all datasets. We include its downstream performance using these lower-quality", + "metadata": {} + }, + { + "text": "retrieved results.", + "metadata": {} + }, + { + "text": "Finally, we note that when a target recall level (e.g., 90%) is enforced, the downstream accuracy of our method aligns with that of other lossless ANN approaches, confirming that our system does not sacrifice accuracy for storage efficiency.", + "metadata": {} + }, + { + "text": "As illustrated in Fig. 5, our method consistently achieves higher downstream accuracy across all datasets except GPQA. Our ANN method shows limited gains on GPQA due to a distributional mismatch: the RPJ-Wiki datastore is somewhat out-of-distribution", + "metadata": {} + }, + { + "text": "for GPQA, which consists of graduatelevel questions that are poorly supported by the retrieved content from Wikipedia. The accuracy improvement on HotpotQA is also more modest compared to the first two datasets, as HotpotQA requires multi-hop reasoning, while our", + "metadata": {} + }, + { + "text": "current setup performs only single-hop retrieval, limiting its effectiveness for this task.", + "metadata": {} + }, + { + "text": "We conduct comprehensive and detailed ablation studies to analyze the impacts of each methodology we use in LEANN.", + "metadata": {} + }, + { + "text": "Ablation study on latency optimization technique. To evaluate LEANN's latency optimization techniques, we incrementally enable the components introduced in \u00a74, using a fixed target recall across multiple datasets. We begin with a naive graph-based recomputation baseline. Incorporating", + "metadata": {} + }, + { + "text": "Figure 6. [Ablation Study]: Speedup achieved by different optimization techniques described in \u00a74 when evaluated on four datasets to reach the same recall level on the A10 GPU. Two-level refers to the optimization in \u00a74.1, while Batch corresponds to", + "metadata": {} + }, + { + "text": "\u00a74.2.", + "metadata": {} + }, + { + "text": "Figure 7. [Ablation Study]: Comparison of pruned graph quality against two heuristic methods and the upper bound using the datastore in Tab. 1. We vary the target recall and measure the number of nodes each method needs to recompute. 
The dashed gray line represents", + "metadata": {} + }, + { + "text": "the original HNSW graph, which serves as the upper bound, with twice the storage (i.e., average degree) of the others.", + "metadata": {} + }, + { + "text": "the two-level hybrid distance computation strategy from \u00a74.1 yields an average speedup of 1 . 40 \u00d7 , reaching up to 1 . 64 \u00d7 , by reducing the number of nodes requiring recomputation and enabling lightweight distance estimation without querying the embedding", + "metadata": {} + }, + { + "text": "server. Adding the dynamic batching technique further improves GPU utilization during recomputation, increasing the overall speedup to 1 . 76 \u00d7 , with a maximum of 2 . 02 \u00d7 . Among all datasets, HotpotQA benefits the most from batching, as its", + "metadata": {} + }, + { + "text": "longer search queue required to achieve the target recall allows more effective grouping of multi hop requests.", + "metadata": {} + }, + { + "text": "Wecompare our graph pruning algorithm with two heuristic baselines and evaluate graph quality by measuring the number of embeddings that must be fetched to achieve a given recall target, as shown in Fig. 7. In LEANN, the end-to-end latency scales linearly with the number", + "metadata": {} + }, + { + "text": "of embeddings that", + "metadata": {} + }, + { + "text": "Figure 8. [Ablation Study]: Comparison of (out-)degree distributions between the original graph, our pruning method, and two heuristic baselines. Similar to Fig. 7, the gray curve represents the original HNSW graph, which has twice the size of", + "metadata": {} + }, + { + "text": "the others. Only our pruning method successfully preserves the high degree nodes.\nrequire recomputation, making this metric a strong proxy for retrieval latency.", + "metadata": {} + }, + { + "text": "The original graph, constructed on the datastore described in Tab. 1, has an average degree of 18. All three pruning methods, ours and the two baselines, are applied to reduce the total number of edges by half, thereby halving the graph's storage", + "metadata": {} + }, + { + "text": "overhead.", + "metadata": {} + }, + { + "text": "The two heuristic baselines are as follows: (1) Random Prune , which randomly removes 50% of the existing edges from the original graph; and (2) Small M , which directly constrains the maximum out-degree during graph construction, resulting in an average degree", + "metadata": {} + }, + { + "text": "that is half that of the original graph.", + "metadata": {} + }, + { + "text": "We evaluate the performance of different graph structures on the NQ dataset by varying the search queue length \ud835\udc52\ud835\udc53 , aiming to determine the minimum number of embeddings that must be fetched to achieve various recall targets. As shown in Fig. 7, our pruning method introduced in", + "metadata": {} + }, + { + "text": "\u00a75 achieves performance comparable to the original unpruned graph, despite using only half the edges. It outperforms the Random Prune baseline by up to 1 . 18 \u00d7 and the Small M baseline by up to 5 . 76 \u00d7 . We", + "metadata": {} + }, + { + "text": "omit the Small M data points at 94% and 96% recall targets due to their poor performance.", + "metadata": {} + }, + { + "text": "Degree Distribution in Pruned Graphs. To better understand the effectiveness of our pruning strategy, we analyze the out-degree distributions of the original graph, our approach, Random Prune, and Small M. 
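For reference, the two ingredients of this comparison are easy to reproduce on any adjacency structure. The sketch below implements a Random Prune baseline (dropping each out-edge independently, so roughly half the edges in expectation rather than exactly half) and an out-degree histogram of the kind used in Fig. 8; `adj` and the bin edges are placeholders.

```python
import random
import numpy as np

def random_prune(adj, keep_ratio=0.5, seed=0):
    """Random Prune baseline: drop each out-edge with probability 1 - keep_ratio."""
    rng = random.Random(seed)
    return {v: [nb for nb in nbrs if rng.random() < keep_ratio]
            for v, nbrs in adj.items()}

def degree_histogram(adj, bins=(0, 2, 4, 8, 16, 32, 64, 128)):
    """Out-degree histogram, used to compare pruning strategies as in Fig. 8."""
    degrees = np.array([len(nbrs) for nbrs in adj.values()])
    counts, edges = np.histogram(degrees, bins=bins)
    return {f"[{edges[i]}, {edges[i+1]})": int(c) for i, c in enumerate(counts)}
```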
As discussed in \u00a75, our design explicitly aims to preserve high-degree", + "metadata": {} + }, + { + "text": "'hub' nodes. As shown in Fig. 8, it successfully retains a substantial number of such nodes, whereas the other two baselines fail to do so. This underscores the critical role of hub nodes in supporting efficient graph-based vector search, a finding that aligns with", + "metadata": {} + }, + { + "text": "insights from prior work [39, 42, 51].", + "metadata": {} + }, + { + "text": "Figure 9. [Ablation Study]: Latency on the A10 GPU and accuracy of a smaller embedding model evaluated on a 2Mchunk datastore, using a fixed search queue length of ef=50 . The smaller embedding model significantly reduces latency without causing a", + "metadata": {} + }, + { + "text": "substantial drop in downstream accuracy.", + "metadata": {} + }, + { + "text": "Using different embedding model sizes. Since the primary bottleneck of our system lies in the recomputation process, as shown in Fig. 11 later, we further explore the potential for latency reduction by adopting a smaller embedding model. Specifically, we replace the original contriever model", + "metadata": {} + }, + { + "text": "(110M parameters) used in \u00a76.2 with the lightweight GTE-small model [36], which has only 34M parameters. We evaluate performance on a smaller 2M document datastore using a fixed search queue length of ef=50 ,", + "metadata": {} + }, + { + "text": "as shown in Fig. 9. The results show that GTE-small achieves a 2 . 3 \u00d7 speedup while maintaining downstream task accuracy within 2% of the Contriever baseline. This demonstrates the potential of LEANN to further reduce search latency by leveraging a", + "metadata": {} + }, + { + "text": "lightweight embedding model.", + "metadata": {} + }, + { + "text": "Relaxing disk constraint. As discussed in \u00a73, when disk storage constraints are relaxed, LEANN can materialize the embeddings of high-degree nodes to reduce recomputation overhead. This effectively builds an on-disk embedding cache, reducing the number of nodes that need to be", + "metadata": {} + }, + { + "text": "recomputed at query time. For instance, storing just 10% of the original embeddings yields a 1 . 47 \u00d7 speedup, with a cache hit rate of up to 41.9%. This high cache hit rate arises from the skewed access pattern", + "metadata": {} + }, + { + "text": "characteristic of graph-based traversal. However, the observed speedup does not fully align with the hit rate due to the non-negligible loading overhead introduced by SSDs with limited bandwidth.", + "metadata": {} + }, + { + "text": "Graph-based recomputation breakdown. Fig. 11 breaks down the time cost of a single batch in graph-based recomputation into three stages, categorized by the primary system resource used. Each batch aggregates multiple hops of recomputation, as described in \u00a74.2. First,", + "metadata": {} + }, + { + "text": "LEANN performs PQ lookups to select promising nodes, then retrieves and tokenizes the corresponding raw text. The tokenized inputs are sent to the embedding server. Finally, LEANN performs embedding recomputation and distance calculation.", + "metadata": {} + }, + { + "text": "Figure 10. [Ablation Study]: Latency and cache hit rate comparison under varying storage constraints across four datasets. The x-axis indicates total storage size (graph size + cached embeddings on disk) and the corresponding percentage of cached embeddings.", + "metadata": {} + }, + { + "text": "Figure 11. 
[Ablation Study]: Latency breakdown of a batch of requests in graph-based recomputation.", + "metadata": {} + }, + { + "text": "Although embedding recomputation is the primary bottleneck in LEANN, accounting for 76% of total latency, the three stages-spanning I/O, CPU, and GPU resources-can potentially be overlapped to improve overall efficiency. We leave this optimization for future work.", + "metadata": {} + }, + { + "text": "General Vector Search. Vector search primarily follows two paradigms: IVF [33] and proximity graphs [38]. IVF clusters vectors and probes relevant subsets during search, while graph-based methods such as HNSW [38], NSG [21], Vamana", + "metadata": {} + }, + { + "text": "[59], and others [8, 20, 41] connect similar vectors to enable efficient traversal. Graph-based approaches are widely regarded as state of the art due to their favorable trade-offs between accuracy and efficiency [65]. Prior work has explored reducing graph size through learned neighbor", + "metadata": {} + }, + { + "text": "selection [5, 73], but these methods are often impractical due to the high training cost and the need for labeled data.", + "metadata": {} + }, + { + "text": "Resource-Constrained Vector Search. Numerous efforts have aimed to reduce the memory footprint of vector search. Disk-based approaches such as DiskANN [59] store both vectors and graph structures on disk, leveraging in-memory compressed embeddings for navigation. Starling [64] improves I/O efficiency for", + "metadata": {} + }, + { + "text": "disk-resident graphs, while FusionANNS [61] enables cost-effective search through coordinated use of SSD, CPU, and GPU resources. AiSAQ [60], LM-DiskANN [46] further minimizes DRAM usage by storing compressed embeddings directly on disk.", + "metadata": {} + }, + { + "text": "EdgeRAG [55] alleviates memory pressure by generating embeddings online using an IVF-based index. However, it still incurs substantial storage overhead due to the need to maintain large clusters on disk as dictated by its design, and its performance degrades at scale owing to the high recomputation", + "metadata": {} + }, + { + "text": "cost introduced by an inefficient index structure. An alternative approach is embedding compression, such as PQ [29], or more recent methods like RabitQ [23], which offers quantization with theoretical error bounds. Yet, these methods struggle to maintain high search accuracy under tight storage budgets. In", + "metadata": {} + }, + { + "text": "contrast, LEANN integrates on-the-fly embedding recomputation with a graph-based index, incorporating highdegree preserving graph pruning and a specialized traversal algorithm optimized for edge devices.", + "metadata": {} + }, + { + "text": "Vector Search Applications on Edge Devices. On-device vector search enables privacy-preserving, low-latency, and offline capabilities across diverse applications. On-device RAGsystems ground language models in personal document collections while maintaining data privacy [32, 53, 66, 72].", + "metadata": {} + }, + { + "text": "Personalized recommendation systems [69] match user profiles with item embeddings directly on the device, while content-based search over large collections of locally stored images and videos employs efficient vision embedding models [50] to generate vector representations for fast retrieval. 
These applications motivate the design of LEANN to enable", + "metadata": {} + }, + { + "text": "efficient, low-overhead vector search on edge devices.", "metadata": {} }, { @@ -157,75 +1061,2251 @@ "metadata": {} }, { - "text": "The core techniques of LEANN, including on-the-fly recomputation from on-disk data, graph pruning, and a recomputationfriendly search algorithm, are broadly applicable across a range of graph-based approximate nearest neighbor frameworks. While we use HNSW as a concrete implementation example, these techniques are compatible with many other graph structures discussed earlier. Furthermore, LEANN's methodology can be naturally adapted to alternative algorithmic and system designs. For instance, in a DiskANN-style architecture, one can keep PQ-compressed embeddings in memory, store the graph structure on disk, and traverse the graph using PQ results. Instead of loading exact embeddings from disk, embeddings are recomputed on demand, and final reranking is performed using the recomputed values.", + "text": "The core techniques of LEANN, including on-the-fly recomputation from on-disk data, graph pruning, and a recomputationfriendly search algorithm, are broadly applicable across a range of graph-based approximate nearest neighbor frameworks. While we use HNSW as a concrete", "metadata": {} }, { - "text": "LEANN requires computing embeddings for all passages in advance in order to build the graph, after which the embeddings can be discarded. In other words, while LEANN incurs low storage overhead during search, the peak storage usage during index construction can be high. There are multiple potential solutions to tackle storage-efficient index building. One approach is to pre-cluster the data, then embed and construct the graph structure independently within each cluster. This process is performed sequentially, and for each cluster, the embeddings are discarded after the graph is built. During the search, results from all clusters are simply aggregated to form the final result.", + "text": "implementation example, these techniques are compatible with many other graph structures discussed earlier. Furthermore, LEANN's methodology can be naturally adapted to alternative algorithmic and system designs. For instance, in a DiskANN-style architecture, one can keep PQ-compressed embeddings in memory,", "metadata": {} }, { - "text": "For edge deployments, the latency overhead of LEANN is expected to decrease as consumer-grade GPUs continue to advance. For example, the RTX 5090 is projected to deliver over three times the FP16 Tensor throughput (419 TFLOPS [44]) compared to the NVIDIA A10 (125 TFLOPS [43]). In parallel, ongoing progress in compact and efficient embedding models is expected to reduce the cost of the forward pass, further accelerating LEANN and broadening its applicability across diverse hardware platforms. Building on our core algorithm, we envision future optimizations that further reduce latency and improve responsiveness.\nBeyond edge devices, our solution has broader applicability. In datacenter environments, where high-dimensional vectors are used to represent each object (e.g., text or image) for semantic search and other downstream tasks, storage quickly becomes a significant burden. Efficiently managing these representations is therefore essential. LEANN introduces fine-grained, on-demand embedding computation, offering a promising strategy for reducing storage overhead. 
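The pre-clustering idea mentioned above, building and indexing one partition at a time so that full-precision embeddings never accumulate on disk, could look roughly like the following. `embed`, `build_graph_index`, and the per-partition `search` interface are hypothetical stand-ins; this is a sketch of the strategy, not an implemented component.

```python
import heapq

def build_partitioned(partitions, embed, build_graph_index):
    """Build one graph index per partition, discarding embeddings after each build."""
    indexes = []
    for chunk_texts in partitions:           # e.g. pre-clustered groups of chunks
        emb = embed(chunk_texts)             # embeddings exist only for this partition
        indexes.append(build_graph_index(chunk_texts, emb))
        del emb                              # peak storage stays at one partition's worth
    return indexes

def search_partitioned(indexes, query, k=3):
    """Aggregate per-partition results into a global top-k."""
    hits = []
    for idx in indexes:
        hits.extend(idx.search(query, k))    # each hit assumed to be (score, text)
    return heapq.nlargest(k, hits)
```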
We hope that LEANN will inspire further research into addressing storage challenges in large-scale ANN systems, particularly as the adoption of advanced embedding models continues to accelerate in datacenter settings.", + "text": "store the graph structure on disk, and traverse the graph using PQ results. Instead of loading exact embeddings from disk, embeddings are recomputed on demand, and final reranking is performed using the recomputed values.", "metadata": {} }, { - "text": "Similarity search over high-dimensional embeddings underpins many generative AI applications such as retrievalaugmented generation (RAG). However, enabling such capabilities on personal devices remains challenging due to the substantial storage required for storing embeddings and rich vector index metadata. In this paper, we present LEANN, a\nstorage-efficient neural retrieval system that leverages graphbased recomputation . By combining a two-level search algorithm with batch execution , LEANN achieves efficient query processing without storing the full embedding set. Furthermore, we introduce a high degree preserving pruning strategy to reduce graph storage overhead while maintaining accuracy. Together, these techniques enable LEANN to operate with less than 5% of the original data size - achieving a 50 \u00d7 storage reduction compared to existing methods - while maintaining fast and accurate retrieval.", + "text": "LEANN requires computing embeddings for all passages in advance in order to build the graph, after which the embeddings can be discarded. In other words, while LEANN incurs low storage overhead during search, the peak storage usage during index construction can be high. There are multiple potential solutions to tackle", "metadata": {} }, { - "text": "- [1] Akari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. In The Twelfth International Conference on Learning Representations .\n- [3] AWS. 2025. Amazon EC2 G5 instance. https://aws.amazon.com/ec2/i nstance-types/mac/ . [Online; accessed April-2025].\n- [2] Martin Aum\u00fcller, Erik Bernhardsson, and Alexander Faithfull. 2020. ANN-Benchmarks: A benchmarking tool for approximate nearest neighbor algorithms. Information Systems 87 (2020), 101374.\n- [4] AWS. 2025. Amazon EC2 G5 instance. https://aws.amazon.com/ec2/i nstance-types/g5 . [Online; accessed April-2025].\n- [6] Dongqi Cai, Shangguang Wang, Chen Peng, et al. 2024. Recall: Empowering Multimodal Embedding for Edge Devices. arXiv:2409.15342.\n- [5] Dmitry Baranchuk and Artem Babenko. 2019. Towards similarity graphs constructed by deep reinforcement learning. arXiv preprint arXiv:1911.12122 (2019).\n- [7] Pablo Castro. 2024. Announcing cost-effective RAG at scale with Azure AI Search. https://techcommunity.microsoft.com/blog/azure-aiservices-blog/announcing-cost-effective-rag-at-scale-with-azureai-search/4104961 .\n- [9] Davin Choo, Christoph Grunau, Julian Portmann, and V\u00e1clav Rozhon. 2020. k-means++: few more steps yield constant approximation. In International Conference on Machine Learning . PMLR, 1909-1917.", + "text": "storage-efficient index building. One approach is to pre-cluster the data, then embed and construct the graph structure independently within each cluster. This process is performed sequentially, and for each cluster, the embeddings are discarded after the graph is built. 
During the search, results from all clusters are simply aggregated", "metadata": {} }, { - "text": "- [8] Qi Chen, Bing Zhao, Haidong Wang, Mingqin Li, Chuanjie Liu, Zengzhong Li, Mao Yang, and Jingdong Wang. 2021. SPANN: Highlyefficient Billion-scale Approximate Nearest Neighbor Search. In 35th Conference on Neural Information Processing Systems (NeurIPS 2021) .\n- [10] Together Computer. 2023. RedPajama: An Open Source Recipe to Reproduce LLaMA Training Dataset. https://github.com/togethercom puter/RedPajama-Data . Accessed: May 10, 2025.\n- [12] CPU-Monkey. n.d.. Apple M1 Ultra 64-Core GPU. https://www.cpumonkey.com/en/igpu-apple_m1_ultra_64_core . Accessed: 2025-05-10.\n- [11] KVCACHE.AI Contributors. 2025. KTransformers: A Flexible Framework for Experiencing Cutting-edge LLM Inference Optimizations. https://github.com/kvcache-ai/ktransformers . Accessed: 2025-05-14.\n- [13] Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, and Jimmy Lin. 2021. Ms marco: Benchmarking ranking models in the large-data regime. In proceedings of the 44th International ACM SIGIR conference on research and development in information retrieval . 15661576.\n- [15] Weihao Cui, Han Zhao, Quan Chen, Hao Wei, Zirui Li, Deze Zeng, Chao Li, and Minyi Guo. 2022. { DVABatch } : Diversity-aware { MultiEntry }{ Multi-Exit } batching for efficient processing of { DNN } services on { GPUs } . In 2022 USENIX Annual Technical Conference (USENIX ATC 22) . 183-198.", + "text": "to form the final result.", "metadata": {} }, { - "text": "- [14] Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, and Ellen M Voorhees. 2020. Overview of the TREC 2019 deep learning track. arXiv preprint arXiv:2003.07820 (2020).\n- [16] Matthijs Douze. 2020. Indexing 1T Vectors. https://github.com/faceb ookresearch/faiss/wiki/Indexing-1T-vectors .\n- [17] Matthijs Douze, Alexandr Guzhva, Chengqi Deng, Jeff Johnson, Gergely Szilvasy, Pierre-Emmanuel Mazar\u00e9, Maria Lomeli, Lucas Hosseini, and Herv\u00e9 J\u00e9gou. 2025. The Faiss library. arXiv:2401.08281 [cs.LG] https://arxiv.org/abs/2401.08281\n- [19] Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024).\n- [18] Matthijs Douze, Alexandre Sablayrolles, and Herv\u00e9 J\u00e9gou. 2018. Link and code: Fast indexing with graphs and compact regression codes. In Proceedings of the IEEE conference on computer vision and pattern recognition . 3646-3654.\n- [20] Cong Fu, Changxu Wang, and Deng Cai. 2021. High Dimensional Similarity Search with Satellite System Graph: Efficiency, Scalability, and Unindexed Query Compatibility. arXiv:1907.06146 [cs.IR] https: //arxiv.org/abs/1907.06146", + "text": "For edge deployments, the latency overhead of LEANN is expected to decrease as consumer-grade GPUs continue to advance. For example, the RTX 5090 is projected to deliver over three times the FP16 Tensor throughput (419 TFLOPS [44]) compared", "metadata": {} }, { - "text": "- [22] Jianyang Gao and Cheng Long. 2023. High-Dimensional Approximate Nearest Neighbor Search: with Reliable and Efficient Distance Comparison Operations. Proc. ACM Manag. Data 1, 2, Article 137 (June 2023), 27 pages. https://doi.org/10.1145/3589282\n- [21] Cong Fu, Chao Xiang, Changxu Wang, and Deng Cai. 2019. Fast approximate nearest neighbor search with the navigating spreadingout graph. Proc. VLDB Endow. 12, 5 (Jan. 2019), 461-474. 
https: //doi.org/10.14778/3303753.3303754\n- [23] Jianyang Gao and Cheng Long. 2024. RabitQ: Quantizing HighDimensional Vectors with a Theoretical Error Bound for Approximate Nearest Neighbor Search. In Proceedings of the ACM on Management of Data (SIGMOD '24) , Vol. 2. Article 167.\n- [25] Alexandra Henzinger, Emma Dauterman, Henry Corrigan-Gibbs, and Nickolai Zeldovich. 2023. Private Web Search with Tiptoe. Cryptology ePrint Archive, Paper 2023/1438. https://doi.org/10.1145/3600006.36 13134\n- [24] Yanzhang He, Tara N. Sainath, Rohit Prabhavalkar, Ian McGraw, Raziel Alvarez, Ding Zhao, et al. 2019. Streaming End-to-End Speech Recognition for Mobile Devices. In Proc. IEEE ICASSP . 6381-6385.", + "text": "to the NVIDIA A10 (125 TFLOPS [43]). In parallel, ongoing progress in compact and efficient embedding models is expected to reduce the cost of the forward pass, further accelerating LEANN and broadening its applicability across diverse hardware platforms. Building on our core", "metadata": {} }, { - "text": "- [26] Piotr Indyk and Rajeev Motwani. 1998. Approximate nearest neighbors: towards removing the curse of dimensionality. In Proceedings of the Thirtieth Annual ACM Symposium on Theory of Computing (Dallas, Texas, USA) (STOC '98) . Association for Computing Machinery, New York, NY, USA, 604-613. https://doi.org/10.1145/276698.276876\n- [28] Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large scale distantly supervised challenge dataset for reading comprehension. arXiv preprint arXiv:1705.03551 (2017).\n- [27] Gautier Izacard, Mathilde Caron, Lucas Hosseini, Sebastian Riedel, Piotr Bojanowski, Armand Joulin, and Edouard Grave. 2021. Unsupervised dense information retrieval with contrastive learning. arXiv preprint arXiv:2112.09118 (2021).\n- [29] Herve J\u00e9gou, Matthijs Douze, and Cordelia Schmid. 2011. Product Quantization for Nearest Neighbor Search. IEEE Transactions on Pattern Analysis and Machine Intelligence 33, 1 (2011), 117-128. https://doi.or g/10.1109/TPAMI.2010.57", + "text": "algorithm, we envision future optimizations that further reduce latency and improve responsiveness.", "metadata": {} }, { - "text": "- [31] Tom Kwiatkowski, Jennimaria Palomaki, Olivia Redfield, Michael Collins, Ankur Parikh, Chris Alberti, Danielle Epstein, Illia Polosukhin, Jacob Devlin, Kenton Lee, Kristina Toutanova, Llion Jones, Matthew Kelcey, Ming-Wei Chang, Andrew M. Dai, Jakob Uszkoreit, Quoc Le, and Slav Petrov. 2019. Natural Questions: A Benchmark for Question Answering Research. Transactions of the Association for Computational Linguistics 7 (2019), 452-466. https://doi.org/10.1162/tacl_a_00276\n- [30] Vladimir Karpukhin, Barlas Oguz, Sewon Min, Patrick SH Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 2020. Dense Passage Retrieval for Open-Domain Question Answering.. In EMNLP (1) . 6769-6781.\n- [32] Chanhee Lee, Deeksha Prahlad, Dongha Kim, and Hokeun Kim. 2024. Work-in-Progress: On-device Retrieval Augmented Generation with", + "text": "Beyond edge devices, our solution has broader applicability. In datacenter environments, where high-dimensional vectors are used to represent each object (e.g., text or image) for semantic search and other downstream tasks, storage quickly becomes a significant burden. Efficiently managing these representations is therefore essential.", "metadata": {} }, { - "text": "- Knowledge Graphs for Personalized Large Language Models. In 2024 International Conference on Embedded Software (EMSOFT) . 1-1. 
https: //doi.org/10.1109/EMSOFT60242.2024.00006\n- [34] Muyang Li, Yujun Lin, Zhekai Zhang, Tianle Cai, Xiuyu Li, Junxian Guo, Enze Xie, Chenlin Meng, Jun-Yan Zhu, and Song Han. 2024. Svdqunat: Absorbing outliers by low-rank components for 4-bit diffusion models. arXiv preprint arXiv:2411.05007 (2024).\n- [33] Victor Lempitsky. 2012. The inverted multi-index. In Proceedings of the 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (CVPR '12) . IEEE Computer Society, USA, 3069-3076.\n- [35] Wen Li, Ying Zhang, Yifang Sun, Wei Wang, Mingjie Li, Wenjie Zhang, and Xuemin Lin. 2019. Approximate nearest neighbor search on high dimensional data-experiments, analyses, and improvement. IEEE Transactions on Knowledge and Data Engineering 32, 8 (2019), 14751488.\n- [37] Jimmy Lin, Rodrigo Nogueira, and Andrew Yates. 2022. Pretrained transformers for text ranking: Bert and beyond . Springer Nature.\n- [36] Zehan Li, Xin Zhang, Yanzhao Zhang, Dingkun Long, Pengjun Xie, and Meishan Zhang. 2023. Towards general text embeddings with multistage contrastive learning. arXiv preprint arXiv:2308.03281 (2023).\n- [38] Yu A Malkov and Dmitry A Yashunin. 2018. Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE transactions on pattern analysis and machine intelligence 42, 4 (2018), 824-836.", + "text": "LEANN introduces fine-grained, on-demand embedding computation, offering a promising strategy for reducing storage overhead. We hope that LEANN will inspire further research into addressing storage challenges in large-scale ANN systems, particularly as the adoption of advanced embedding models continues to accelerate in datacenter settings.", "metadata": {} }, { - "text": "- [40] Microsoft Learn. 2025. Vector index size and staying under limits . https: //learn.microsoft.com/en-us/azure/search/vector-search-indexsize?utm_source=chatgpt.com&tabs=portal-vector-quota\n- [39] Magdalen Dobson Manohar, Zheqi Shen, Guy Blelloch, Laxman Dhulipala, Yan Gu, Harsha Vardhan Simhadri, and Yihan Sun. 2024. Parlayann: Scalable and deterministic parallel graph-based approximate nearest neighbor search algorithms. In Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming . 270-285.\n- [41] Javier Vargas Munoz, Marcos A Gon\u00e7alves, Zanoni Dias, and Ricardo da S Torres. 2019. Hierarchical clustering-based graphs for large scale approximate nearest neighbor search. Pattern Recognition 96 (2019), 106970.\n- [43] NVIDIA. n.d.. NVIDIA A10 Tensor Core GPU. https://www.nvidia.c om/en-us/data-center/products/a10-gpu/ . Accessed: 2025-05-10.\n- [42] Blaise Munyampirwa, Vihan Lakshman, and Benjamin Coleman. 2024. Down with the Hierarchy: The'H'in HNSW Stands for\" Hubs\". arXiv preprint arXiv:2412.01940 (2024).\n- [44] NVIDIA Corporation. 2024. NVIDIA RTX Blackwell GPU Architecture. https://images.nvidia.com/aem-dam/Solutions/geforce/blackwell/nv idia-rtx-blackwell-gpu-architecture.pdf . Whitepaper.\n- [46] Yu Pan, Jianxin Sun, and Hongfeng Yu. 2023. LM-DiskANN: Low Memory Footprint in Disk-Native Dynamic Graph-Based ANN Indexing. In 2023 IEEE International Conference on Big Data (BigData) . 5987-5996. https://doi.org/10.1109/BigData59044.2023.10386517", + "text": "Similarity search over high-dimensional embeddings underpins many generative AI applications such as retrievalaugmented generation (RAG). 
However, enabling such capabilities on personal devices remains challenging due to the substantial storage required for storing embeddings and rich vector index metadata. In this paper, we present LEANN, a", "metadata": {} }, { - "text": "- [45] ObjectBox Ltd. 2024. Edge AI: The era of on-device AI. https://obje ctbox.io/on-device-vector-databases-and-edge-ai/ . Accessed May 2025.\n- [47] Pinecone. n.d.. Vector Search: Hierarchical Navigable Small Worlds. https://www.pinecone.io/learn/series/faiss/hnsw/ . Accessed: 2025-05-10.\n- [49] Navid Rekabsaz, Oleg Lesota, Markus Schedl, Jon Brassey, and Carsten Eickhoff. 2021. TripClick: the log files of a large health web search engine. In Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval . 2507-2513.\n- [48] David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, and Samuel R Bowman. 2024. Gpqa: A graduate-level google-proof q&a benchmark. In First Conference on Language Modeling .", + "text": "storage-efficient neural retrieval system that leverages graphbased recomputation . By combining a two-level search algorithm with batch execution , LEANN achieves efficient query processing without storing the full embedding set. Furthermore, we introduce a high degree preserving pruning strategy to reduce graph storage overhead while maintaining accuracy. Together, these techniques enable", "metadata": {} }, { - "text": ", 1 = Humphrey Shi. 2023. Efficient Neural Networks: From Algorithm Design to Practical Mobile Deployments. CVPR 2023 Tutorial. https: //snap-research.github.io/efficient-nn-tutorial/ .. [51], 1 = Jie Ren, Minjia Zhang, and Dong Li. 2020. HM-ANN: efficient billion- point nearest neighbor search on heterogeneous memory. In Proceed- ings of the 34th International Conference on Neural Information Process- ing Systems (Vancouver, BC, Canada) (NIPS '20) . Curran Associates Inc., Red Hook, NY, USA, Article 895, 13 pages.. [52], 1 = Facebook AI Research. n.d.. Guidelines to Choose an Index. https: //github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an- index/28074dc0ddc733f84b06fa4d99b3f6e2ef65613d#if-below-1m- vectors-ivfx . Accessed: 2025-05-10.. [53], 1 = Michael J. Ryan, Danmei Xu, Chris Nivera, and Daniel Campos. 2024. EnronQA: Towards Personalized RAG over Private Documents. arXiv preprint arXiv:2505.00263 (2024).. [54], 1 = Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis, Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip- filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021).. [55], 1 = Korakit Seemakhupt, Sihang Liu, and Samira Khan. 2024. EdgeRAG: Online-Indexed RAG for Edge Devices. arXiv preprint arXiv:2412.21023 (2024).. [56],", + "text": "LEANN to operate with less than 5% of the original data size - achieving a 50 \u00d7 storage reduction compared to existing methods - while maintaining fast and accurate retrieval.", "metadata": {} }, { - "text": "1 = Daniel Severo, Giuseppe Ottaviano, Matthew Muckley, Karen Ullrich, and Matthijs Douze. 2025. Lossless Compression of Vector IDs for Approximate Nearest Neighbor Search. arXiv preprint arXiv:2501.10479 (2025).. [57], 1 = Rulin Shao, Jacqueline He, Akari Asai, Weijia Shi, TimDettmers,Sewon Min, Luke Zettlemoyer, and Pang Wei WKoh. 2024. Scaling retrieval- based language models with a trillion-token datastore. 
Advances in Neural Information Processing Systems 37 (2024), 91260-91299.. [58], 1 = Michael Shen, Muhammad Umar, Kiwan Maeng, G. Edward Suh, and Udit Gupta. 2024. Towards Understanding Systems Trade-offs in Retrieval-Augmented Generation Model Inference. arXiv:2412.11854 [cs.AR] https://arxiv.org/abs/2412.11854. [59], 1 = Suhas Jayaram Subramanya, Devvrit, Rohan Kadekodi, Ravishankar Krishaswamy, and Harsha Vardhan Simhadri. 2019. DiskANN: fast accurate billion-point nearest neighbor search on a single node . Curran Associates Inc., Red Hook, NY, USA.. [60], 1 = Kento Tatsuno, Daisuke Miyashita, Taiga Ikeda, Kiyoshi Ishiyama, Kazunari Sumiyoshi, and Jun Deguchi. 2024. AiSAQ: All-in-Storage ANNS with Product Quantization for DRAM-free Information Re- trieval. arXiv preprint arXiv:2404.06004 (2024). arXiv:2404.06004 https://arxiv.org/abs/2404.06004. [61], 1 = Bing Tian, Haikun Liu, Yuhang Tang, Shihai Xiao, Zhuohui Duan, Xiaofei Liao, Hai Jin, Xuecang Zhang,", + "text": "- [1] Akari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, and Hannaneh Hajishirzi. 2023. Self-rag: Learning to retrieve, generate, and critique through self-reflection. In The Twelfth International", "metadata": {} }, { - "text": "Junhua Zhu, and Yu Zhang. 2025. Towards High-throughput and Low-latency Billion-scale Vector Search via CPU/GPU Collaborative Filtering and Re-ranking. In 23rd USENIX Conference on File and Storage Technologies (FAST 25) . USENIX Association, Santa Clara, CA, 171-185. https://www.usenix.org/con. [62], 1 = ference/fast25/presentation/tian-bing Vincent Totino. 2025. Phone Storage: How Much Do You Really Need? https://www.optimum.com/articles/mobile/choosing-phone- storage-amount-needs-guide. [63], 1 = Vincent Totino. 2025. Phone Storage: How Much Do You Really Need? https://www.optimum.com/articles/mobile/choosing-phone- storage-amount-needs-guide Accessed May 15, 2025.. [64], 1 = Mengzhao Wang, Weizhi Xu, Xiaomeng Yi, Songlin Wu, Zhangyang Peng, Xiangyu Ke, Yunjun Gao, Xiaoliang Xu, Rentong Guo, and Charles Xie. 2024. Starling: AnI/O-Efficient Disk-Resident Graph Index Framework for High-Dimensional Vector Similarity Search on Data Segment. In Proceedings of the ACM on Management of Data (SIGMOD", + "text": "Conference on Learning Representations .\n- [3] AWS. 2025. Amazon EC2 G5 instance. https://aws.amazon.com/ec2/i nstance-types/mac/ . [Online; accessed April-2025].", "metadata": {} }, { - "text": "- [65] Peng Wang, Chen Wang, Xiaofang Lin, Wenjie Zhang, and Qing He. 2021. A Comprehensive Survey and Experimental Comparison of Graph-Based Approximate Nearest Neighbor Search. Proc. VLDB Endow. 14, 11 (2021), 1964-1978. https://doi.org/10.14778/3476249.347 6258\n- [67] Zhenliang Xue, Yixin Song, et al. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv preprint arXiv:2406.06282 (2024).\n- [66] Zijie J Wang and Duen Horng Chau. 2024. MeMemo: On-device Retrieval Augmentation for Private and Personalized Text Generation. In Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval . 2765-2770.\n- [68] Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William W Cohen, Ruslan Salakhutdinov, and Christopher D Manning. 2018. HotpotQA: A dataset for diverse, explainable multi-hop question answering. arXiv preprint arXiv:1809.09600 (2018).\n- [70] Weiping Yu, Ningyi Liao, Siqiang Luo, and Junfeng Liu. 2025. RAGDoll: Efficient Offloading-based Online RAG System on a Single GPU. 
arXiv preprint arXiv:2504.15302 (2025).\n- [69] Hongzhi Yin, Tong Chen, Liang Qu, and Bin Cui. 2024. On-Device Recommender Systems: A Comprehensive Survey. arXiv preprint arXiv:2401.11441 (2024).", + "text": "- [2] Martin Aum\u00fcller, Erik Bernhardsson, and Alexander Faithfull. 2020. ANN-Benchmarks: A benchmarking tool for approximate nearest neighbor algorithms. Information Systems 87 (2020), 101374.", "metadata": {} }, { - "text": "- [71] Hamed Zamani, Johanne R Trippas, Jeff Dalton, Filip Radlinski, et al. 2023. Conversational information seeking. Foundations and Trends\u00ae in Information Retrieval 17, 3-4 (2023), 244-456.\n- [73] Minjia Zhang, Wenhan Wang, and Yuxiong He. 2020. Learning to Anneal and Prune Proximity Graphs for Similarity Search. In International Conference on Learning Representations (ICLR) . Available at https://openreview.net/forum?id=HJlXC3EtwB .\n- [72] Saber Zerhoudi and Michael Granitzer. 2024. PersonaRAG: Enhancing Retrieval-Augmented Generation Systems with User-Centric Agents. arXiv preprint arXiv:2407.09394 (2024).\n- [74] Yanhao Zhang, Pan Pan, Yun Zheng, Kang Zhao, Yingya Zhang, Xiaofeng Ren, and Rong Jin. 2018. Visual search at alibaba. In Proceedings of the 24th ACM SIGKDD international conference on knowledge discovery & data mining . 993-1001.\n- [76] Kan Zhu, Yilong Zhao, Liangyu Zhao, Gefei Zuo, Yile Gu, Dedong Xie, Yufei Gao, Qinyu Xu, Tian Tang, Zihao Ye, et al. 2024. Nanoflow: Towards optimal large language model serving throughput. arXiv preprint arXiv:2408.12757 (2024).\n- [75] Jinhao Zhu, Liana Patel, Matei Zaharia, and Raluca Ada Popa. 2024. Compass: Encrypted Semantic Search with High Accuracy. Cryptology ePrint Archive, Paper 2024/1255. https://eprint.iacr.org/2024/1255\n- [77] Zilliz AI FAQ. 2025. How much memory overhead is typically introduced by indexes like HNSW or IVF? Accessed May 2025.", + "text": "- [4] AWS. 2025. Amazon EC2 G5 instance. https://aws.amazon.com/ec2/i nstance-types/g5 . [Online; accessed April-2025].", + "metadata": {} + }, + { + "text": "- [6] Dongqi Cai, Shangguang Wang, Chen Peng, et al. 2024. Recall: Empowering Multimodal Embedding for Edge Devices. arXiv:2409.15342.", + "metadata": {} + }, + { + "text": "- [5] Dmitry Baranchuk and Artem Babenko. 2019. Towards similarity graphs constructed by deep reinforcement learning. arXiv preprint arXiv:1911.12122 (2019).", + "metadata": {} + }, + { + "text": "- [7] Pablo Castro. 2024. Announcing cost-effective RAG at scale with Azure AI Search. https://techcommunity.microsoft.com/blog/azure-aiservices-blog/announcing-cost-effective-rag-at-scale-with-azureai-search/4104961 .", + "metadata": {} + }, + { + "text": "- [9] Davin Choo, Christoph Grunau, Julian Portmann, and V\u00e1clav Rozhon. 2020. k-means++: few more steps yield constant approximation. In International Conference on Machine Learning . PMLR,", + "metadata": {} + }, + { + "text": "1909-1917.", + "metadata": {} + }, + { + "text": "- [8] Qi Chen, Bing Zhao, Haidong Wang, Mingqin Li, Chuanjie Liu, Zengzhong Li, Mao Yang, and Jingdong Wang. 2021. SPANN: Highlyefficient Billion-scale Approximate Nearest Neighbor Search. In", + "metadata": {} + }, + { + "text": "35th Conference on Neural Information Processing Systems (NeurIPS 2021) .", + "metadata": {} + }, + { + "text": "- [10] Together Computer. 2023. RedPajama: An Open Source Recipe to Reproduce LLaMA Training Dataset. https://github.com/togethercom puter/RedPajama-Data . Accessed: May 10, 2025.", + "metadata": {} + }, + { + "text": "- [12] CPU-Monkey. n.d.. 
Apple M1 Ultra 64-Core GPU. https://www.cpumonkey.com/en/igpu-apple_m1_ultra_64_core . Accessed: 2025-05-10.", + "metadata": {} + }, + { + "text": "- [11] KVCACHE.AI Contributors. 2025. KTransformers: A Flexible Framework for Experiencing Cutting-edge LLM Inference Optimizations. https://github.com/kvcache-ai/ktransformers . Accessed:", + "metadata": {} + }, + { + "text": "2025-05-14.", + "metadata": {} + }, + { + "text": "- [13] Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, and Jimmy Lin. 2021. Ms marco: Benchmarking ranking models in the large-data regime. In proceedings of the 44th International ACM SIGIR conference", + "metadata": {} + }, + { + "text": "on research and development in information retrieval . 15661576.", + "metadata": {} + }, + { + "text": "- [15] Weihao Cui, Han Zhao, Quan Chen, Hao Wei, Zirui Li, Deze Zeng, Chao Li, and Minyi Guo. 2022. { DVABatch } : Diversity-aware { MultiEntry }{ Multi-Exit", + "metadata": {} + }, + { + "text": "} batching for efficient processing of { DNN } services on { GPUs } . In 2022 USENIX Annual Technical Conference (USENIX ATC 22) . 183-198.", + "metadata": {} + }, + { + "text": "- [14] Nick Craswell, Bhaskar Mitra, Emine Yilmaz, Daniel Campos, and Ellen M Voorhees. 2020. Overview of the TREC 2019 deep learning track. arXiv preprint", + "metadata": {} + }, + { + "text": "arXiv:2003.07820 (2020).", + "metadata": {} + }, + { + "text": "- [16] Matthijs Douze. 2020. Indexing 1T Vectors. https://github.com/faceb ookresearch/faiss/wiki/Indexing-1T-vectors .", + "metadata": {} + }, + { + "text": "- [17] Matthijs Douze, Alexandr Guzhva, Chengqi Deng, Jeff Johnson, Gergely Szilvasy, Pierre-Emmanuel Mazar\u00e9, Maria Lomeli, Lucas Hosseini, and Herv\u00e9 J\u00e9gou.", + "metadata": {} + }, + { + "text": "2025. The Faiss library. arXiv:2401.08281 [cs.LG] https://arxiv.org/abs/2401.08281", + "metadata": {} + }, + { + "text": "- [19] Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et", + "metadata": {} + }, + { + "text": "al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024).", + "metadata": {} + }, + { + "text": "- [18] Matthijs Douze, Alexandre Sablayrolles, and Herv\u00e9 J\u00e9gou. 2018. Link and code: Fast indexing with graphs and compact regression codes. In Proceedings of the IEEE conference on computer vision and pattern recognition .", + "metadata": {} + }, + { + "text": "3646-3654.", + "metadata": {} + }, + { + "text": "- [20] Cong Fu, Changxu Wang, and Deng Cai. 2021. High Dimensional Similarity Search with Satellite System Graph: Efficiency, Scalability, and Unindexed Query Compatibility. arXiv:1907.06146", + "metadata": {} + }, + { + "text": "[cs.IR] https: //arxiv.org/abs/1907.06146", + "metadata": {} + }, + { + "text": "- [22] Jianyang Gao and Cheng Long. 2023. High-Dimensional Approximate Nearest Neighbor Search: with Reliable and Efficient Distance Comparison Operations. Proc. ACM Manag. Data 1, 2, Article 137 (June", + "metadata": {} + }, + { + "text": "2023), 27 pages. https://doi.org/10.1145/3589282", + "metadata": {} + }, + { + "text": "- [21] Cong Fu, Chao Xiang, Changxu Wang, and Deng Cai. 2019. Fast approximate nearest neighbor search with the navigating spreadingout graph. Proc. VLDB Endow. 12, 5 (Jan.", + "metadata": {} + }, + { + "text": "2019), 461-474. https: //doi.org/10.14778/3303753.3303754", + "metadata": {} + }, + { + "text": "- [23] Jianyang Gao and Cheng Long. 2024. 
RabitQ: Quantizing HighDimensional Vectors with a Theoretical Error Bound for Approximate Nearest Neighbor Search. In Proceedings of the ACM on Management of Data (SIGMOD '24) ,", + "metadata": {} + }, + { + "text": "Vol. 2. Article 167.", + "metadata": {} + }, + { + "text": "- [25] Alexandra Henzinger, Emma Dauterman, Henry Corrigan-Gibbs, and Nickolai Zeldovich. 2023. Private Web Search with Tiptoe. Cryptology ePrint Archive, Paper", + "metadata": {} + }, + { + "text": "2023/1438. https://doi.org/10.1145/3600006.36 13134", + "metadata": {} + }, + { + "text": "- [24] Yanzhang He, Tara N. Sainath, Rohit Prabhavalkar, Ian McGraw, Raziel Alvarez, Ding Zhao, et al. 2019. Streaming End-to-End Speech Recognition for Mobile Devices. In Proc. IEEE", + "metadata": {} + }, + { + "text": "ICASSP . 6381-6385.", + "metadata": {} + }, + { + "text": "- [26] Piotr Indyk and Rajeev Motwani. 1998. Approximate nearest neighbors: towards removing the curse of dimensionality. In Proceedings of the Thirtieth Annual ACM Symposium on Theory of Computing (Dallas, Texas, USA) (STOC", + "metadata": {} + }, + { + "text": "'98) . Association for Computing Machinery, New York, NY, USA, 604-613. https://doi.org/10.1145/276698.276876", + "metadata": {} + }, + { + "text": "- [28] Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer. 2017. Triviaqa: A large scale distantly supervised challenge dataset for reading comprehension. arXiv preprint", + "metadata": {} + }, + { + "text": "arXiv:1705.03551 (2017).", + "metadata": {} + }, + { + "text": "- [27] Gautier Izacard, Mathilde Caron, Lucas Hosseini, Sebastian Riedel, Piotr Bojanowski, Armand Joulin, and Edouard Grave. 2021. Unsupervised dense information retrieval with contrastive learning.", + "metadata": {} + }, + { + "text": "arXiv preprint arXiv:2112.09118 (2021).", + "metadata": {} + }, + { + "text": "- [29] Herve J\u00e9gou, Matthijs Douze, and Cordelia Schmid. 2011. Product Quantization for Nearest Neighbor Search. IEEE Transactions on Pattern Analysis and Machine Intelligence 33, 1 (2011),", + "metadata": {} + }, + { + "text": "117-128. https://doi.or g/10.1109/TPAMI.2010.57", + "metadata": {} + }, + { + "text": "- [31] Tom Kwiatkowski, Jennimaria Palomaki, Olivia Redfield, Michael Collins, Ankur Parikh, Chris Alberti, Danielle Epstein, Illia Polosukhin, Jacob Devlin, Kenton Lee, Kristina Toutanova, Llion Jones, Matthew", + "metadata": {} + }, + { + "text": "Kelcey, Ming-Wei Chang, Andrew M. Dai, Jakob Uszkoreit, Quoc Le, and Slav Petrov. 2019. Natural Questions: A Benchmark for Question Answering Research. Transactions of the Association for Computational Linguistics 7", + "metadata": {} + }, + { + "text": "(2019), 452-466. https://doi.org/10.1162/tacl_a_00276", + "metadata": {} + }, + { + "text": "- [30] Vladimir Karpukhin, Barlas Oguz, Sewon Min, Patrick SH Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 2020. Dense Passage Retrieval for Open-Domain Question Answering..", + "metadata": {} + }, + { + "text": "In EMNLP (1) . 6769-6781.", + "metadata": {} + }, + { + "text": "- [32] Chanhee Lee, Deeksha Prahlad, Dongha Kim, and Hokeun Kim. 2024. Work-in-Progress: On-device Retrieval Augmented Generation with", + "metadata": {} + }, + { + "text": "- Knowledge Graphs for Personalized Large Language Models. In 2024 International Conference on Embedded Software (EMSOFT) . 1-1. 
https:", + "metadata": {} + }, + { + "text": "//doi.org/10.1109/EMSOFT60242.2024.00006", + "metadata": {} + }, + { + "text": "- [34] Muyang Li, Yujun Lin, Zhekai Zhang, Tianle Cai, Xiuyu Li, Junxian Guo, Enze Xie, Chenlin Meng, Jun-Yan Zhu, and Song Han. 2024.", + "metadata": {} + }, + { + "text": "Svdqunat: Absorbing outliers by low-rank components for 4-bit diffusion models. arXiv preprint arXiv:2411.05007 (2024).", + "metadata": {} + }, + { + "text": "- [33] Victor Lempitsky. 2012. The inverted multi-index. In Proceedings of the 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (CVPR '12) . IEEE Computer Society, USA,", + "metadata": {} + }, + { + "text": "3069-3076.", + "metadata": {} + }, + { + "text": "- [35] Wen Li, Ying Zhang, Yifang Sun, Wei Wang, Mingjie Li, Wenjie Zhang, and Xuemin Lin. 2019. Approximate nearest neighbor search on high dimensional data-experiments, analyses, and improvement. IEEE Transactions on Knowledge and Data Engineering", + "metadata": {} + }, + { + "text": "32, 8 (2019), 14751488.\n- [37] Jimmy Lin, Rodrigo Nogueira, and Andrew Yates. 2022. Pretrained transformers for text ranking: Bert and beyond . Springer Nature.", + "metadata": {} + }, + { + "text": "- [36] Zehan Li, Xin Zhang, Yanzhao Zhang, Dingkun Long, Pengjun Xie, and Meishan Zhang. 2023. Towards general text embeddings with multistage contrastive learning. arXiv preprint", + "metadata": {} + }, + { + "text": "arXiv:2308.03281 (2023).", + "metadata": {} + }, + { + "text": "- [38] Yu A Malkov and Dmitry A Yashunin. 2018. Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE transactions on pattern analysis and machine intelligence 42, 4 (2018),", + "metadata": {} + }, + { + "text": "824-836.\n- [40] Microsoft Learn. 2025. Vector index size and staying under limits . https: //learn.microsoft.com/en-us/azure/search/vector-search-indexsize?utm_source=chatgpt.com&tabs=portal-vector-quota", + "metadata": {} + }, + { + "text": "- [39] Magdalen Dobson Manohar, Zheqi Shen, Guy Blelloch, Laxman Dhulipala, Yan Gu, Harsha Vardhan Simhadri, and Yihan Sun. 2024. Parlayann: Scalable", + "metadata": {} + }, + { + "text": "and deterministic parallel graph-based approximate nearest neighbor search algorithms. In Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming . 270-285.", + "metadata": {} + }, + { + "text": "- [41] Javier Vargas Munoz, Marcos A Gon\u00e7alves, Zanoni Dias, and Ricardo da S Torres. 2019. Hierarchical clustering-based graphs for large scale approximate nearest neighbor search. Pattern Recognition 96 (2019),", + "metadata": {} + }, + { + "text": "106970.\n- [43] NVIDIA. n.d.. NVIDIA A10 Tensor Core GPU. https://www.nvidia.c om/en-us/data-center/products/a10-gpu/ . Accessed: 2025-05-10.", + "metadata": {} + }, + { + "text": "- [42] Blaise Munyampirwa, Vihan Lakshman, and Benjamin Coleman. 2024. Down with the Hierarchy: The'H'in HNSW Stands for\" Hubs\". arXiv preprint", + "metadata": {} + }, + { + "text": "arXiv:2412.01940 (2024).", + "metadata": {} + }, + { + "text": "- [44] NVIDIA Corporation. 2024. NVIDIA RTX Blackwell GPU Architecture. https://images.nvidia.com/aem-dam/Solutions/geforce/blackwell/nv idia-rtx-blackwell-gpu-architecture.pdf . Whitepaper.", + "metadata": {} + }, + { + "text": "- [46] Yu Pan, Jianxin Sun, and Hongfeng Yu. 2023. LM-DiskANN: Low Memory Footprint in Disk-Native Dynamic Graph-Based ANN Indexing. 
In 2023 IEEE International Conference on Big Data (BigData) .", + "metadata": {} + }, + { + "text": "5987-5996. https://doi.org/10.1109/BigData59044.2023.10386517", + "metadata": {} + }, + { + "text": "- [45] ObjectBox Ltd. 2024. Edge AI: The era of on-device AI. https://obje ctbox.io/on-device-vector-databases-and-edge-ai/ . Accessed May 2025.", + "metadata": {} + }, + { + "text": "- [47] Pinecone. n.d.. Vector Search: Hierarchical Navigable Small Worlds. https://www.pinecone.io/learn/series/faiss/hnsw/ . Accessed: 2025-05-10.", + "metadata": {} + }, + { + "text": "- [49] Navid Rekabsaz, Oleg Lesota, Markus Schedl, Jon Brassey, and Carsten Eickhoff. 2021. TripClick: the log files of a large health web search engine. In Proceedings of the 44th International", + "metadata": {} + }, + { + "text": "ACM SIGIR Conference on Research and Development in Information Retrieval . 2507-2513.", + "metadata": {} + }, + { + "text": "- [48] David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, and Samuel R Bowman. 2024. Gpqa: A graduate-level google-proof q&a benchmark. In First Conference on", + "metadata": {} + }, + { + "text": "Language Modeling .", + "metadata": {} + }, + { + "text": ", 1 = Humphrey Shi. 2023. Efficient Neural Networks: From Algorithm Design to Practical Mobile Deployments. CVPR 2023 Tutorial. https: //snap-research.github.io/efficient-nn-tutorial/ .. [51], 1 = Jie", + "metadata": {} + }, + { + "text": "Ren, Minjia Zhang, and Dong Li. 2020. HM-ANN: efficient billion- point nearest neighbor search on heterogeneous memory. In Proceed- ings of the 34th International Conference on Neural Information Process- ing Systems (Vancouver, BC, Canada)", + "metadata": {} + }, + { + "text": "(NIPS '20) . Curran Associates Inc., Red Hook, NY, USA, Article 895, 13 pages.. [52], 1 = Facebook AI Research. n.d.. Guidelines to Choose an Index. https:", + "metadata": {} + }, + { + "text": "//github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an- index/28074dc0ddc733f84b06fa4d99b3f6e2ef65613d#if-below-1m-", + "metadata": {} + }, + { + "text": "vectors-ivfx . Accessed: 2025-05-10.. [53], 1 = Michael J. Ryan, Danmei Xu, Chris Nivera, and Daniel Campos. 2024. EnronQA: Towards Personalized RAG over", + "metadata": {} + }, + { + "text": "Private Documents. arXiv preprint arXiv:2505.00263 (2024).. [54], 1 = Christoph Schuhmann, Richard Vencu, Romain Beaumont, Robert Kaczmarczyk, Clayton Mullis,", + "metadata": {} + }, + { + "text": "Aarush Katta, Theo Coombes, Jenia Jitsev, and Aran Komatsuzaki. 2021. Laion-400m: Open dataset of clip- filtered 400 million image-text pairs. arXiv preprint", + "metadata": {} + }, + { + "text": "arXiv:2111.02114 (2021).. [55], 1 = Korakit Seemakhupt, Sihang Liu, and Samira Khan. 2024. EdgeRAG: Online-Indexed RAG for Edge", + "metadata": {} + }, + { + "text": "Devices. arXiv preprint arXiv:2412.21023 (2024).. [56], 1 = Daniel Severo, Giuseppe Ottaviano, Matthew Muckley, Karen Ullrich, and Matthijs Douze.", + "metadata": {} + }, + { + "text": "2025. Lossless Compression of Vector IDs for Approximate Nearest Neighbor Search. arXiv preprint arXiv:2501.10479 (2025).. [57], 1 = Rulin Shao, Jacqueline He, Akari", + "metadata": {} + }, + { + "text": "Asai, Weijia Shi, TimDettmers,Sewon Min, Luke Zettlemoyer, and Pang Wei WKoh. 2024. Scaling retrieval- based language models with a trillion-token datastore. Advances in Neural Information Processing Systems 37 (2024),", + "metadata": {} + }, + { + "text": "91260-91299.. 
[58], 1 = Michael Shen, Muhammad Umar, Kiwan Maeng, G. Edward Suh, and Udit Gupta. 2024. Towards Understanding Systems Trade-offs in Retrieval-Augmented Generation Model", + "metadata": {} + }, + { + "text": "Inference. arXiv:2412.11854 [cs.AR] https://arxiv.org/abs/2412.11854. [59], 1 = Suhas Jayaram Subramanya, Devvrit, Rohan", + "metadata": {} + }, + { + "text": "Kadekodi, Ravishankar Krishaswamy, and Harsha Vardhan Simhadri. 2019. DiskANN: fast accurate billion-point nearest neighbor search on a single node . Curran Associates Inc., Red Hook, NY, USA.. [60],", + "metadata": {} + }, + { + "text": "1 = Kento Tatsuno, Daisuke Miyashita, Taiga Ikeda, Kiyoshi Ishiyama, Kazunari Sumiyoshi, and Jun Deguchi. 2024. AiSAQ: All-in-Storage ANNS with Product Quantization for", + "metadata": {} + }, + { + "text": "DRAM-free Information Re- trieval. arXiv preprint arXiv:2404.06004 (2024). arXiv:2404.06004", + "metadata": {} + }, + { + "text": "https://arxiv.org/abs/2404.06004. [61], 1 = Bing Tian, Haikun Liu, Yuhang Tang, Shihai Xiao, Zhuohui Duan, Xiaofei Liao, Hai Jin,", + "metadata": {} + }, + { + "text": "Xuecang Zhang, Junhua Zhu, and Yu Zhang. 2025. Towards High-throughput and Low-latency Billion-scale Vector Search via CPU/GPU Collaborative Filtering and Re-ranking. In 23rd USENIX Conference on File and Storage Technologies (FAST", + "metadata": {} + }, + { + "text": "25) . USENIX Association, Santa Clara, CA, 171-185. https://www.usenix.org/con. [62], 1 = ference/fast25/presentation/tian-bing Vincent Totino. 2025. Phone", + "metadata": {} + }, + { + "text": "Storage: How Much Do You Really Need? https://www.optimum.com/articles/mobile/choosing-phone- storage-amount-needs-guide. [63], 1 = Vincent Totino. 2025. Phone Storage: How Much Do You Really Need?", + "metadata": {} + }, + { + "text": "https://www.optimum.com/articles/mobile/choosing-phone- storage-amount-needs-guide Accessed May 15, 2025.. [64], 1 = Mengzhao Wang, Weizhi Xu, Xiaomeng Yi, Songlin Wu, Zhangyang Peng,", + "metadata": {} + }, + { + "text": "Xiangyu Ke, Yunjun Gao, Xiaoliang Xu, Rentong Guo, and Charles Xie. 2024. Starling: AnI/O-Efficient Disk-Resident Graph Index Framework for High-Dimensional Vector Similarity Search on Data Segment. In Proceedings of", + "metadata": {} + }, + { + "text": "the ACM on Management of Data (SIGMOD", + "metadata": {} + }, + { + "text": "- [65] Peng Wang, Chen Wang, Xiaofang Lin, Wenjie Zhang, and Qing He. 2021. A Comprehensive Survey and Experimental Comparison of Graph-Based Approximate Nearest Neighbor Search. Proc. VLDB Endow. 14, 11", + "metadata": {} + }, + { + "text": "(2021), 1964-1978. https://doi.org/10.14778/3476249.347 6258", + "metadata": {} + }, + { + "text": "- [67] Zhenliang Xue, Yixin Song, et al. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone. arXiv preprint arXiv:2406.06282", + "metadata": {} + }, + { + "text": "(2024).", + "metadata": {} + }, + { + "text": "- [66] Zijie J Wang and Duen Horng Chau. 2024. MeMemo: On-device Retrieval Augmentation for Private and Personalized Text Generation. In Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval", + "metadata": {} + }, + { + "text": ". 2765-2770.", + "metadata": {} + }, + { + "text": "- [68] Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William W Cohen, Ruslan Salakhutdinov, and Christopher D Manning. 2018. HotpotQA: A dataset for diverse, explainable multi-hop question", + "metadata": {} + }, + { + "text": "answering. 
arXiv preprint arXiv:1809.09600 (2018).", + "metadata": {} + }, + { + "text": "- [70] Weiping Yu, Ningyi Liao, Siqiang Luo, and Junfeng Liu. 2025. RAGDoll: Efficient Offloading-based Online RAG System on a Single GPU. arXiv preprint", + "metadata": {} + }, + { + "text": "arXiv:2504.15302 (2025).", + "metadata": {} + }, + { + "text": "- [69] Hongzhi Yin, Tong Chen, Liang Qu, and Bin Cui. 2024. On-Device Recommender Systems: A Comprehensive Survey. arXiv preprint arXiv:2401.11441", + "metadata": {} + }, + { + "text": "(2024).", + "metadata": {} + }, + { + "text": "- [71] Hamed Zamani, Johanne R Trippas, Jeff Dalton, Filip Radlinski, et al. 2023. Conversational information seeking. Foundations and Trends\u00ae in Information Retrieval 17, 3-4 (2023),", + "metadata": {} + }, + { + "text": "244-456.", + "metadata": {} + }, + { + "text": "- [73] Minjia Zhang, Wenhan Wang, and Yuxiong He. 2020. Learning to Anneal and Prune Proximity Graphs for Similarity Search. In International Conference on Learning Representations (ICLR) . Available at", + "metadata": {} + }, + { + "text": "https://openreview.net/forum?id=HJlXC3EtwB .", + "metadata": {} + }, + { + "text": "- [72] Saber Zerhoudi and Michael Granitzer. 2024. PersonaRAG: Enhancing Retrieval-Augmented Generation Systems with User-Centric Agents. arXiv preprint arXiv:2407.09394", + "metadata": {} + }, + { + "text": "(2024).", + "metadata": {} + }, + { + "text": "- [74] Yanhao Zhang, Pan Pan, Yun Zheng, Kang Zhao, Yingya Zhang, Xiaofeng Ren, and Rong Jin. 2018. Visual search at alibaba. In Proceedings of the 24th ACM SIGKDD international conference on knowledge discovery", + "metadata": {} + }, + { + "text": "& data mining . 993-1001.", + "metadata": {} + }, + { + "text": "- [76] Kan Zhu, Yilong Zhao, Liangyu Zhao, Gefei Zuo, Yile Gu, Dedong Xie, Yufei Gao, Qinyu Xu, Tian Tang, Zihao Ye, et al. 2024.", + "metadata": {} + }, + { + "text": "Nanoflow: Towards optimal large language model serving throughput. arXiv preprint arXiv:2408.12757 (2024).", + "metadata": {} + }, + { + "text": "- [75] Jinhao Zhu, Liana Patel, Matei Zaharia, and Raluca Ada Popa. 2024. Compass: Encrypted Semantic Search with High Accuracy. Cryptology ePrint Archive, Paper 2024/1255.", + "metadata": {} + }, + { + "text": "https://eprint.iacr.org/2024/1255\n- [77] Zilliz AI FAQ. 2025. How much memory overhead is typically introduced by indexes like HNSW or IVF? Accessed May 2025.", + "metadata": {} + }, + { + "text": "Paper #130 (12 pages)", + "metadata": {} + }, + { + "text": "Large language model (LLM) inference workload dominates a wide variety of modern AI applications, ranging from multiturn conversation to document analysis. Balancing fairness and efficiency is critical for managing diverse client workloads with varying prefix patterns. Unfortunately, existing fair scheduling algorithms for LLM serving, such as Virtual Token Counter", + "metadata": {} + }, + { + "text": "(VTC), fail to take prefix caching locality into consideration and thus suffer from poor performance. On the other hand, prefix-aware scheduling algorithms in existing LLM serving frameworks tend to maximize the prefix cache hit rate without considering fair sharing among clients.", + "metadata": {} + }, + { + "text": "This paper introduces the first prefix-aware fair scheduling algorithm, Deficit Longest Prefix Match (DLPM), which can maintain a high degree of prefix locality with a fairness guarantee. 
We also introduce a novel algorithm, DoubleQ, extending DLPM for distributed setup that can find a balance point among fairness, locality,", + "metadata": {} + }, + { + "text": "and load-balancing. Our extensive evaluation demonstrates the superior performance of DLPM and DoubleQ in ensuring fairness while maintaining high throughput (up to 2.87 \u00d7 higher than VTC) and low per-client (up to 7.18 \u00d7 lower than state-of-the-art distributed LLM serving", + "metadata": {} + }, + { + "text": "system) latency.", + "metadata": {} + }, + { + "text": "Online inference workloads for large language models (LLMs) are rapidly becoming widespread, driven by their general-purpose capabilities and versatility across a wide range of tasks such as search engines [2], coding assistant [12], autonomous agents [30,34,48], and tool calling", + "metadata": {} + }, + { + "text": "[35,39]. The release of OpenAI's o1 model has further highlighted the testtime scaling phenomenon [4,8,32,43], where the allocation of increased computational resources during inference via techniques such as Monte Carlo Tree Search (MCTS)", + "metadata": {} + }, + { + "text": "[37,54], Best-of-N sampling [43] and Self-refine [25], can substantially improve the quality of LLM-generated answers across various tasks. The increasingly complex test-time compute re-", + "metadata": {} + }, + { + "text": "Figure 1: DLPM achieves a new Pareto frontier considering locality and fairness in LLM serving. Q is a hyper-parameter in DLPM, indicating how much we relax the fairness bound of DLPM. Results are obtained in a single A10 GPU.", + "metadata": {} + }, + { + "text": "quirements underscore the growing prominence of inference workloads in the LLM landscape.", + "metadata": {} + }, + { + "text": "Despite the advance in LLM generation quality, efficiently scaling online LLM inference services remains challenging, posing substantial barriers to their broad adoption. On the one hand, service providers need to provide isolation between concurrent tasks to ensure stable and predictable performance for all clients [31]: a client's experience should not be", + "metadata": {} + }, + { + "text": "negatively impacted by a dominant or malicious client. On the other hand, service providers want to maximize system efficiency to improve throughput and reduce cost.", + "metadata": {} + }, + { + "text": "Unfortunately, existing scheduling algorithms [21,41,44, 57] for LLM serving fall short of achieving these dual goals effectively, as shown in Fig. 1. Although fair scheduling algorithms such as Virtual Token Counter (VTC) [41], are workconserving", + "metadata": {} + }, + { + "text": "- ensuring the system is fully utilized as long as there are requests in the system - they are not locality-aware. Locality awareness is essential for enhancing memory and computational efficiency, particularly through mechanisms such as prefix sharing [57]. Reusing the prefix's key-value (KV) tensors across multiple requests allows", + "metadata": {} + }, + { + "text": "multiple requests sharing the same prefix to retain only one copy of the prefix's KV tensors in GPU memory. Moreover, it reduces redundant", + "metadata": {} + }, + { + "text": "computation of the prefix's KV tensors. 
Conversely, algorithms such as Longest Prefix Match (LPM) [57] enhance the system efficiency by prioritizing prefix locality: reordering the requests to maximize the prefix cache hit rate, yet they fail to guarantee effective isolation among clients - a malicious client", + "metadata": {} + }, + { + "text": "can monopolize shared resources by sending a large volume of requests with long identical prefix, significantly degrading the performance experienced by other clients.", + "metadata": {} + }, + { + "text": "Achieving both fairness and prefix locality in LLM inference scheduling is challenging, as these two goals inherently conflict with each other. Prefix sharing, for instance, may require reordering requests to group those with identical prefixes together. In contrast, fair scheduling algorithms prioritize serving requests in a specific order to ensure isolation and", + "metadata": {} + }, + { + "text": "prevent any single client from dominating resources. This necessary ordering can interfere with the efficiency gains from prefix sharing, as it restricts the flexibility to reorder requests for optimal resource utilization. This challenge is exacerbated in a distributed setting, where the algorithm must decide not only the order in which the requests are dispatched, but", + "metadata": {} + }, + { + "text": "also to which GPU they are dispatched to achieve load balancing and prefix locality. This dual consideration of dispatch order and location significantly complicates achieving efficient and fair resource allocation across multiple GPUs.", + "metadata": {} + }, + { + "text": "In this paper, we introduce the first prefix-aware fair scheduling algorithm Deficit Longest Prefix Match (DLPM) for LLM serving which relaxes the dispatch order required by VTC to better preserve prefix locality while still bounding the allocation fairness. As illustrated in Fig. 1, DLPM can achieve", + "metadata": {} + }, + { + "text": "throughput comparable to that of LPM while maintaining a degree of fairness close to that provided by VTC. We further propose a novel distributed scheduling algorithm Double Quantum (DoubleQ) that builds on top of DLPM to preserve high per-GPU prefix locality with a global fairness guarantee in a distributed setting.", + "metadata": {} + }, + { + "text": "In summary, this paper makes the following contributions:", + "metadata": {} + }, + { + "text": "- \u00b7 We introduce the first prefix-aware fair scheduling algorithm DLPM and its distributed version DoubleQ for LLM serving, which can achieve up to 2 . 87 \u00d7 higher throughput than VTC and up to 7 . 18 \u00d7 lower latency than the state-ofthe-art locality-aware", + "metadata": {} + }, + { + "text": "scheduling algorithm [44,57].", + "metadata": {} + }, + { + "text": "- \u00b7 We provide rigorous theoretical bounds on DLPM and DoubleQ's fairness property, including service bound and latency bound between various types of clients.\n- \u00b7 We conduct extensive evaluations on our proposed algorithms and demonstrate their superiority in achieving high system throughput while preserving fairness guarantees.", + "metadata": {} + }, + { + "text": "In this section, we first briefly introduce the basics of LLM inference, prefix caching, and fairness in LLM serving (\u00a72.1). We then discuss key issues with existing LLM serving scheduling algorithms and the challenges they pose (\u00a72.2).", + "metadata": {} + }, + { + "text": "LLMInference Modern transformer-based LLM inference consists of prefill and decode phases. 
The prefill phase takes input prompt, computes internal embedding vectors through the attention mechanism [47], and generates the first output token. These embedding vectors are normally stored inside the GPU", + "metadata": {} + }, + { + "text": "memory as the KV cache to avoid recomputation. In the decode phase, new tokens are generated auto-regressively until an End-Of-Sequence (EOS) token is encountered or the pre-defined maximum token length is reached. During each iteration of token generation, the", + "metadata": {} + }, + { + "text": "key-value (KV) cache of all previous tokens will be needed and the key-value tensors of the newly generated token will be appended to the KV cache. Such auto-regressive generation can lead to sub-optimal device utilization and decreased serving throughput [36]. To enhance", + "metadata": {} + }, + { + "text": "GPU utilization, [52] proposed continuous batching . However, limited memory capacity emerged as a critical bottleneck, restricting batch sizes and thus reducing GPU efficiency. To address this issue, [21] developed PagedAttention, which mitigates memory fragmentation inherent in continuous batching", + "metadata": {} + }, + { + "text": "and significantly enhances memory efficiency.", + "metadata": {} + }, + { + "text": "Prefix Caching and Locality To further improve the memory and computation efficiency, SGLang [57] introduced RadixAttention to facilitate the reuse of the KV cache of the shared prefix across multiple different LLM calls. By exploiting the prefix locality, memory usage for", + "metadata": {} + }, + { + "text": "the KV cache is reduced, allowing for larger batch sizes and improved GPU utilization. Additionally, it eliminates redundant computations for the shared KV cache,", + "metadata": {} + }, + { + "text": "This technique is increasingly crucial for emerging multicall LLM workloads such as Tree-of-Thought [51], Skeletonof-thought [29], MCTS [54], and Self-refine [25], where there are substantial opportunities for prefix sharing.", + "metadata": {} + }, + { + "text": "For instance, in a Tree-of-Thought program, all branches originating from the same node share the entire prefix up to the root. As the tree expands, the number of requests sharing the same prefix grows, and as the tree deepens, the length of the shared", + "metadata": {} + }, + { + "text": "prefix increases.", + "metadata": {} + }, + { + "text": "LLMServing Fairness Achieving efficient online LLM inference with Service Level Objective (SLO) guarantees necessitates isolation among different clients [41]. This need arises because clients share the same GPU accelerators and compete for these GPU resources. Without isolation, there is", + "metadata": {} + }, + { + "text": "a risk that one client might monopolize resources, leading to the starvation of others. Moreover, to optimize resource utilization, it is crucial to reallocate unused resources from one client to another rather than merely imposing a rate limit [31] on each client for isolation", + "metadata": {} + }, + { + "text": "purpose. Rate limit simply disallows clients to send requests beyond a certain rate which harms the resource utilization as shown in [41]. Formally, our goal is to achieve the classic max-min fairness [49], where the fair scheduling ensures each client receives at least", + "metadata": {} + }, + { + "text": "1/ n of the resources, with n representing the total number of clients. 
If", + "metadata": {} + }, + { + "text": "Figure 2: Requests from the same client share prefixes with each other. In LPM locality-aware scheduling, the system schedules the GPUs to process all requests from Client 1 to maximize prefix sharing while starving Client 2. In VTC fair scheduling, the system processes", + "metadata": {} + }, + { + "text": "requests in turn to maximize fairness, while ignoring the prefix sharing opportunity. Our DLPM scheduling achieves the best of two worlds through a novel quantum mechanism (\u00a74) to guarantee locality while not sacrificing fairness.", + "metadata": {} + }, + { + "text": "some clients do not fully utilize their allocated share, these resources can be redistributed to others.", + "metadata": {} + }, + { + "text": "VTC [41] proposes the first fair scheduling algorithm called Virtual Token Counter targeting the continuous batching mechanism in online LLM serving. It tracks the tokens serviced for each client as the virtual counter and prioritizes clients with the lowest counters in each batching iteration. By tracking", + "metadata": {} + }, + { + "text": "token-level resource usage, VTC achieves fair scheduling even when the output length of the request is unknown in advance.", + "metadata": {} + }, + { + "text": "Locality vs. Fairness Achieving both strong fairness and high locality for efficient online LLM serving is inherently challenging, since these two are usually at odds with each other, as illustrated in Fig. 2. On the one hand, locality-aware scheduling (Fig.", + "metadata": {} + }, + { + "text": "2a) reorders requests to group those with similar prefixes - often originating from the same client - to the same GPU to optimize for prefix locality. On the other hand, the VTC fair scheduler (Fig. 2b) adheres to a strict order based on per-client", + "metadata": {} + }, + { + "text": "resource usage counters to dispatch requests, ensuring no client continuously dominates the GPU usage; such an order compromises locality as it intersperses the requests of the same client with requests from other clients. Fig. 1 also demonstrates the vastly different prioritizations of these two techniques, highlighting the", + "metadata": {} + }, + { + "text": "trade-off between fairness and prefix locality.", + "metadata": {} + }, + { + "text": "Locality vs. Load-Balancing The challenge intensifies in distributed settings, where model replicas are served on multiple workers, each managed by its own local scheduler, with a global scheduler coordinating all these local workers. In this scenario, the scheduling algorithm on the global scheduler must balance a", + "metadata": {} + }, + { + "text": "trade-off between locality and load balancing.\nFigure 3: This paper addresses the conflict between fairness and locality through the DLPM mechanism (\u00a74). It further addresses the conflict between locality and load balancing in distributed settings with the DoubleQ mechanism (\u00a75).", + "metadata": {} + }, + { + "text": "For instance, simply distributing requests equally across the cluster is suboptimal due to the high prefix recompute overhead. Similarly, always dispatching requests with the same prefix to a single GPU can lead to workload imbalance.", + "metadata": {} + }, + { + "text": "Design Goals The main goal of this paper is to provide a principled way of navigating the trade-off between strong fairness and high locality in online LLM serving, as well as between locality and load-balancing in distributed settings. 
Our methodology ensures that the algorithms for single and distributed settings can", + "metadata": {} + }, + { + "text": "be combined to maintain global fairness effectively. In the remainder of the paper, we begin by discussing preliminary concepts related to fairness in LLM serving (\u00a73), then we introduce our fair scheduling design for a single worker (\u00a74), and finally, we expand this approach to distributed fair", + "metadata": {} + }, + { + "text": "scheduling (\u00a75).", + "metadata": {} + }, + { + "text": "In this section, we first formally define the properties a fair scheduling algorithm needs to meet for LLM serving, following those described in VTC [41]. We then discuss the cost function we adopt for service measurement.", + "metadata": {} + }, + { + "text": "Definition 3.1 (Backlog) . Aclient u is backlogged if dispatching additional requests cannot further increase throughput and can only incur additional queueing delay. In distributed settings, depending on the load-balancing policy, a backlogged client may have requests in queues of certain workers or", + "metadata": {} + }, + { + "text": "all workers.", + "metadata": {} + }, + { + "text": "Fairness Properties Similar to VTC, our goal is to achieve approximate max-min fairness [49] on the service received by each client; different from VTC, we also want to preserve prefix cache locality. More formally, an LLM serving system that can achieve approximate max-min fairness should", + "metadata": {} + }, + { + "text": "satisfy the following three properties [41]:", + "metadata": {} + }, + { + "text": "- 1. During any time interval [ t 1 , t 2 ) , if two clients f and g are continuously backlogged, they should receive a similar level of service, i.e. | Wf ( t 1 , t 2 ) -Wg ( t", + "metadata": {} + }, + { + "text": "1 , t 2 ) | \u2264 \u03b4 , where \u03b4 is a constant value independent of t 2 -t 1.\n", + "metadata": {} + }, + { + "text": "Table 1: The upper half includes notations for service measurement. The lower half includes notations for the DLPM and DoubleQ algorithm and their analysis. *The extend tokens are the input tokens excluding prefix tokens.", + "metadata": {} + }, + { + "text": "W f ( t 1 , t 2 ) n e n q w e w q Q u q i Q w q i , w L input L out put M U, Explanation = service received by f during interval [ t 1 , t 2 ) number of processed extend", + "metadata": {} + }, + { + "text": "tokens* number of processed output tokens weight of extend tokens in the cost function weight of output tokens in the cost function. , Explanation = the quantum assigned to each client in DLPM the deficit counter of client i in DLPM. , Explanation = the quantum assigned to each worker in. , Explanation", + "metadata": {} + }, + { + "text": "= DoubleQ the deficit counter of worker w for client i in DoubleQ. , Explanation = maximum number of input tokens in a request. , Explanation = maximum number of output tokens in a request maximum number of tokens that can be fitted in a running batch. , Explanation = maximum number of counter", + "metadata": {} + }, + { + "text": "that a single request can consume w e \u00b7 L input + w q \u00b7 M. , Explanation = data parallelism degrees. D, Explanation = ", + "metadata": {} + }, + { + "text": "- 2. A client f that is continuously backlogged during a time interval should not receive less service than another client g that is not continuously backlogged during the same time interval, i.e. 
Wg ( t 1 , t 2 ) -Wf ( t 1 ,", + "metadata": {} + }, + { + "text": "t 2 ) \u2264 \u03b4 , where \u03b4 is a constant value.\n- 3. The scheduling policy should be work-conserving: no worker should be idle if there are requests in the queue.", + "metadata": {} + }, + { + "text": "The first property states that a client sending at a high request rate is guaranteed to not receive more than their fair share of service and will not impact other normal-behaved clients. The second property prevents clients from accumulating unused service by first sending at a low request rate and later monopolizing the", + "metadata": {} + }, + { + "text": "system. The third property guarantees that no resources are wasted in order to enforce fairness.", + "metadata": {} + }, + { + "text": "Measurement of Service Another important aspect in designing a fair scheduling algorithm for LLM serving is how the service should be measured. In VTC, the cost function is defined as a weighted sum of the number of input tokens and the number of output tokens. To incorporate the impact of prefix sharing", + "metadata": {} + }, + { + "text": "(i.e., reduced memory and computations), we introduce a slightly different measure. Intuitively, with prefix sharing, the prefix tokens' cost should only be counted once when it is first calculated and stored in the GPU memory. Our prefix-aware version of the cost function is then defined as W (", + "metadata": {} + }, + { + "text": "t 1 , t 2 ) = we \u00b7 ne ( t 1 , t 2 ) + wq \u00b7 nq ( t 1 , t 2 ) . The notations are explained in Tab. 1. Here we and wq are set to be 1", + "metadata": {} + }, + { + "text": "and 2, inspired by OpenAI's pricing for GPT4 1\n1 https://openai.com/api/pricing/", + "metadata": {} + }, + { + "text": "In this section, we present our algorithm DLPM for the single worker in \u00a74.1 and the proved fairness guarantees in \u00a74.2.", + "metadata": {} + }, + { + "text": "In the Longest Prefix Match (LPM) algorithm [57], at each continuous batching step, the scheduler first sorts current requests in the waiting queue based on their matched prefix length and then adds them to the new batch until the memory pool is full. LPM efficiently utilizes memory by", + "metadata": {} + }, + { + "text": "grouping requests that can share a common prefix, thus maximizing the decoding batch size, which in turn leads to better operational intensity and throughput for the decoding phase.", + "metadata": {} + }, + { + "text": "To maintain the cache hit rate while introducing a fairness guarantee, it is essential not to disrupt the LPM order of the requests excessively. To achieve this, we incorporate a quantum mechanism inspired by the deficit round robin (DRR) approach [42]. This mechanism compels the scheduler to", + "metadata": {} + }, + { + "text": "occasionally prioritize requests from less-served clients over those with the longest matching prefixes. Intuitively, this mechanism is effective because it preserves the local ordering inherent to the LPM. As a result, the system continues to benefit significantly from the memory savings brought by the shared prefixes, while", + "metadata": {} + }, + { + "text": "the additional cost of prefix recomputation is incurred only when switching to serve less-served clients. 
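As a rough illustration of the deficit/quantum idea just described (the full procedure is given as Algorithm 1 below), the following Python sketch admits requests in longest-prefix-match order while skipping clients whose deficit counter is exhausted. The `Request` fields, the `DLPMSketch` class, and the `room_fn` memory hook are our own assumptions, not the authors' implementation; the weights w_e = 1 and w_q = 2 follow the cost function above.

```python
# A minimal sketch (not the paper's code) of DLPM's per-step admission logic.
from dataclasses import dataclass, field

@dataclass
class Request:
    client: str
    matched_prefix_len: int   # tokens of this request's prefix already cached
    extend_len: int           # input tokens that still need prefill

@dataclass
class DLPMSketch:
    Q_u: float                                     # service quantum replenished per cycle
    w_e: float = 1.0                               # cost weight per extend token
    w_q: float = 2.0                               # cost weight per output token
    deficit: dict = field(default_factory=dict)    # per-client deficit counter q_i

    def _refill_if_exhausted(self, waiting):
        # Only refill once every client with queued work has a non-positive counter.
        if all(self.deficit.get(r.client, 0.0) <= 0 for r in waiting):
            for c in self.deficit:
                if self.deficit[c] <= 0:
                    self.deficit[c] += self.Q_u

    def build_batch(self, waiting, room_fn):
        """One continuous-batching step: keep LPM order, but respect deficit counters."""
        batch = []
        # Longest matched prefix first, to preserve prefix-cache locality.
        for r in sorted(waiting, key=lambda x: x.matched_prefix_len, reverse=True):
            self.deficit.setdefault(r.client, 0.0)
            if self.deficit[r.client] <= 0:
                self._refill_if_exhausted(waiting)
            if self.deficit[r.client] > 0 and room_fn(batch, r):
                batch.append(r)
                self.deficit[r.client] -= self.w_e * r.extend_len
        return batch  # after each decode step, also subtract w_q per generated token

# Hypothetical usage: admit whatever fits a fixed token budget.
sched = DLPMSketch(Q_u=1000.0)
fits = lambda batch, r: sum(x.extend_len for x in batch) + r.extend_len <= 4096
batch = sched.build_batch([Request("A", 512, 64), Request("B", 0, 128)], fits)
```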
This balanced approach allows DLPM to uphold the core efficiencies of the original LPM algorithm while enhancing fairness across client requests, ensuring that no clients monopolize the batching process to the detriment of others.", + "metadata": {} + }, + { + "text": "The core algorithm of DLPM is presented in Algorithm 1. Initially, the algorithm initializes all clients' deficit counter qi to zero, with Q u representing the service quantum replenished to each client in a cycle. At each continuous batching step, DLPM performs the following steps:", + "metadata": {} + }, + { + "text": "1) It sorts the requests in the waiting queue by their matched prefix length and then tries to add them to the currently running batch ( B ) until the memory pool is full. 2) The request will be added to B if the corresponding client has a positive deficit counter ( qi >", + "metadata": {} + }, + { + "text": "0). Otherwise, the request will be skipped. When all the active clients have q \u2264 0, they will be replenished by Q u at Line 7. 3) After each request is added to B , the corresponding client's deficit counter will deduct the amount of service invoked", + "metadata": {} + }, + { + "text": "by the extend tokens. 4) The new batch B then goes through one model forward step. After each decoding step, the service invoked by the output tokens will be deducted from the client's deficit counter accordingly.", + "metadata": {} + }, + { + "text": "In this section, we provide the theoretical fairness guarantees of DLPM that correspond to the three properties we introduced in \u00a73. The full proofs are provided in Appendix A.1.", + "metadata": {} + }, + { + "text": "- 1: let l denotes the client list 2: let B denotes current running batch 3: function CHECKREFILL( l , Queue ) 4: for all i \u2208 { client ( r ) | r \u2208 Queue } do", + "metadata": {} + }, + { + "text": "5: if qi > 0 then return 6: for all i \u2208 l do 7: if qi \u2264 0 then qi \u2190 qi + Q u 8: end function 9: \u25b7 with monitoring stream: 10:", + "metadata": {} + }, + { + "text": "while True do 11: if new request r from client i arrived then 12: if i / \u2208 l then qi \u2190 0, l \u2190 l + u 13: Queue \u2190 Queue + r 14: \u25b7", + "metadata": {} + }, + { + "text": "with execution stream 1: 15: while True do 16: Queue \u2190 SORTBYPREFIX( Queue ) 17: while not Queue . empty () do 18: for each r \u2208 Queue do 19:", + "metadata": {} + }, + { + "text": "i \u2190 client ( r ) 20: if qi \u2264 0 then CHECKREFILL( l , Queue ) 21: if qi > 0 then 22: if CANADD( r ) then 23: B \u2190", + "metadata": {} + }, + { + "text": "B + r 24: qi \u2190 qi - we \u00b7 extend _ length ( r ) 25: Queue \u2190 Queue - r 26: FORWARDSTEP( B ) 27: qi \u2190 qi - wq \u00b7 |{ r", + "metadata": {} + }, + { + "text": "| client ( r ) = i , r \u2208 B }| 28: B \u2190 filter_finished_requests( B )", + "metadata": {} + }, + { + "text": "Under the DLPM scheme: for any time interval [ t 1 , t 2 ) , if two clients f and g are continuously backlogged. Then the difference in their received service are bounded: | Wf ( t 1 ,", + "metadata": {} + }, + { + "text": "t 2 ) -Wg ( t 1 , t 2 ) | \u2264 2 \u00b7 ( U + Q u ) , where U = we \u00b7 Linput + wq \u00b7 M.", + "metadata": {} + }, + { + "text": "Proof. Let the client with maximum service be f , and the client with minimum service be g . 
Consider t 1 and t 2.", + "metadata": {} + }, + { + "text": "- \u00b7 At t 2, since both clients f and g are backlogged and are in client list l , both client f and client g have been replenished the same k number of times in Line 7 since t 1. f", + "metadata": {} + }, + { + "text": "and g are backlogged, Line 5 ensures that both clients have negative qi before reaching Line 7 and be replenished.", + "metadata": {} + }, + { + "text": "- \u00b7 Since t 1, client f at t 2 has received service Wf ( t 1 , t 2 ) = qf ( t 1 ) + k \u00b7 Q u -qf ( t 2 ) .", + "metadata": {} + }, + { + "text": "client g at t 2 has received service Wg ( t 1 , t 2 ) = qg ( t 1 ) + k \u00b7 Q u -qg ( t 2 ) .", + "metadata": {} + }, + { + "text": "- \u00b7 | Wf ( t 1 , t 2 ) -Wg ( t 1 , t 2 ) | = | qf ( t 1 ) -qf ( t 2 ) -qg ( t", + "metadata": {} + }, + { + "text": "1 ) + qg ( t 2 ) | \u2264 | qf ( t 1 ) -qf ( t 2 ) | + | qg ( t 2 ) -qg ( t 1 ) | \u2264", + "metadata": {} + }, + { + "text": "2 \u00b7 ( U + Q u ) , according to Theorem A.1.\nTheorem 4.2 ( Service bound between backlogged and", + "metadata": {} + }, + { + "text": "non-backlogged clients ) . Under the DLPM scheme: Client f that is continuously backlogged during time interval [ t 1 , t 2 ) should not receive less service than another client, g, that is not continuously backlogged during", + "metadata": {} + }, + { + "text": "the same time interval, that is Wf ( t 1 , t 2 ) \u2265 Wg ( t 1 , t 2 ) -2 U -2 Q u .", + "metadata": {} + }, + { + "text": "Proof. \u00b7 Consider client f and client g . f is continuously backlogged and g is not continuously backlogged.", + "metadata": {} + }, + { + "text": "- \u00b7 If g is not backlogged during the entire duration from t 1 to t 2, Wg ( t 1 , t 2 ) \u2264 U , with no new request arrival.", + "metadata": {} + }, + { + "text": "- \u00b7 Let client f be replenished k t f at time t in Line 7.", + "metadata": {} + }, + { + "text": "- \u00b7 Since f is continuously backlogged from t 1 to t 2, k t 2 f -k t 1 f \u2265 k t 2 g -k t 1 g . A backlogged client will be replenished", + "metadata": {} + }, + { + "text": "for the same time as another backlogged client, from Theorem 4.1. A non-backlogged client will be replenished less as it is not in the active client list (Line 5).", + "metadata": {} + }, + { + "text": "- \u00b7 Wg ( t 1 , t 2 ) -Wf ( t 1 , t 2 ) = ( qg ( t 1 )+ k t 2 g Q u -qg ( t 2", + "metadata": {} + }, + { + "text": ") -k t 1 g Q u ) -( qf ( t 1 )+ k t 2 f Q u -qf ( t 2 )+ k t 1 f Q u ) \u2264 2 ( U +", + "metadata": {} + }, + { + "text": "Q u ) -Q u \u00b7 ( k t 2 f -k t 1 f -k t 2 g + k t 1 g ) \u2264 2 ( U + Q u ) , since Q u \u00b7 ( k t", + "metadata": {} + }, + { + "text": "2 f -k t 1 f -k t 2 g + k t 1 g ) > 0.", + "metadata": {} + }, + { + "text": "The DLPM algorithm is work-conserving since it only manipulates the dispatch order and does not reject a request if it fits into the running batch.", + "metadata": {} + }, + { + "text": "Theorem 4.1 and Theorem 4.2 reflect the first and second properties introduced in \u00a73. Illustrative examples for Theorem 4.1 can be found in Fig. 8 and Fig. 12 in", + "metadata": {} + }, + { + "text": "\u00a77.1, where within any time interval, the difference of the received service of two continuously backlogged clients is bounded.", + "metadata": {} + }, + { + "text": "In this section, we first present the strawman solution of centralized DLPM for distributed scheduling that ignores the scheduling overhead (\u00a75.1). 
We then propose a decentralized DLPM solution that hides this overhead while preserving the fairness property (\u00a75.2).", "metadata": {} }, { "text": "The DLPM algorithm works well when there is no scheduling overhead, so that the DLPM scheduler can immediately make decisions based on the freshest GPU states. Unfortunately, in real-world distributed scenarios, scheduling overhead becomes significant because of concurrent request handling and synchronization, prefix tree traversal", "metadata": {} }, { "text": "and maintenance, and more. Recent work has also shown that the CPU scheduling overhead occupies nearly half of the inference time for two popular LLM inference engines [45].", "metadata": {} }, { "text": "Global-local States Synchronization To enable global DLPM for fair scheduling in distributed setups, we need to synchronize local and global prefix caching information. This", "metadata": {} }, { "text": "(a) Scheduler Overhead Breakdown. The global queue size is 200. Decode batch size is 25.\n(b) Prefix match overhead w.r.t global queue size.", "metadata": {} }, { "text": "Figure 4: Global scheduler overhead breakdown w.r.t data parallelism degree and global queue size. The time for one decode step with bs=25 is also reported for reference. Existing serving engines such as vLLM [21] and SGLang", "metadata": {} }, { "text": "[57] normally perform a continuous batching step after multiple (e.g., 10 in SGLang) decoding steps.", "metadata": {} }, { "text": "synchronization ensures that the global scheduler can replicate the decision-making process typical of a single worker. Using the token RadixTree from SGLang [57] as an example, to construct an accurate global RadixTree at time t i (assume the last", "metadata": {} }, { "text": "time the global scheduler dispatches the requests at time t i -1), updates from each worker s are encapsulated as \u2206 Tree s , defined as:\n\u2206 Tree s = ( N inserted , N evicted , M KV ) ,", "metadata": {} }, { "text": "where N inserted and N evicted are sets of nodes that have been inserted to or evicted from the RadixTree between the last dispatch time t i -1 and the current time t i , and M KV indicates the currently available KV cache memory.", "metadata": {} }, { "text": "Upon sending these updates, the worker enters a blocked state, awaiting new requests from the global scheduler. The global scheduler then updates the RadixTree accordingly and dispatches new requests to the local worker following the DLPM algorithm. Such a synchronous approach guarantees the effectiveness and", "metadata": {} }, { "text": "correctness of DLPM in the distributed setup; however, it incurs significant overhead due to the need to block workers while awaiting new requests and due to race conditions on the global waiting queue across workers.", "metadata": {} }, { "text": "Overhead Analysis The global scheduler's overhead primarily stems from synchronization overhead, algorithmic overhead (e.g., the frequent tree-matching overhead for the global waiting queue), and metadata update overhead. Among these, the metadata update overhead per worker remains relatively constant as the system", "metadata": {} }, { "text": "scales. However, the synchronization and algorithmic overheads increase dramatically as the data parallelism degree ( D ) 2 and the global queue size increase, as shown in Fig. 4. 
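Before the overhead breakdown is examined further, here is a minimal sketch of the per-worker update message ΔTree_s described above; the class and method names are our own illustrative choices (the paper builds on SGLang's RadixTree, whose actual API may differ).

```python
# Sketch (ours, not the paper's code) of the per-worker synchronization payload Delta-Tree_s.
from dataclasses import dataclass, field

@dataclass
class DeltaTree:
    worker_id: int
    inserted_nodes: set = field(default_factory=set)  # N_inserted since the last dispatch at t_{i-1}
    evicted_nodes: set = field(default_factory=set)   # N_evicted since the last dispatch at t_{i-1}
    free_kv_memory: int = 0                           # M_KV: currently available KV-cache memory

def apply_delta(global_tree, delta: DeltaTree) -> None:
    """Replay one worker's delta onto the global scheduler's copy of the RadixTree."""
    for node in delta.inserted_nodes:
        global_tree.insert(node, delta.worker_id)      # assumed RadixTree method
    for node in delta.evicted_nodes:
        global_tree.evict(node, delta.worker_id)       # assumed RadixTree method
```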
The prefix matching process (algorithmic overhead) involves matching all incoming requests in", + "metadata": {} + }, + { + "text": "the global waiting queue against each worker's radix tree and sorting them based on prefix length to determine the dispatch order. The \"Prefix Match\" time (blue) increases significantly\n2 Here data parallelism degree refers to the number of model replicas in the distributed settings.", + "metadata": {} + }, + { + "text": "Update Deficit Counter", + "metadata": {} + }, + { + "text": "Figure 5: An overview of the DoubleQ scheduler. The global scheduler tracks the deficit counters for each client per worker to control the 'stickiness' of a client to a worker. The local schedulers maintain the deficit counters for each client to enforce the fair", + "metadata": {} + }, + { + "text": "sharing of the local GPU resources.\nas the global queue size increases (Fig. 4b), which is normally the case when the data parallelism degree grows.", + "metadata": {} + }, + { + "text": "Overall, Fig. 4 demonstrates how synchronization and algorithmic overheads dominate as the data parallelism degree increases, particularly for higher degrees ( D = 8) - they add to around 40% decoding overhead in the demonstrated case. This analysis underscores the", + "metadata": {} + }, + { + "text": "challenges of designing scalable global schedulers to mitigate synchronization and algorithmic bottlenecks as the system scales.", + "metadata": {} + }, + { + "text": "Besides the significant scheduling overhead, the Global DLPM scheduler also requires extensive modification of the local worker to enable local-global information synchronization and the blocking operation to wait for the global scheduler dispatching requests.", + "metadata": {} + }, + { + "text": "To mitigate the global scheduling overhead and tight coupling between the global scheduler and the local worker, we resort to decentralized scheduling: dispatching the requests directly to local workers and queueing them at the local worker instead of the global scheduler. Most of the existing distributed", + "metadata": {} + }, + { + "text": "schedulers for LLM serving (e.g., Preble [44] and SGLang [56]) follow this design.", + "metadata": {} + }, + { + "text": "In such a decentralized design, the local worker can directly run a fair scheduling algorithm (e.g., DLPM); as long as the global scheduler can balance the per-client service on all the local workers, we could achieve global fairness guarantee [3]. Previous works", + "metadata": {} + }, + { + "text": "in CPU scheduling [1] and wireless LANs bandwidth sharing [3] also demonstrate the effectiveness of such design. Therefore, the challenge now becomes how to strike a good trade-off between load balancing and locality.", + "metadata": {} + }, + { + "text": "Double Quantum (DoubleQ) Our key insight is to prioritize", + "metadata": {} + }, + { + "text": "- then", + "metadata": {} + }, + { + "text": "- 1: let sw denotes the current queue size of worker w . 
2: W \u2190 GETWORKERS ( ), R \u2190 INITRADIXTREE ( | W | ) 3: function SELECTWORKER( G , i ) 4: Gavail \u2190{ w", + "metadata": {} + }, + { + "text": "| qi , w > 0 } 5: while Gavail == / 0 do 6: for all w \u2208 W do qi , w \u2190 qi , w + Q w 7: Gcand \u2190 G \u2229 Gavail 8: if", + "metadata": {} + }, + { + "text": "Gcand == / 0 then return argmin w \u2208 Gavail sw return argmin w \u2208 Gcand sw 9: end function 10: \u25b7 with concurrent stream 1: 11: while True do 12: if new request", + "metadata": {} + }, + { + "text": "r from client i arrived then 13: G \u2190 R.LONGESTMATCHWORKERS ( r ) 14: w \u2190 SELECTWORKER( G , client ( r ) ) 15: DISPATCH( w , r ) 16: qi , w", + "metadata": {} + }, + { + "text": "\u2190 qi , w -we \u00b7 r . input_tokens 17: sw \u2190 sw + 1 18: R.INSERT( r .input_tokens, w ) 19: \u25b7 with concurrent stream 2: 20: while True do", + "metadata": {} + }, + { + "text": "21: if request r from client i has finished at worker w 22: qi , w \u2190 qi , w -wq \u00b7 r .output_tokens 23: sw \u2190 sw -1 24: \u25b7 with concurrent stream 3:", + "metadata": {} + }, + { + "text": "25: while True do 26: if prefix P has been evicted at worker w then 27: R.EVICT( P , w )", + "metadata": {} + }, + { + "text": "locality first until certain limits are met: we use the quantum mechanism again to avoid a client becoming too sticky to a single worker due to the prefix cache locality by assigning quantum to each worker for each client. As demonstrated in Fig. 5 and Algorithm 2, for each", + "metadata": {} + }, + { + "text": "new request, the global scheduler first matches it with the global radix tree and get the workers G that have its longest-matched prefix (Line 13). Then in the SELECTWORKER function (Line 3), if any wokers in G has deficit counter larger than", + "metadata": {} + }, + { + "text": "0 ( Gavail ), the worker with minimum queue size in G \u2229 Gavail will be chosen. Otherwise, the worker with minimum queue size in Gavail will be selected. After each request is dispatched, the request's input tokens will be inserted into the global radix tree and", + "metadata": {} + }, + { + "text": "the corresponding deficit counter will be updated (Line 17). The global scheduler will periodically update corresponding deficit counter when there are requests finished (Line 23) as well as prune the global radix tree with collected local workers' eviction information (Line 27). Note that", + "metadata": {} + }, + { + "text": "unlike the centralized DLPM where the eviction information needed to be passed to the global scheduler synchronously, in DoubleQ this happens asynchronously with negligible overhead.", + "metadata": {} + }, + { + "text": "We note that our DoubleQ scheduling (with local workers running DLPM) 3 provides global fairness guarantees corresponding to the properties introduced in \u00a73 through the following theorems.", + "metadata": {} + }, + { + "text": "At any time interval [ t 1 , t 2 ) , max iWi ( t 1 , t 2 ) -min iWi ( t 1 , t 2 ) \u2264 2 \u00b7 | W | \u00b7 ( U", + "metadata": {} + }, + { + "text": "+ Q u ) . The difference between the maximum service among all backlogged clients and the minimum service among all backlogged clients is bounded by 2 \u00b7 | W | \u00b7 ( U + Q u ) , where | W | is the number", + "metadata": {} + }, + { + "text": "of workers.", + "metadata": {} + }, + { + "text": "Theorem 5.2 ( Service bound between backlogged and non-backlogged clients ) . Consider any execution of the DoubleQ scheme. 
Client f that is continuously backlogged during time interval [ t 1 , t 2 ) should not", + "metadata": {} + }, + { + "text": "receive less service than another client, g, that is not continuously backlogged during the same time interval, where Wg ( t 1 , t 2 ) -Wf ( t 1 , t 2 ) \u2264 2 \u00b7", + "metadata": {} + }, + { + "text": "( U + Q u ) \u00b7 | W | .\nSince there are no requests rejected to enforce fairness, DoubleQ scheduling is work-conserving.", + "metadata": {} + }, + { + "text": "Implementation We implement our DLPM and DoubleQ schedulers with 1000 LoC in Python on top of SGLang [57], a fast industry-standard LLM inference system.", + "metadata": {} + }, + { + "text": "Models and Hardware Our evaluation is conducted on the widely-used model Llama-3.1-8B and Llama-3.2-3BInstruct [10]. Other transformer-based LLMs such as Qwen [50], DeepSeek [7], and Mistral", + "metadata": {} + }, + { + "text": "[18] share a similar backbone architecture and are also compatible with our system. For hardware, we test on NVIDIA A100 80GB and A10G GPUs.\n\nTable 2: Workload configurations.", + "metadata": {} + }, + { + "text": "Long-context QA, Dataset = LooGLE [23]. Long-context QA, Avg Prefix Len. = 21449. Long-context QA, Avg Output Len. = 15. Tree of Thoughts, Dataset = GSM8K [6]. Tree of Thoughts, Avg", + "metadata": {} + }, + { + "text": "Prefix Len. = 546. Tree of Thoughts, Avg Output Len. = 256. LLM-as-a-Judge, Dataset = Synthetic articles [57]. LLM-as-a-Judge, Avg Prefix Len. = 2701.", + "metadata": {} + }, + { + "text": "LLM-as-a-Judge, Avg Output Len. = 256. Real multi-turn, Dataset = Chatbot Arena [56]. Real multi-turn, Avg Prefix Len. = 56. Real multi-turn, Avg Output Len. = 142", + "metadata": {} + }, + { + "text": "Workloads and Datasets We evaluate the efficiency and effectiveness of the schedulers on 4 diverse LLM-based workloads, each characterized by its unique execution graph structures (Fig. 6) and variations in prefix and output length distributions. as detailed in Tab. 2. Specifically, we", + "metadata": {} + }, + { + "text": "evaluate long document understanding using the LooGLE [23] dataset. We implement the Tree-of-Thought [51] program for solving GSM8K [6] problems (with a tree height of 4), and the", + "metadata": {} + }, + { + "text": "3 Generally, in DoubleQ, the local worker can run any other fair scheduling algorithms such as VTC. In this paper, DoubleQ specifically refers to the implementation using DLPM at the local workers.", + "metadata": {} + }, + { + "text": "LLM-as-a-Judge [57] program, which utilizes the branchsolve-merge technique to evaluate synthetic articles. We also conduct experiments on real-world multi-turn conversation traces from Chatbot Arena [56].\n(a) Long-Context QA\n(b) LLM-as-a-Judge", + "metadata": {} + }, + { + "text": "(c) Tree of Thoughts\n(d) Multi-Turn Chat\nFigure 6: Illustration of the execution graphs of different workloads in our benchmark.", + "metadata": {} + }, + { + "text": "Synthetic Traces For Long-context QA, Tree-of-Thoughts, and LLM-as-a-Judge, we generate synthetic client request traces following the Gamma process, as done in [24,40,41], with the request rate increasing as the number of GPUs scales.", + "metadata": {} + }, + { + "text": "For these three workloads, we evaluate two distinct types of misbehaving patterns, as detailed in Tab. 3. The first type (S1) involves a misbehaving client sending more requests than well-behaved clients. 
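As a rough illustration of this trace-generation step, the snippet below draws Gamma-distributed inter-arrival gaps for one client given a target request rate and a coefficient of variation (cv); the concrete rate and burstiness values used in the experiments are not reproduced here, so the numbers are placeholders.

```python
# Sketch of a Gamma-process arrival generator for one synthetic client.
import numpy as np


def gamma_arrivals(request_rate: float, cv: float, num_requests: int,
                   seed: int = 0) -> np.ndarray:
    """Return absolute arrival timestamps (seconds) for one client."""
    rng = np.random.default_rng(seed)
    shape = 1.0 / (cv ** 2)            # Gamma shape k
    scale = cv ** 2 / request_rate     # Gamma scale theta, so mean gap = 1 / request_rate
    gaps = rng.gamma(shape, scale, size=num_requests)
    return np.cumsum(gaps)


# Example: 2 requests/s with moderate burstiness for a 100-request trace.
timestamps = gamma_arrivals(request_rate=2.0, cv=2.0, num_requests=100)
```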
Specifically, although all clients send programs at the same request", + "metadata": {} + }, + { + "text": "rate, the misbehaving client submits programs with a more complex execution graph (e.g., more branches in Tree-of-thought). The second type (S2) features a misbehaving client sending programs with the same structural complexity and at the same request rate as wellbehaved clients,", + "metadata": {} + }, + { + "text": "but with the input altered to increase the prefix length. These workloads are evaluated with the Llama3.1-8B model served on NVIDIA A100 GPUs. The related results are reported in \u00a76.2.\n", + "metadata": {} + }, + { + "text": "Table 3: Synthetic Workload Configurations. /thumbs_down_alt stands for misbehaving client and /thumbs_up_alt denotes well-behaved clients.", + "metadata": {} + }, + { + "text": "S1: More Requests Long-context QA Tree-of-thought LLM-as-Judge, Detailed Behavior = /thumbs_down_alt : Higher req rate /thumbs_down_alt : Trees of 4 branches (340 req per tree) /thumbs_up_alt : Trees of 2 branches", + "metadata": {} + }, + { + "text": "(30 req per tree) /thumbs_down_alt : Evaluation with 16 dimensions /thumbs_up_alt : Evaluation with 2 dimensions", + "metadata": {} + }, + { + "text": "Real-world Traces For real-world multi-turn conversation, we re-scale the request time stamps provided in the dataset 4 and aggregate multiple clients' requests to closely mimic highdemand scenarios. This workload is evaluated with the Llama-", + "metadata": {} + }, + { + "text": "4 https://huggingface.co/datasets/lmsys/chatbot_arena_ conversations\n3.2-3B-Instruct model served on NVIDIA A10G GPUs. The related results are reported in \u00a76.3.", + "metadata": {} + }, + { + "text": "Baselines We compare DLPM and DoubleQ with three baseline scheduling algorithms.\n- \u00b7 DoubleQ : The local worker adopts DLPM, and the global scheduler runs the DoubleQ algorithm when Data Parallelism Degree D > 1.", + "metadata": {} + }, + { + "text": "- \u00b7 RR + LPM : The local scheduler runs LPM, and the global scheduler uses the round-robin (RR) algorithm when D > 1. It is the default distributed scheduling algorithm in SGLang [57] without fairness guarantees.", + "metadata": {} + }, + { + "text": "- \u00b7 Preble [44]: Preble is a state-of-the-art distributed LLM serving system that aims to provide high serving throughput by balancing load distribution and locality, yet without fairness guarantee. Specifically, it dispatches requests based on a pre-defined prefix-matching ratio to decide whether to", + "metadata": {} + }, + { + "text": "explore a new GPU or exploit locality.", + "metadata": {} + }, + { + "text": "- \u00b7 VTC : The local scheduler runs VTC [41], and the global scheduler applies a per-client round-robin strategy when D > 1. Extending VTC with round-robin scheduling is the straightforward approach to ensuring fairness in distributed settings, with fairness bound proven in Appendix", + "metadata": {} + }, + { + "text": "A.3.\nMetrics To measure the system efficiency and fairness achieved by different scheduling algorithms, we use the following three metrics:", + "metadata": {} + }, + { + "text": "- \u00b7 Service Rate : We measure the clients' service as a weighted sum of the number of input tokens and the number of output tokens, following VTC [41] 5 . 
As discussed in \u00a73, the weight for input token is 1 and the weight for output token is", + "metadata": {} + }, + { + "text": "2.\n- \u00b7 Jain's Fairness Index [17] is a widely-used metric for evaluating the fairness of resource allocation in networked systems [22]. The index is mathematically defined as:\n", + "metadata": {} + }, + { + "text": "where xi represents the allocation for the i th client, and n is the total number of clients. The value of J ranges from 1 n (minimum fairness, when one client monopolizes all resources) to 1 (maximum fairness, when resources are equally distributed). In our context, we", + "metadata": {} + }, + { + "text": "compute the Jain's Fairness Index by letting xi denote the service rate of client i . The calculation is based on the time interval during which all clients are active, ensuring an accurate representation of fairness across the system.", + "metadata": {} + }, + { + "text": "- \u00b7 P50 and P99 Latency : We assess the scheduler's effectiveness in maintaining service quality for well-behaved clients by measuring their P50 and P99 latency. We measure latency using the end-to-end completion time for program evaluation. We use the TTFT", + "metadata": {} + }, + { + "text": "(Time to First Token) latencyc 6 metric for long-context QA tasks.\n5 Note that this service rate is from clients' perspective. From the system's perspective, the actual service is measured by the cost function using the number of extend tokens.", + "metadata": {} + }, + { + "text": "6 For QA tasks, a shorter TTFT contributes to improved client experiences.", + "metadata": {} + }, + { + "text": "Figure 7: Summary of results across three datasets and two types of misbehaving clients on up to 8 A100 GPUs (8B model). The reported latency represents the average latency for well-behaved clients. The data point for S2@Tree of Thoughts with D", + "metadata": {} + }, + { + "text": "= 8 is omitted, as it takes too long to complete.", + "metadata": {} + }, + { + "text": "We present all three metrics across three workloads and two types of misbehaving clients in Fig. 7. Both VTC and DoubleQ provide theoretical fairness guarantees, whereas Preble and RR + LPM do not. The data point for Preble with D =", + "metadata": {} + }, + { + "text": "1 is omitted because Preble is designed as a multi-GPU cacheaware prompt dispatch system.", + "metadata": {} + }, + { + "text": "compared to D = 4 can be attributed to the complex sharing patterns inherent in Tree of Thoughts. Round Robin fails to preserve locality among GPUs, leading to a significant drop in cache hit rate (i.e., from 95% to 50%). This limitation", + "metadata": {} + }, + { + "text": "indicates that RR + LPM does not scale effectively when clients submit complex LLM programs.", + "metadata": {} + }, + { + "text": "Throughput Analysis As previously discussed, ensuring fairness inherently competes with maximizing throughput. However, DoubleQ's effective global cache-aware scheduler and DLPM enable significant performance gains, achieving up to a 2.87 \u00d7 improvement compared to the only other fair algorithm,", + "metadata": {} + }, + { + "text": "VTC.\nDoubleQ achieves better throughput than RR + LPM, with improvements of up to 2.22 \u00d7 . 
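To make the first two metrics concrete, here is a minimal sketch using the weights stated above (1 per input token, 2 per output token) and the standard Jain's index J = (\u2211_i x_i)^2 / (n \u00b7 \u2211_i x_i^2); the helper names are ours, not part of the system.

```python
# Minimal sketch of the service-rate and fairness metrics described above.
def service(input_tokens: int, output_tokens: int) -> float:
    """Client-perceived service: weighted sum of input and output tokens."""
    return 1.0 * input_tokens + 2.0 * output_tokens


def jains_index(x: list[float]) -> float:
    """Jain's fairness index: J = (sum x_i)^2 / (n * sum x_i^2)."""
    n = len(x)
    return sum(x) ** 2 / (n * sum(v * v for v in x))


# Example: three clients' service rates (tokens/s) over a 60-second window.
rates = [service(1200, 150) / 60, service(1100, 160) / 60, service(300, 40) / 60]
print(jains_index(rates))  # 1.0 = perfectly equal shares; 1/n = one client monopolizes
```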
In the case of S2@Tree of Thoughts, the poor performance of RR + LPM with D = 8", + "metadata": {} + }, + { + "text": "Compared to Preble, DoubleQ consistently matches or exceeds its performance across all workloads and GPU configurations, demonstrating its ability to sustain high throughput while ensuring fairness. An exception arises for S2@Longcontext QA with D = 8, where Preble outperforms", + "metadata": {} + }, + { + "text": "DoubleQ in throughput. This discrepancy occurs because DoubleQ sacrifices some locality to maintain fairness, resulting in increased prefix recompute overhead. As indicated in Tab. 2, the LooGLE dataset features an exceptionally high prefix lengthto-output length ratio. In this case, the", + "metadata": {} + }, + { + "text": "cost of recomputing long documents becomes substantial, with the prefill stage", + "metadata": {} + }, + { + "text": "significantly dominating the generation time. Consequently, the Long-context QA workload serves as a worst-case scenario that adversely impacts DoubleQ's throughput. However, when the prefix length-to-output length ratio falls within a reasonable range, the DoubleQ algorithm consistently matches and even slightly surpasses", + "metadata": {} + }, + { + "text": "the performance of state-of-the-art non-fair scheduling algorithms. This is achieved through the careful management of load balance and locality trade-offs in the global scheduler, as well as locality and fairness trade-offs in the local scheduler.", + "metadata": {} + }, + { + "text": "Jain's Fairness Index Analysis From the second column in Fig. 7, it is clear that DoubleQ consistently outperforms both Preble and RR + LPM. This is because DoubleQ provides strict fairness guarantees. However, it is slightly less fair than", + "metadata": {} + }, + { + "text": "VTC, as DoubleQ relaxes the fairness bounds to improve locality, which leads to higher throughput but slightly worse fairness control. Preble performs slightly better than RR + LPM due to its multi-level priority wait queue, which avoids starvation but cannot provide isolation and strict fairness", + "metadata": {} + }, + { + "text": "guarantee. As a result, there remains a notable gap between Preble and DoubleQ.", + "metadata": {} + }, + { + "text": "Well-behaved Clients' Latency Analysis We use the average P50 and P99 latency of well-behaved clients to evaluate the experience of well-behaved clients when a misbehaving client is present. Algorithms focusing on high system efficiency might inadvertently increase", + "metadata": {} + }, + { + "text": "latency for well-behaved clients as these schedulers may prioritize the requests from the misbehaving clients to optimize the prefix cache hit rate. Preble and RR + LPM, therefore, can result in up to 7 . 18 \u00d7 and 9", + "metadata": {} + }, + { + "text": ". 55 \u00d7 higher latency, respectively, compared to DoubleQ. On average, DoubleQ achieves 2 . 90 \u00d7 and 4 . 06 \u00d7 lower latency than Preble and RR + LPM. On the other hand, algorithms that focus", + "metadata": {} + }, + { + "text": "solely on fairness will also incur high latency for well-behaved clients due to reduced overall system efficiency. For instance, VTC can lead to latency up to 7 . 96 \u00d7 higher than DoubleQ, with an average latency increase of 2 .", + "metadata": {} + }, + { + "text": "98 \u00d7 .", + "metadata": {} + }, + { + "text": "Figure 8 demonstrates the fairness and performance comparison of different schedulers on the real-world multi-turn conversation workload. 
In this workload, clients 2 and 3 initially send excessive number of requests, and then return to normal midway. A fair scheduler should prevent these two clients", + "metadata": {} + }, + { + "text": "from impacting other clients. Due to LPM's prioritization strategy, which favors requests with longer prefix matches, Clients 2 and 3 receive a disproportionately large share of resources. As a result, Clients 0 and 1 suffer from high response times and reduced", + "metadata": {} + }, + { + "text": "throughput. In contrast, VTC achieves relatively low response times and maintains high throughput for Clients 0 and 1. However, such strict fair allocation comes at the expense of Clients 2 and 3, who endure substantial response delays, reaching up to", + "metadata": {} + }, + { + "text": "80 seconds.\nDLPM achieves a more reasonable distribution of re-", + "metadata": {} + }, + { + "text": "Figure 8: Fairness and performance visualization for the real-world multi-turn conversation workload ( D = 2). Clients 2 and 3 send requests at a much higher rate than Clients 0 and 1.", + "metadata": {} + }, + { + "text": "sources, protecting well-behaved clients from the disruptive effects of high request rates by the misbehaving clients. DLPM ensures consistently low response time and high throughput for both well-behaved and previously misbehaving clients. Thus, DLPM not only mitigates", + "metadata": {} + }, + { + "text": "the impact of malicious usage patterns but also improves overall system performance and fairness compared to the baseline approaches.", + "metadata": {} + }, + { + "text": "We visualize the response time and the services provided by the server to different clients over time in Fig. 9. The experiments use 4 A10G GPUs as the testbed, with all clients sending Tree-of-Thought programs at the same rate and with a consistent", + "metadata": {} + }, + { + "text": "branch count of 3. However, client 0 is misbehaving by sending a longer prefix, i.e. 10 \u00d7 longer than well-behaved clients. The maximum value on the x-axis represents the end-to-end completion time of all programs. As", + "metadata": {} + }, + { + "text": "observed, DoubleQ achieves the shortest execution time, demonstrating up to 2 \u00d7 speedup compared to VTC and Preble.", + "metadata": {} + }, + { + "text": "From the first row of the figure, we observe that DoubleQ consistently maintains lower response times compared to VTC as it preserves a higher degree of locality, which enhances overall system efficiency. Furthermore, it avoids the excessively high response time caused by schedulers like RR + LPM", + "metadata": {} + }, + { + "text": "and Preble, which lack fairness control. These schedulers tend to prioritize serving client 0, resulting in substantial delays for other clients. For instance, as shown in the figure, the service received by clients 1 and 2 between 100s and", + "metadata": {} + }, + { + "text": "700s is almost zero for Preble, causing a queuing latency of up to 600 seconds.", + "metadata": {} + }, + { + "text": "The second and third rows depict the actual service and the service received by each client, respectively. As shown in the second row of Fig. 9, both VTC and DoubleQ achieve an ideal sharing of resources across the 4 GPUs in terms of actual service. 
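The distinction between client-perceived service and the "actual service" plotted here can be sketched as follows: with prefix caching, only the non-cached extend tokens are prefilled, so a client with long shared prefixes is cheap for the system even when its perceived service is large. The helpers below are illustrative and assume the same 1/2 weights as above.

```python
# Illustrative contrast between perceived service and actual (extend-token) service.
def perceived_service(input_tokens: int, output_tokens: int) -> float:
    return 1.0 * input_tokens + 2.0 * output_tokens


def actual_service(input_tokens: int, cached_prefix_tokens: int,
                   output_tokens: int) -> float:
    extend_tokens = input_tokens - cached_prefix_tokens  # only non-cached tokens are prefilled
    return 1.0 * extend_tokens + 2.0 * output_tokens


# A client with a long shared prefix looks well served from its own perspective
# but costs the system far less per request:
print(perceived_service(5000, 256), actual_service(5000, 4500, 256))  # 5512.0 1012.0
print(perceived_service(800, 256), actual_service(800, 0, 256))       # 1312.0 1312.0
```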
In", + "metadata": {} + }, + { + "text": "the third row, we can observe that the", + "metadata": {} + }, + { + "text": "Figure 9: Fairness and performance visualization of different schedulers on Tree-of-Thought workloads with D = 4 (3B model + 4 A10G GPUs). The maximum value on the X-axis represents the end-to-end completion time for each", + "metadata": {} + }, + { + "text": "scheduler. The actual service is calculated using the cost function defined in \u00a73, which is a weighted sum of the number of extend tokens and the number of output tokens.", + "metadata": {} + }, + { + "text": "service rate of client 0 is higher than clients 1 and 2 - this is because client 0 has longer prefix sharing and thus lower cost per token. However, due to the relatively low cache hit rate of VTC, it experiences worse end-to-end performance.", + "metadata": {} + }, + { + "text": "In contrast, the other two algorithms demonstrate significant unfairness in resource allocation across clients.", + "metadata": {} + }, + { + "text": "A key highlight here is the extremely low throughput observed with Preble. Preble prioritizes dispatching requests to the GPU with the longest prefix-matching length, provided the matching length exceeds a predefined threshold. Between 300 and 600 seconds, client", + "metadata": {} + }, + { + "text": "0's requests are continuously dispatched to a single GPU as the prefix-matching ratio will always exceed the pre-defined threshold. Some requests from clients 1 and 2 get queued at this monopolized GPU, which blocks these clients from generating new requests (i.e.,", + "metadata": {} + }, + { + "text": "\"deeper\" thoughts), due to the inherent LLM call dependencies in the Tree-of-thought programs. This results in severe workload imbalance among the GPUs, with the cluster at merely 1/4 of its potential computational capacity.", + "metadata": {} + }, + { + "text": "We now examine the trade-off between locality and fairness using Q u and Q w . The impact of Q u is illustrated in Fig. 1, where increasing Q u enhances throughput but compromises fairness control. By adjusting the value of Q", + "metadata": {} + }, + { + "text": "u , the server can achieve a tailored trade-off between performance and fairness, defining a new Pareto frontier compared to VTC and LPM.", + "metadata": {} + }, + { + "text": "Fig. 10 illustrates the impact of Q w on throughput in DoubleQ. To recap, Q w represents the quantum of service assigned to each worker in DoubleQ, where a larger Q w typically implies a better locality for requests", + "metadata": {} + }, + { + "text": "within a client. As shown in\n(a) Throughput of Tree-ofThoughts with one misbehaving client (b) Throughput of Tree-ofThoughts with all well-behaved clients", + "metadata": {} + }, + { + "text": "Figure 10: Impact of Q w on throughput under different workloads ( D = 4). The solid line represents throughput, while the dashed line represents Jain's Index. The fairness index in (b) is omitted as it", + "metadata": {} + }, + { + "text": "consistently equals 1.", + "metadata": {} + }, + { + "text": "Fig. 10, as Q w increases, the throughput of DoubleQ also increases, eventually stabilizing and surpassing all other schedulers. The low throughput of Preble, as seen in Fig. 10a,", + "metadata": {} + }, + { + "text": "has been explained earlier in \u00a77.1.", + "metadata": {} + }, + { + "text": "Although Q w is not explicitly included in the fairness bound of DoubleQ as demonstrated in Appendix A.2, it does slightly affect Jain's Fairness Index. 
Specifically, the index decreases from 0.855 to", + "metadata": {} + }, + { + "text": "0.83 when Q w increases from 2000 to 40000, due to the more unbalanced dispatching of requests within a client 7 .", + "metadata": {} + }, + { + "text": "We assess DLPM's performance as we increase the number of clients from 5 to 50, using a single A10 GPU as the testbed, while maintaining a constant total request rate. As depicted in Fig. 11, DLPM consistently achieves a", + "metadata": {} + }, + { + "text": "service rate comparable to LPM, even as the number of clients increases, whereas VTC consistently underperforms.\nFigure 11: Service rate w.r.t the number of clients on a single A10 GPU (3B model).", + "metadata": {} + }, + { + "text": "Note that as the number of clients rises, the number of distinct prefixes in the same volume of requests increases, which marginally reduces the cache hit rate for both DLPM and LPM, leading to a slight decrease in service rate as the number of clients increases.", + "metadata": {} + }, + { + "text": "7 When Q w is set to infinity, the algorithm is reduced to be similar as Preble, which lacks fairness guarantees since the difference in load across workers becomes unbounded, as proven in Theorem A.5", + "metadata": {} + }, + { + "text": "In contrast, VTC's performance is less affected since its cache hit rate is consistently low regardless of the number of clients.", + "metadata": {} + }, + { + "text": "As a complement to the single-workload scenario discussed earlier, we now explore a more realistic setting where clients handle diverse workloads. As shown in Fig. 12, DLPM consistently achieves better response time and end-to-end execution times compared to the other schedulers. In", + "metadata": {} + }, + { + "text": "the LPM scheduler, clients sending Tree-of-Thoughts programs act as misbehaving clients, significantly increasing the response time for other clients. From the second row, we observe that VTC exhibits better fairness control than DLPM, as it provides more evenly distributed actual service across", + "metadata": {} + }, + { + "text": "clients. This demonstrates that DLPM sacrifices some degree of fairness to achieve higher throughput.", + "metadata": {} + }, + { + "text": "Figure 12: Mix of workloads among four clients: two engage in multi-turn conversations, while the other two send different programs, all within a single-GPU setup (3B model + an A10G GPU).", + "metadata": {} + }, + { + "text": "Fairness in ML Workloads ML training workloads have extensively studied the fairness problems in shared clusters [5, 26, 27, 38]. Due to their unique characteristics such as long running time, placement sensitivity, and statistical efficiency (i.e., the amount of progress per", + "metadata": {} + }, + { + "text": "unit of data consumed), traditional fair scheduling for big data workloads [15,16] does not work well. To handle the long-running and placementsensitive natures of ML training workloads, Themsis [26] proposes new finish-time fairness metrics, and leverages", + "metadata": {} + }, + { + "text": "multiround partial allocation auctions to provide Pareto-efficient and envy-free resource allocations. To consider statistical efficiency for higher cluster-wide resource utilization, Pollux [38] introduces goodput-driven cluster scheduling by jointly optimizing resource allocations and job batch sizes. On the other hand, prior work VTC and", + "metadata": {} + }, + { + "text": "our work focus on the LLM inference-time fairness. 
Compared to VTC, our work co-optimizes both fairness and prefix sharing for higher performance without losing fairness.", + "metadata": {} + }, + { + "text": "Fairness in Other Workloads Fairness is a long-existing topic in networking and operating systems. For example, networking needs to guarantee fairness among different switching ports [42] and during link bandwidth allocation [9, 13, 14,19,33]; OS scheduling needs", + "metadata": {} + }, + { + "text": "to guarantee fair CPU time share among different processes [20, 46], and fair memory allocations [28]. Fairness is also extensively studied in big data workload scheduling with prominent prior work of Delay Scheduling [53] and Dominant Resource Fairness [11]. Our fair", + "metadata": {} + }, + { + "text": "scheduling design is inspired by many prior work such as Deficit Round Robin [42] and Delay Scheduling [53]; but differently, we explicitly optimize for the prefix sharing property in LLM inference workloads while guaranteeing fairness.", + "metadata": {} + }, + { + "text": "Locality in LLM Inference Previous advances in LLM inference focus on batching and memory optimization [21,52]. SGLang further exploits locality in scheduling to improve LLM inference performance for emerging applications such as multi-turn chatting [57]. It leverages the LPM scheduling with", + "metadata": {} + }, + { + "text": "RadixTree to save GPU memory and avoid redundant computations through prefix sharing. Preble [44] further extends LPM into distributed settings to jointly optimize load balancing and prefix caching locality for high throughput. BlendServe [55] co-optimizes GPU resource overlapping and prefix sharing for offline", + "metadata": {} + }, + { + "text": "LLM inference, achieving nearly optimal inference throughput. Unlike the above work which only focuses on inference throughput, our work presents a principled way of navigating the trade-off between performance and fairness in multi-client scenarios.", + "metadata": {} + }, + { + "text": "This paper introduces the first prefix-aware fair scheduling algorithm for LLM serving, namely, DLPM. We also propose an extension of the algorithm, DoubleQ, to preserve locality with global fairness guarantee in a distributed setup. Our algorithm achieves up to 2.87 \u00d7 higher throughput than", + "metadata": {} + }, + { + "text": "stateof-the-art fair scheduling algorithms in LLM like VTC, and 7.18 \u00d7 lower latency for victim clients compared to localityaware scheduling algorithms like Preble.", + "metadata": {} + }, + { + "text": "- [1] Linux 2.6.23. Completely fair scheduler. https://docs.kernel.org/scheduler/ sched-design-CFS.html .\n- [2] Perplexity AI. Perplexity: Conversational Search Assistant. https://www.perplexity.ai .", + "metadata": {} + }, + { + "text": "- [3] Yigal Bejerano, Seung-Jae Han, and Li Li. Fairness and load balancing in wireless lans using association control. In Proceedings of the 10th annual international conference on Mobile computing and networking , pages 315-329,", + "metadata": {} + }, + { + "text": "2004.\n- [4] Bradley Brown, Jordan Juravsky, Ryan Ehrlich, Ronald Clark, Quoc V Le, Christopher R\u00e9, and Azalia Mirhoseini. Large language monkeys: Scaling inference compute with repeated sampling. arXiv preprint", + "metadata": {} + }, + { + "text": "arXiv:2407.21787 , 2024.", + "metadata": {} + }, + { + "text": "- [5] Shubham Chaudhary, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, and Srinidhi Viswanatha. 
Balancing efficiency and fairness in heterogeneous gpu clusters for deep learning. In Proceedings of the Fifteenth European", + "metadata": {} + }, + { + "text": "Conference on Computer Systems , pages 116, 2020.", + "metadata": {} + }, + { + "text": "- [6] Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al. Training verifiers to solve math word problems.", + "metadata": {} + }, + { + "text": "arXiv preprint arXiv:2110.14168 , 2021.", + "metadata": {} + }, + { + "text": "- [7] Damai Dai, Chengqi Deng, Chenggang Zhao, R. X. Xu, Huazuo Gao, Deli Chen, Jiashi Li, Wangding Zeng, Xingkai Yu, Y. Wu, Zhenda Xie, Y. K. Li,", + "metadata": {} + }, + { + "text": "Panpan Huang, Fuli Luo, Chong Ruan, Zhifang Sui, and Wenfeng Liang. Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. CoRR , abs/2401.06066,", + "metadata": {} + }, + { + "text": "2024.\n- [8] DeepSeek. Deepseek-r1-lite-preview release. https: //api-docs.deepseek.com/news/news1120 , 2024. Accessed: 2024-11-20.", + "metadata": {} + }, + { + "text": "- [9] Alan J. Demers, Srinivasan Keshav, and Scott Shenker. Analysis and simulation of a fair queueing algorithm. In Lawrence H. Landweber, editor, ACM Symposium on Communications Architectures & Protocols (SIGCOMM) , pages", + "metadata": {} + }, + { + "text": "1-12. ACM, 1989.", + "metadata": {} + }, + { + "text": "- [10] Abhimanyu Dubey, Abhinav Jauhri, Abhinav Pandey, Abhishek Kadian, Ahmad Al-Dahle, Aiesha Letman, Akhil Mathur, Alan Schelten, Amy Yang, Angela Fan, et", + "metadata": {} + }, + { + "text": "al. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 , 2024.", + "metadata": {} + }, + { + "text": "- [11] Ali Ghodsi, Matei Zaharia, Benjamin Hindman, Andy Konwinski, Scott Shenker, and Ion Stoica. Dominant resource fairness: fair allocation of multiple resource types. In Proceedings of Networks and Systems Design and Implementation (NSDI) ,", + "metadata": {} + }, + { + "text": "2011.\n- [12] Github. Github copilot: Your ai pair programmer. https://github.com/features/copilot .", + "metadata": {} + }, + { + "text": "- [13] S. Jamaloddin Golestani. A self-clocked fair queueing scheme for broadband applications. In Proceedings IEEE INFOCOM '94, The Conference on Computer Communications, Thirteenth Annual Joint Conference of the IEEE Computer and Communications Societies, Networking for Global Communications , pages", + "metadata": {} + }, + { + "text": "636-646. IEEE Computer Society, 1994.", + "metadata": {} + }, + { + "text": "- [14] Pawan Goyal, Harrick M. Vin, and Haichen Cheng. Start-time fair queueing: A scheduling algorithm for integrated services packet switching networks. In Conference on Applications, Technologies, Architectures, and Protocols for Computer Communication (SIGCOMM) , pages", + "metadata": {} + }, + { + "text": "157-168. ACM, 1996.", + "metadata": {} + }, + { + "text": "- [15] Robert Grandl, Mosharaf Chowdhury, Aditya Akella, and Ganesh Ananthanarayanan. Altruistic scheduling in Multi-Resource clusters. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI", + "metadata": {} + }, + { + "text": "16) , pages 65-80, Savannah, GA, November 2016. USENIX Association.", + "metadata": {} + }, + { + "text": "- [16] Michael Isard, Vijayan Prabhakaran, Jon Currey, Udi Wieder, Kunal Talwar, and Andrew Goldberg. Quincy: Fair scheduling for distributed computing clusters. 
In ACM Symposium on Operating Systems Principles (SOSP) , page", + "metadata": {} + }, + { + "text": "261-276. Association for Computing Machinery, 2009.", + "metadata": {} + }, + { + "text": "- [17] Rajendra K Jain, Dah-Ming W Chiu, William R Hawe, et al. A quantitative measure of fairness and discrimination. Eastern Research Laboratory, Digital Equipment Corporation, Hudson, MA , 21:1, 1984.", + "metadata": {} + }, + { + "text": "- [18] Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. Mistral", + "metadata": {} + }, + { + "text": "7b. arXiv preprint arXiv:2310.06825 , 2023.", + "metadata": {} + }, + { + "text": "- [19] Wei Jin, Jeffrey S. Chase, and Jasleen Kaur. Interposed proportional sharing for a storage service utility. In International Conference on Measurements and Modeling of Computer Systems (SIGMETRICS) , pages 37-48. ACM, 2004.", + "metadata": {} + }, + { + "text": "- [20] The kernel development community. CFS Scheduler. https://docs.kernel.org/scheduler/ sched-design-CFS.html .", + "metadata": {} + }, + { + "text": "- [21] Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph Gonzalez, Hao Zhang, and Ion Stoica. Efficient memory management for large language model serving with paged attention. In Proceedings of the", + "metadata": {} + }, + { + "text": "29th Symposium on Operating Systems Principles , pages 611-626, 2023.", + "metadata": {} + }, + { + "text": "- [22] Tian Lan, David Kao, Mung Chiang, and Ashutosh Sabharwal. An axiomatic theory of fairness in network resource allocation. In 2010 Proceedings IEEE INFOCOM , pages 1-9, 2010.", + "metadata": {} + }, + { + "text": "- [23] Jiaqi Li, Mengmeng Wang, Zilong Zheng, and Muhan Zhang. Loogle: Can long-context language models understand long contexts? arXiv preprint arXiv:2311.04939 ,\n2023.", + "metadata": {} + }, + { + "text": "- [24] Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, et al. { AlpaServe } : Statistical multiplexing with model parallelism", + "metadata": {} + }, + { + "text": "for deep learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23) , pages 663-679, 2023.", + "metadata": {} + }, + { + "text": "- [25] Aman Madaan, Niket Tandon, Prakhar Gupta, Skyler Hallinan, Luyu Gao, Sarah Wiegreffe, Uri Alon, Nouha Dziri, Shrimai Prabhumoye, Yiming Yang, et", + "metadata": {} + }, + { + "text": "al. Selfrefine: Iterative refinement with self-feedback. Advances in Neural Information Processing Systems , 36, 2024.", + "metadata": {} + }, + { + "text": "- [26] Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. Themis: Fair and efficient gpu cluster scheduling.", + "metadata": {} + }, + { + "text": "In 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20) , pages 289-304, 2020.", + "metadata": {} + }, + { + "text": "- [27] Deepak Narayanan, Keshav Santhanam, Fiodar Kazhamiaka, Amar Phanishayee, and Matei Zaharia. { Heterogeneity-Aware } cluster scheduling policies for deep learning workloads. In 14th", + "metadata": {} + }, + { + "text": "USENIX Symposium on Operating Systems Design and Implementation (OSDI 20) , pages 481-498, 2020.", + "metadata": {} + }, + { + "text": "- [28] Kyle J. 
Nesbit, Nidhi Aggarwal, James Laudon, and James E. Smith. Fair queuing memory systems. In 2006 39th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO'06) ,", + "metadata": {} + }, + { + "text": "pages 208-222, 2006.", + "metadata": {} + }, + { + "text": "- [29] Xuefei Ning, Zinan Lin, Zixuan Zhou, Zifu Wang, Huazhong Yang, and Yu Wang. Skeleton-of-thought: Prompting LLMs for efficient parallel generation. In The Twelfth International Conference on Learning Representations ,", + "metadata": {} + }, + { + "text": "2024.", + "metadata": {} + }, + { + "text": "- [30] OpenAI. Gpt-4 technical report, 2023.\n- [31] OpenAI. Rate limit. https://platform.openai. com/docs/guides/rate-limits?context= tier-free , 2023.", + "metadata": {} + }, + { + "text": "- [32] OpenAI. Learning to reason with llms. https://openai.com/index/ learning-to-reason-with-llms/ , 2024. Accessed: 2024-11-20.", + "metadata": {} + }, + { + "text": "- [33] A.K. Parekh and R.G. Gallager. A generalized processor sharing approach to flow control in integrated services networks: the single-node case. IEEE/ACM Transactions on Networking , 1(3):344-357, 1993.", + "metadata": {} + }, + { + "text": "- [34] Joon Sung Park, Joseph C. O'Brien, Carrie J. Cai, Meredith Ringel Morris, Percy Liang, and Michael S. Bernstein. Generative agents: Interactive simulacra of human behavior. In In the 36th Annual ACM Symposium on User Interface", + "metadata": {} + }, + { + "text": "Software and Technology (UIST '23) , UIST '23, New York, NY, USA, 2023. Association for Computing Machinery.", + "metadata": {} + }, + { + "text": "- [35] Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: Large language model connected with massive apis. arXiv preprint arXiv:2305.15334 ,\n2023.", + "metadata": {} + }, + { + "text": "- [36] Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems ,", + "metadata": {} + }, + { + "text": "5, 2023.", + "metadata": {} + }, + { + "text": "- [37] Pranav Putta, Edmund Mills, Naman Garg, Sumeet Motwani, Chelsea Finn, Divyansh Garg, and Rafael Rafailov. Agent q: Advanced reasoning and learning for autonomous ai agents. arXiv preprint", + "metadata": {} + }, + { + "text": "arXiv:2408.07199 , 2024.", + "metadata": {} + }, + { + "text": "- [38] Aurick Qiao, Sang Keun Choe, Suhas Jayaram Subramanya, Willie Neiswanger, Qirong Ho, Hao Zhang, Gregory R Ganger, and Eric P Xing. Pollux: Co-adaptive cluster scheduling for goodput-optimized", + "metadata": {} + }, + { + "text": "deep learning. In 15th { USENIX } Symposium on Operating Systems Design and Implementation ( { OSDI } 21) , 2021.", + "metadata": {} + }, + { + "text": "- [39] Timo Schick, Jane Dwivedi-Yu, Roberto Dess\u00ec, Roberta Raileanu, Maria Lomeli, Luke Zettlemoyer, Nicola Cancedda, and Thomas Scialom. Toolformer: Language models can teach themselves to use tools.", + "metadata": {} + }, + { + "text": "arXiv preprint arXiv:2302.04761 , 2023.", + "metadata": {} + }, + { + "text": "- [40] Ying Sheng, Shiyi Cao, Dacheng Li, Coleman Hooper, Nicholas Lee, Shuo Yang, Christopher Chou, Banghua Zhu, Lianmin Zheng, Kurt Keutzer, Joseph E. Gonzalez, and Ion Stoica. S-lora:", + "metadata": {} + }, + { + "text": "Serving thousands of concurrent lora adapters. 
arXiv preprint arXiv:2311.03285 , 2023.", + "metadata": {} + }, + { + "text": "- [41] Ying Sheng, Shiyi Cao, Dacheng Li, Banghua Zhu, Zhuohan Li, Danyang Zhuo, Joseph E Gonzalez, and Ion Stoica. Fairness in Serving Large Language Models. In 18th USENIX Symposium on Operating", + "metadata": {} + }, + { + "text": "Systems Design and Implementation (OSDI 24) , pages 965-988, 2024.", + "metadata": {} + }, + { + "text": "- [42] M. Shreedhar and George Varghese. Efficient fair queueing using deficit round-robin. IEEE/ACM Trans. Netw. , 4(3):375-385, 1996.", + "metadata": {} + }, + { + "text": "- [43] Charlie Snell, Jaehoon Lee, Kelvin Xu, and Aviral Kumar. Scaling llm test-time compute optimally can be more effective than scaling model parameters. arXiv preprint arXiv:2408.03314 ,", + "metadata": {} + }, + { + "text": "2024.\n- [44] Vikranth Srivatsa, Zijian He, Reyna Abhyankar, Dongming Li, and Yiying Zhang. Preble: Efficient Distributed Prompt Scheduling for LLM Serving. 2024.", + "metadata": {} + }, + { + "text": "- [45] Vikranth Srivatsa, Dongming Li, Yiying Zhang, and Reyna Abhyankar. Can Scheduling Overhead Dominate LLM Inference Performance? A Study of CPU Scheduling Overhead on Two Popular LLM Inference Systems.", + "metadata": {} + }, + { + "text": "https://mlsys.wuklab.io/posts/ scheduling_overhead/ .", + "metadata": {} + }, + { + "text": "- [46] Ion Stoica and Hussein Abdel-Wahab. Earliest Eligible Virtual Deadline First: A Flexible and Accurate Mechanism for Proportional Share Resource Allocation. Old Dominion Univ., Norfolk, VA, Tech. Rep. TR-95-22 ,", + "metadata": {} + }, + { + "text": "1995.", + "metadata": {} + }, + { + "text": "- [47] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is all you need. Advances in neural information processing", + "metadata": {} + }, + { + "text": "systems , 30, 2017.", + "metadata": {} + }, + { + "text": "- [48] Guanzhi Wang, Yuqi Xie, Yunfan Jiang, Ajay Mandlekar, Chaowei Xiao, Yuke Zhu, Linxi Fan, and Anima Anandkumar. Voyager: An open-ended embodied agent with large language models. arXiv", + "metadata": {} + }, + { + "text": "preprint arXiv: Arxiv-2305.16291 , 2023.\n- [49] Wikipedia. Max-min fairness. https://en. wikipedia.org/wiki/Max-min_fairness .", + "metadata": {} + }, + { + "text": "- [50] An Yang, Baosong Yang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Zhou, Chengpeng Li, Chengyuan Li, Dayiheng Liu, Fei Huang, et al. Qwen2 technical report. arXiv preprint", + "metadata": {} + }, + { + "text": "arXiv:2407.10671 , 2024.", + "metadata": {} + }, + { + "text": "- [51] Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan. Tree of thoughts: Deliberate problem solving with large language models. Advances in Neural Information Processing Systems ,", + "metadata": {} + }, + { + "text": "36, 2024.", + "metadata": {} + }, + { + "text": "- [52] Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for transformer-based generative models. In 16th USENIX Symposium on Operating", + "metadata": {} + }, + { + "text": "Systems Design and Implementation (OSDI 22) , pages 521-538, 2022.", + "metadata": {} + }, + { + "text": "- [53] Matei Zaharia, Dhruba Borthakur, Joydeep Sen Sarma, Khaled Elmeleegy, Scott Shenker, and Ion Stoica. Delay Scheduling: a Simple Technique for Achieving Locality and Fairness in Cluster Scheduling. 
In", + "metadata": {} + }, + { + "text": "Proceedings of the 5th European conference on Computer systems , pages 265-278, 2010.", + "metadata": {} + }, + { + "text": "- [54] Dan Zhang, Sining Zhoubian, Ziniu Hu, Yisong Yue, Yuxiao Dong, and Jie Tang. Rest-mcts*: Llm selftraining via process reward guided tree search. arXiv preprint", + "metadata": {} + }, + { + "text": "arXiv:2406.03816 , 2024.", + "metadata": {} + }, + { + "text": "- [55] Yilong Zhao, Shuo Yang, Kan Zhu, Lianmin Zheng, Baris Kasikci, Yang Zhou, Jiarong Xing, and Ion Stoica. BlendServe: Optimizing Offline Inference for Autoregressive Large Models with Resource-aware Batching.", + "metadata": {} + }, + { + "text": "arXiv preprint arXiv:2411.16102 , 2024.", + "metadata": {} + }, + { + "text": "- [56] Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric. P Xing, Hao Zhang, Joseph E.", + "metadata": {} + }, + { + "text": "Gonzalez, and Ion Stoica. Judging llm-as-a-judge with mt-bench and chatbot arena, 2023.", + "metadata": {} + }, + { + "text": "- [57] Lianmin Zheng, Liangsheng Yin, Zhiqiang Xie, Jeff Huang, Chuyue Sun, Cody Hao Yu, Shiyi Cao, Christos Kozyrakis, Ion Stoica, Joseph E Gonzalez, et al. Efficiently programming", + "metadata": {} + }, + { + "text": "large language models using sglang. arXiv preprint arXiv:2312.07104 , 2023.", + "metadata": {} + }, + { + "text": "Theorem A.1 ( Service Bound ) . Consider any execution of the DLPM scheme in which client i is backlogged. After any Ki rounds (where qi is replenished Ki times) from t 1 to t 2 , the difference between Ki \u00b7 Q u", + "metadata": {} + }, + { + "text": "(i.e., the service that client i should have sent) and Wi ( t 1 , t 2 ) (i.e., the service that client i actually received) is bounded by max ( Q u , U ) , where U = we \u00b7 Linput +", + "metadata": {} + }, + { + "text": "wq \u00b7 M.\nProof. Let qi ( t ) denote the deficit counter value of client i at time t . Since the deficit counter will only be refilled when qi \u2264 0 (line 7) by Q u , we have", + "metadata": {} + }, + { + "text": "\nNow we prove through induction:\n", + "metadata": {} + }, + { + "text": "- \u00b7 At the beginning, all qi ( 0 ) = 0. Equation (2) holds.\n- \u00b7 We then prove if at time t , Equation (2) holds, then for t \u2032 > t , Equation (2) also holds.", + "metadata": {} + }, + { + "text": "- \u00b7 At line 7, qi ( t \u2032 ) = qi + Q u > qi > -U . Equation (2) holds.", + "metadata": {} + }, + { + "text": "- \u00b7 Since line 24 will be reached only when qi > 0, qi ( t \u2032 ) = qi -we \u00b7 extend _ length ( r ) > -we \u00b7 Linput . Equation (2) holds.", + "metadata": {} + }, + { + "text": "- \u00b7 At line 27, since qi ( t \u2032 ) = qi -wq \u00b7 |{ r | client ( r ) = i , r \u2208 B }| will be repeated for n steps until some requests are finished. Therefore, we have qi (", + "metadata": {} + }, + { + "text": "t \u2032 ) \u2265 qi -n \u00b7 wq \u00b7 |{ r | client ( r ) = i , r \u2208 B }| . Since the number of decoded tokens cannot exceed the server's maximum token capacity M , n \u00b7 |{ r | client ( r )", + "metadata": {} + }, + { + "text": "= i , r \u2208 B }| \u2264 M . We then have qi ( t \u2032 ) = qi -wq \u00b7 M > -U . Equation (2) holds.", + "metadata": {} + }, + { + "text": "Therefore, we have Wi ( t 1 , t 2 ) = Ki \u00b7 Q u -qi ( t 2 ) . Combining Equation (2) and Equation (1), we have:\n", + "metadata": {} + }, + { + "text": "Theorem A.2 ( Latency Bound ) . 
Let A ( r ) and D ( r ) denote the arrival time and dispatch time of a request r. Assume there are in total n clients, \u2200 t 1 , t 2 , if at t 1", + "metadata": {} + }, + { + "text": ", a client f is not backlogged and has no requests in the running batch, then the next request r f with t 1 < A ( r f ) < t 2 will have its response time bounded: D ( r f ) -A ( r f )", + "metadata": {} + }, + { + "text": "\u2264 2 \u00b7 ( n -1 ) \u00b7 Q u + U a , where a is the lower bound of the server capacity.\nProof. \u00b7 Since there is no running batch of f in the system, r f will be selected for the next request for f .", + "metadata": {} + }, + { + "text": "- \u00b7 Earlier, we have shown that the service bound for backlogged clients compared to either backlogged or nonbacklogged clients is 2 \u00b7 ( Q u + U ) .", + "metadata": {} + }, + { + "text": "- \u00b7 From t 1 to D ( r f ) , Wf ( t 1 , D ( r f )) will be within 2 \u00b7 ( Q u + U ) of service received by other clients.", + "metadata": {} + }, + { + "text": "- \u00b7 Since at Line 12, qf is set to 0 when f rejoins, the maximum number of tokens served before f is served again is: 2 \u00b7 ( n -1 ) \u00b7 ( Q u + U ) , where n -1 is", + "metadata": {} + }, + { + "text": "the n -1 other clients.", + "metadata": {} + }, + { + "text": "- \u00b7 Given that a is the lower bound of the server capacity, the dispatch time for f is therefore bounded: D ( r f ) -A ( r f ) \u2264 2 \u00b7 ( n -1 ) \u00b7 Q u + U a .", + "metadata": {} + }, + { + "text": "Theorem A.3 ( Service Bound ) . Consider any execution of the DoubleQ Scheduling scheme in which client i is backlogged. The difference between \u2211 w \u2208 W ki , w \u00b7 Q u (i.e., the service that client i should have", + "metadata": {} + }, + { + "text": "sent) and Wi (i.e., the service that client i actually received) is bounded by max ( Q u , U ) \u00d7| W | , where U = we \u00b7 Linput + wq \u00b7 M. Let ki , w is the number of times client", + "metadata": {} + }, + { + "text": "i has been replenished at worker w.\nProof. Let ki , w be the number of times the client i has replenished quantum locally at worker w . We want to show for a client i :\n", + "metadata": {} + }, + { + "text": "Let q u i , w ( t ) denote the deficit counter value for worker w of client i at time t . Since the deficit counter will only be refilled when q u i , w ( t ) \u2264 0 (line 7) by Q u ,", + "metadata": {} + }, + { + "text": "we have\n\nNow we prove through induction:\n\n- \u00b7 At the beginning, all q u i , w ( t ) = 0. Equation (5) holds.", + "metadata": {} + }, + { + "text": "- \u00b7 We then prove if at time t , Equation (5) holds, then for t \u2032 > t , Equation (5) also holds.", + "metadata": {} + }, + { + "text": "- \u00b7 At line 7, q u i , w ( t \u2032 ) = q u i , w ( t ) + Q u > q u i , w ( t ) > -U . Equation (5) holds.", + "metadata": {} + }, + { + "text": "- \u00b7 Since line 24 will be reached only when q u i , w ( t ) > 0, q u i , w ( t \u2032 ) = q u i , w ( t ) -we \u00b7 Linput > -we \u00b7 Linput", + "metadata": {} + }, + { + "text": ". Equation (5) holds.", + "metadata": {} + }, + { + "text": "- \u00b7 At line 23, since q u i , w ( t \u2032 ) = q u i , w ( t ) -wq \u00b7 |{ r | client ( r ) = i , r \u2208 B }| will be repeated for n steps", + "metadata": {} + }, + { + "text": "until some requests are finished. Therefore, we have q u i , w ( t \u2032 ) = q u i , w ( t ) -n \u00b7 wq \u00b7 |{ r | client ( r ) = i , r \u2208 B }| . 
Since the", + "metadata": {} + }, + { + "text": "number of decoded tokens cannot exceed the server's maximum token capacity M , n \u00b7 |{ r | client ( r ) = i , r \u2208 B }| \u2264 M . We then have q u i , w = q u i , w ( t )", + "metadata": {} + }, + { + "text": "-wq \u00b7 M > -we \u00b7 Linput -wq \u00b7 M . Equation (5) holds.", + "metadata": {} + }, + { + "text": "Wehave Wi ( t 1 , t 2 ) = \u2211 w \u2208 W ( ki , w \u00b7 Q u -q u i , w ( t 2 )) . Combining Equation (5) and Equation (4), we have:", + "metadata": {} + }, + { + "text": "", + "metadata": {} + }, + { + "text": "Theorem 5.1 ( Service bound between backlogged clients ) . At any time interval [ t 1 , t 2 ) , max iWi ( t 1 , t 2 ) -min iWi ( t 1 , t 2 )", + "metadata": {} + }, + { + "text": "\u2264 2 \u00b7 | W | \u00b7 ( U + Q u ) . The difference between the maximum service among all backlogged clients and the minimum service among all backlogged clients is bounded by 2 \u00b7 | W | \u00b7 ( U + Q u ) , where |", + "metadata": {} + }, + { + "text": "W | is the number of workers.\nProof. \u00b7 From Theorem 4.1, the service bound for each worker is: 2 \u00b7 ( U + Q u ) .", + "metadata": {} + }, + { + "text": "- \u00b7 Since if a client is backlogged, it will have a request and hence be backlogged in all workers. This is because from Line 3, requests will be distributed to all workers and credit for each worker is exhausted, before replenishing the credits for all", + "metadata": {} + }, + { + "text": "workers.\n- \u00b7 Therefore, the service bound for DoubleQ is 2 \u00b7 | W | ( U + Q u ) .", + "metadata": {} + }, + { + "text": "Theorem 5.2 ( Service bound between backlogged and non-backlogged clients ) . Consider any execution of the DoubleQ scheme. Client f that is continuously backlogged during time interval [ t 1 , t 2 ) should not receive less service than another", + "metadata": {} + }, + { + "text": "client, g, that is not continuously backlogged during the same time interval, where Wg ( t 1 , t 2 ) -Wf ( t 1 , t 2 ) \u2264 2 \u00b7 ( U + Q u ) \u00b7 | W |", + "metadata": {} + }, + { + "text": ".\nProof. \u00b7 f is continuously backlogged in all workers.", + "metadata": {} + }, + { + "text": "- \u00b7 g is not continuously backlogged in at least one worker.\n- \u00b7 From Lemma 4.2, the service bound is | W | \u00b7 ( 2 U + 2 Q u ) between backlogged and either backlogged or non-backlogged clients.", + "metadata": {} + }, + { + "text": "Theorem A.4 ( Latency Bound ) . Let A ( r ) and D ( r ) denote the arrival time and dispatch time of a request r. Assume there are in total n clients, \u2200 t 1 , t 2 , if at t", + "metadata": {} + }, + { + "text": "1 , a client f is not backlogged and has no requests in the running batch, then the next request r f with t 1 < A ( r f ) < t 2 will have its response time bounded: D ( r f ) -A ( r", + "metadata": {} + }, + { + "text": "f ) \u2264 ( n -1 ) | W | \u00b7 2 U + 2 Q u a , where a is the lower bound of the server capacity.", + "metadata": {} + }, + { + "text": "Proof. 
\u00b7 Since there is no running batch of f in the system, r f will be selected as the next request for f .", + "metadata": {} + }, + { + "text": "- \u00b7 Earlier, we have shown that the bound between a backlogged client and a non-backlogged client in DoubleQ is max i Wi - min i Wi \u2264 ( 2 U + 2 Q u ) \u00b7 | W | .", + "metadata": {} + }, + { + "text": "- \u00b7 Therefore, the maximum number of tokens served before f is served again is ( n - 1 ) \u00b7 ( 2 U + 2 Q u ) \u00b7 | W | , where n - 1 counts the other clients.", + "metadata": {} + }, + { + "text": "- \u00b7 Given that a is the lower bound of the server capacity, the dispatch time for f is therefore bounded: D ( r f ) - A ( r f ) \u2264 ( n - 1 ) \u00b7 | W | \u00b7 ( 2 U + 2 Q u ) / a .", + "metadata": {} + }, + { + "text": "Theorem A.5 ( Infinite Q w is not fair ) . Consider any execution of the DoubleQ scheduling scheme in which Q w is infinite. Such a scheduling scheme is not fair.\nProof. \u00b7 The requests will not be perfectly load-balanced across all workers.", + "metadata": {} + }, + { + "text": "- \u00b7 Proof by counterexample: client f sends requests with long prefix matches, so its requests are always dispatched to the same worker that hosts the prefix.", + "metadata": {} + }, + { + "text": "- \u00b7 Another client g sends requests with no prefix matches, so its requests are load-balanced across all workers because of Line 8.", + "metadata": {} + }, + { + "text": "- \u00b7 Client g can receive unboundedly more service than f because it is replenished more often, being scheduled on more workers, even though both are backlogged.", + "metadata": {} + }, + { + "text": "Theorem A.6 ( Service between backlogged or non-backlogged clients is bounded ) . In any interval [ t 1 , t 2 ) , the difference between the maximum service among all backlogged clients and the minimum service", + "metadata": {} + }, + { + "text": "among all backlogged or non-backlogged clients is bounded by a constant independent of the interval length t 2 - t 1 .\nProof. \u00b7 The client requests will be load-balanced across all workers.", + "metadata": {} + }, + { + "text": "- \u00b7 Therefore, when a client is backlogged, it is backlogged on all workers.", + "metadata": {} + }, + { + "text": "- \u00b7 We can then apply the bound derived for DoubleQ, multiplied by the number of workers | W | , as in \u00a7A.2.", "metadata": {} } ]
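As a sanity check on the invariant behind Theorem A.1, the toy simulation below replays a single client's deficit counter under random charges and verifies that it never drops below -U (with U = w_e \u00b7 L_input + w_q \u00b7 M) and that the gap between K_i \u00b7 Q_u and the received service stays within max(Q_u, U). All numeric parameters and the workload mix are arbitrary illustrations, not values from the paper.

```python
# Toy numeric check of the deficit-counter invariant used in Theorem A.1.
import random

w_e, w_q = 1.0, 2.0
Q_u = 4096          # client quantum
L_input = 2048      # maximum extend (input) length of a request
M = 512             # server's maximum decode capacity per step (tokens)
U = w_e * L_input + w_q * M

rng = random.Random(0)
q = 0.0             # deficit counter q_i
K_i = 0             # number of replenishments
W_i = 0.0           # service actually charged to client i

for _ in range(10_000):
    while q <= 0:                      # replenish only when the quantum is exhausted
        q += Q_u
        K_i += 1
    if rng.random() < 0.5:             # admit a request: charge its extend tokens
        cost = w_e * rng.randint(1, L_input)
    else:                              # one decode step: at most M output tokens
        cost = w_q * rng.randint(1, M)
    q -= cost
    W_i += cost
    assert q > -U                      # the invariant (Equation (2)) holds

assert abs(K_i * Q_u - W_i) <= max(Q_u, U)   # Theorem A.1's service bound
print(f"K_i*Q_u - W_i = {K_i * Q_u - W_i:.1f}  (bound {max(Q_u, U):.1f})")
```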