diff --git a/README.md b/README.md
index 357d1b1..ae991eb 100755
--- a/README.md
+++ b/README.md
@@ -133,6 +133,10 @@ LEANN supports RAG on various data sources including documents (.pdf, .txt, .md)
Ask questions directly about your personal PDFs, documents, and any directory containing your files!
+
+
+
+
The example below asks a question about summarizing two papers (uses default data in `examples/data`):
```bash
@@ -150,6 +154,10 @@ python ./examples/main_cli_example.py
### π§ Your Personal Email Secretary: RAG on Apple Mail!
+
+
+
+
**Note:** You need to grant full disk access to your terminal/VS Code in System Preferences β Privacy & Security β Full Disk Access.
```bash
python examples/mail_reader_leann.py --query "What's the food I ordered by doordash or Uber eat mostly?"
@@ -188,6 +196,11 @@ Once the index is built, you can ask questions like:
### π Time Machine for the Web: RAG Your Entire Google Browser History!
+
+
+
+
+
```bash
python examples/google_history_reader_leann.py --query "Tell me my browser history about machine learning?"
```
@@ -242,6 +255,10 @@ Once the index is built, you can ask questions like:
### π¬ WeChat Detective: Unlock Your Golden Memories!
+
+
+
+
```bash
python examples/wechat_history_reader_leann.py --query "Show me all group chats about weekend plans"
```
diff --git a/examples/google_history_reader_leann.py b/examples/google_history_reader_leann.py
index 0098353..3809466 100644
--- a/examples/google_history_reader_leann.py
+++ b/examples/google_history_reader_leann.py
@@ -222,14 +222,15 @@ async def query_leann_index(index_path: str, query: str):
"max_tokens": 1000
}
)
- print(f"Leann: {chat_response}")
+
+ print(f"Leann chat response: \033[36m{chat_response}\033[0m")
async def main():
# Parse command line arguments
parser = argparse.ArgumentParser(description='LEANN Chrome History Reader - Create and query browser history index')
parser.add_argument('--chrome-profile', type=str, default=DEFAULT_CHROME_PROFILE,
help=f'Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this')
- parser.add_argument('--index-dir', type=str, default="./all_google_new",
+ parser.add_argument('--index-dir', type=str, default="./google_history_index",
help='Directory to store the LEANN index (default: ./chrome_history_index_leann_test)')
parser.add_argument('--max-entries', type=int, default=1000,
help='Maximum number of history entries to process (default: 1000)')
diff --git a/examples/mail_reader_leann.py b/examples/mail_reader_leann.py
index cd3f540..43c073a 100644
--- a/examples/mail_reader_leann.py
+++ b/examples/mail_reader_leann.py
@@ -224,15 +224,16 @@ async def query_leann_index(index_path: str, query: str):
beam_width=1,
)
end_time = time.time()
- print(f"Time taken: {end_time - start_time} seconds")
- print(f"Leann: {chat_response}")
+ # print(f"Time taken: {end_time - start_time} seconds")
+ # highlight the answer
+ print(f"Leann chat response: \033[36m{chat_response}\033[0m")
async def main():
# Parse command line arguments
parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
# Remove --mail-path argument and auto-detect all Messages directories
# Remove DEFAULT_MAIL_PATH
- parser.add_argument('--index-dir', type=str, default="./mail_index_index_file",
+ parser.add_argument('--index-dir', type=str, default="./mail_index",
help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
parser.add_argument('--max-emails', type=int, default=1000,
help='Maximum number of emails to process (-1 means all)')
diff --git a/examples/main_cli_example.py b/examples/main_cli_example.py
index 00b9eb6..1854ed0 100644
--- a/examples/main_cli_example.py
+++ b/examples/main_cli_example.py
@@ -63,16 +63,14 @@ async def main(args):
llm_config = {"type": "openai", "model": "gpt-4o"}
chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)
-
- query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?"
-
# query = (
# "δ»δΉζ―ηε€ε€§ζ¨‘εδ»₯εηε€εΌεθΏη¨δΈιε°δΊδ»δΉι΄ζι’οΌδ»»ε‘什δΈθ¬ε¨δ»δΉεεΈι’ε"
# )
+ query = args.query
print(f"You: {query}")
chat_response = chat.ask(query, top_k=20, recompute_embeddings=True, complexity=32)
- print(f"Leann: {chat_response}")
+ print(f"Leann chat response: \033[36m{chat_response}\033[0m")
if __name__ == "__main__":
@@ -110,6 +108,12 @@ if __name__ == "__main__":
default="examples/data",
help="Directory containing documents to index (PDF, TXT, MD files).",
)
+ parser.add_argument(
+ "--query",
+ type=str,
+        default="Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explores to achieve Fairness and Efficiency trade-off?",
+ help="The query to ask the Leann chat system.",
+ )
args = parser.parse_args()
asyncio.run(main(args))
diff --git a/examples/wechat_history_reader_leann.py b/examples/wechat_history_reader_leann.py
index d002174..a16979b 100644
--- a/examples/wechat_history_reader_leann.py
+++ b/examples/wechat_history_reader_leann.py
@@ -234,7 +234,7 @@ async def query_leann_index(index_path: str, query: str):
},
llm_kwargs={"temperature": 0.0, "max_tokens": 1000},
)
- print(f"Leann: {chat_response}")
+ print(f"Leann chat response: \033[36m{chat_response}\033[0m")
async def main():
diff --git a/packages/leann-backend-diskann/third_party/DiskANN b/packages/leann-backend-diskann/third_party/DiskANN
index 25339b0..af2a264 160000
--- a/packages/leann-backend-diskann/third_party/DiskANN
+++ b/packages/leann-backend-diskann/third_party/DiskANN
@@ -1 +1 @@
-Subproject commit 25339b03413b5067c25b6092ea3e0f77ef8515c8
+Subproject commit af2a26481e65232b57b82d96e68833cdee9f7635
diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py
index 0bfb328..1f2c9d8 100644
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -441,9 +441,9 @@ class LeannSearcher:
use_server_if_available=recompute_embeddings,
zmq_port=zmq_port,
)
- logger.info(f" Generated embedding shape: {query_embedding.shape}")
+ # logger.info(f" Generated embedding shape: {query_embedding.shape}")
embedding_time = time.time() - start_time
- logger.info(f" Embedding time: {embedding_time} seconds")
+ # logger.info(f" Embedding time: {embedding_time} seconds")
start_time = time.time()
results = self.backend_impl.search(
@@ -458,7 +458,7 @@ class LeannSearcher:
**kwargs,
)
search_time = time.time() - start_time
- logger.info(f" Search time: {search_time} seconds")
+ # logger.info(f" Search time: {search_time} seconds")
logger.info(
f" Backend returned: labels={len(results.get('labels', [[]])[0])} results"
)
@@ -479,15 +479,25 @@ class LeannSearcher:
metadata=passage_data.get("metadata", {}),
)
)
+
+ # Color codes for better logging
+ GREEN = "\033[92m"
+ BLUE = "\033[94m"
+ YELLOW = "\033[93m"
+ RESET = "\033[0m"
+
+                # Truncate text for display (first 100 chars)
+                display_text = passage_data['text'][:100]
logger.info(
- f" {i + 1}. passage_id='{string_id}' -> SUCCESS: {passage_data['text']}..."
+ f" {GREEN}β{RESET} {BLUE}[{i + 1:2d}]{RESET} {YELLOW}ID:{RESET} '{string_id}' {YELLOW}Score:{RESET} {dist:.4f} {YELLOW}Text:{RESET} {display_text}"
)
except KeyError:
+ RED = "\033[91m"
logger.error(
- f" {i + 1}. passage_id='{string_id}' -> ERROR: Passage not found in PassageManager!"
+ f" {RED}β{RESET} [{i + 1:2d}] ID: '{string_id}' -> {RED}ERROR: Passage not found!{RESET}"
)
- logger.info(f" Final enriched results: {len(enriched_results)} passages")
+ logger.info(f" {GREEN}β Final enriched results: {len(enriched_results)} passages{RESET}")
return enriched_results
@@ -517,7 +527,7 @@ class LeannChat:
):
if llm_kwargs is None:
llm_kwargs = {}
-
+ search_time = time.time()
results = self.searcher.search(
question,
top_k=top_k,
@@ -529,6 +539,8 @@ class LeannChat:
expected_zmq_port=expected_zmq_port,
**search_kwargs,
)
+ search_time = time.time() - search_time
+ # logger.info(f" Search time: {search_time} seconds")
context = "\n\n".join([r.text for r in results])
prompt = (
"Here is some retrieved context that might help answer your question:\n\n"
diff --git a/videos/google_clear.gif b/videos/google_clear.gif
new file mode 100644
index 0000000..c348b45
Binary files /dev/null and b/videos/google_clear.gif differ
diff --git a/videos/mail_clear.gif b/videos/mail_clear.gif
new file mode 100644
index 0000000..22452dc
Binary files /dev/null and b/videos/mail_clear.gif differ
diff --git a/videos/paper_clear.gif b/videos/paper_clear.gif
new file mode 100644
index 0000000..d27b4de
Binary files /dev/null and b/videos/paper_clear.gif differ
diff --git a/videos/wechat_clear.gif b/videos/wechat_clear.gif
new file mode 100644
index 0000000..3f976e6
Binary files /dev/null and b/videos/wechat_clear.gif differ