Commit b6d43f5fd9 (parent 43eb4f9a1d)
yichuan520030910320
2025-07-25 00:12:35 -07:00
11 changed files with 53 additions and 18 deletions


@@ -133,6 +133,10 @@ LEANN supports RAG on various data sources including documents (.pdf, .txt, .md)
 Ask questions directly about your personal PDFs, documents, and any directory containing your files!
+<p align="center">
+  <img src="videos/paper_clear.gif" alt="LEANN Document Search Demo" width="600">
+</p>
 The example below asks a question about summarizing two papers (uses default data in `examples/data`):
 ```bash
@@ -150,6 +154,10 @@ python ./examples/main_cli_example.py
 ### 📧 Your Personal Email Secretary: RAG on Apple Mail!
+<p align="center">
+  <img src="videos/mail_clear.gif" alt="LEANN Email Search Demo" width="600">
+</p>
 **Note:** You need to grant full disk access to your terminal/VS Code in System Preferences → Privacy & Security → Full Disk Access.
 ```bash
 python examples/mail_reader_leann.py --query "What's the food I ordered by doordash or Uber eat mostly?"
@@ -188,6 +196,11 @@ Once the index is built, you can ask questions like:
 </details>
 ### 🔍 Time Machine for the Web: RAG Your Entire Google Browser History!
+<p align="center">
+  <img src="videos/google_clear.gif" alt="LEANN Browser History Search Demo" width="600">
+</p>
 ```bash
 python examples/google_history_reader_leann.py --query "Tell me my browser history about machine learning?"
 ```
@@ -242,6 +255,10 @@ Once the index is built, you can ask questions like:
 ### 💬 WeChat Detective: Unlock Your Golden Memories!
+<p align="center">
+  <img src="videos/wechat_clear.gif" alt="LEANN WeChat Search Demo" width="600">
+</p>
 ```bash
 python examples/wechat_history_reader_leann.py --query "Show me all group chats about weekend plans"
 ```
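All four demos above wrap the same chat API that the code diffs below touch. As a minimal sketch of that flow, assuming the `LeannChat` interface visible in those diffs (the import path, index path, and query here are illustrative placeholders):

```python
# Minimal sketch of the chat flow the README demos wrap. Assumes the
# LeannChat API shown in the diffs below; import path, index path, and
# query are illustrative placeholders.
from leann import LeannChat  # assumed import path

llm_config = {"type": "openai", "model": "gpt-4o"}
chat = LeannChat(index_path="./google_history_index", llm_config=llm_config)

# top_k/complexity mirror the values used in examples/main_cli_example.py.
response = chat.ask(
    "Tell me my browser history about machine learning?",
    top_k=20,
    recompute_embeddings=True,
    complexity=32,
)
print(f"Leann chat response: \033[36m{response}\033[0m")
```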


@@ -222,14 +222,15 @@ async def query_leann_index(index_path: str, query: str):
             "max_tokens": 1000
         }
     )
-    print(f"Leann: {chat_response}")
+    print(f"Leann chat response: \033[36m{chat_response}\033[0m")

 async def main():
     # Parse command line arguments
     parser = argparse.ArgumentParser(description='LEANN Chrome History Reader - Create and query browser history index')
     parser.add_argument('--chrome-profile', type=str, default=DEFAULT_CHROME_PROFILE,
                         help=f'Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this')
-    parser.add_argument('--index-dir', type=str, default="./all_google_new",
+    parser.add_argument('--index-dir', type=str, default="./google_history_index",
                         help='Directory to store the LEANN index (default: ./chrome_history_index_leann_test)')
     parser.add_argument('--max-entries', type=int, default=1000,
                         help='Maximum number of history entries to process (default: 1000)')
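The recurring edit in these reader scripts swaps plain `print` output for ANSI-colored output: `\033[36m` switches the terminal to cyan and `\033[0m` resets it. A small sketch of that pattern (the `cyan` helper is hypothetical):

```python
# ANSI escape codes as used in the new print calls: "\033[36m" turns the
# terminal cyan, "\033[0m" resets it. cyan() is a hypothetical helper.
CYAN = "\033[36m"
RESET = "\033[0m"

def cyan(text: str) -> str:
    """Wrap text in cyan escape codes for terminal highlighting."""
    return f"{CYAN}{text}{RESET}"

print(f"Leann chat response: {cyan('example answer')}")
```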


@@ -224,15 +224,16 @@ async def query_leann_index(index_path: str, query: str):
         beam_width=1,
     )
     end_time = time.time()
-    print(f"Time taken: {end_time - start_time} seconds")
-    print(f"Leann: {chat_response}")
+    # print(f"Time taken: {end_time - start_time} seconds")
+    # highlight the answer
+    print(f"Leann chat response: \033[36m{chat_response}\033[0m")

 async def main():
     # Parse command line arguments
     parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
     # Remove --mail-path argument and auto-detect all Messages directories
     # Remove DEFAULT_MAIL_PATH
-    parser.add_argument('--index-dir', type=str, default="./mail_index_index_file",
+    parser.add_argument('--index-dir', type=str, default="./mail_index",
                         help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
     parser.add_argument('--max-emails', type=int, default=1000,
                         help='Maximum number of emails to process (-1 means all)')
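In both readers the `--index-dir` default changes while the help text still names an older directory. Interpolating a single constant into both keeps them from drifting; a sketch, with a hypothetical `DEFAULT_INDEX_DIR` constant:

```python
import argparse

# Hypothetical constant: using it for both default= and help= keeps the
# displayed default in sync with the real one.
DEFAULT_INDEX_DIR = "./mail_index"

parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
parser.add_argument('--index-dir', type=str, default=DEFAULT_INDEX_DIR,
                    help=f'Directory to store the LEANN index (default: {DEFAULT_INDEX_DIR})')
```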


@@ -63,16 +63,14 @@ async def main(args):
     llm_config = {"type": "openai", "model": "gpt-4o"}
     chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)

-    query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?"
     # query = (
     #     "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发"
     # )
+    query = args.query
     print(f"You: {query}")
     chat_response = chat.ask(query, top_k=20, recompute_embeddings=True, complexity=32)
-    print(f"Leann: {chat_response}")
+    print(f"Leann chat response: \033[36m{chat_response}\033[0m")

 if __name__ == "__main__":
@@ -110,6 +108,12 @@ if __name__ == "__main__":
         default="examples/data",
         help="Directory containing documents to index (PDF, TXT, MD files).",
     )
+    parser.add_argument(
+        "--query",
+        type=str,
+        default="Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efficiency trade-off?",
+        help="The query to ask the Leann chat system.",
+    )
     args = parser.parse_args()
     asyncio.run(main(args))
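This hunk replaces the hard-coded query with a `--query` option whose default is the old string, so `python ./examples/main_cli_example.py --query "..."` now overrides it from the shell. A condensed sketch of the resulting wiring (only `--query` and `args.query` come from the diff; the rest is illustrative scaffolding):

```python
# Condensed sketch of the pattern main_cli_example.py now follows:
# the query is an argparse option that async main() consumes.
import argparse
import asyncio

async def main(args: argparse.Namespace) -> None:
    query = args.query  # previously hard-coded inside main()
    print(f"You: {query}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--query",
        type=str,
        default="Summarize the two papers in examples/data.",  # illustrative default
        help="The query to ask the Leann chat system.",
    )
    asyncio.run(main(parser.parse_args()))
```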


@@ -234,7 +234,7 @@ async def query_leann_index(index_path: str, query: str):
         },
         llm_kwargs={"temperature": 0.0, "max_tokens": 1000},
     )
-    print(f"Leann: {chat_response}")
+    print(f"Leann chat response: \033[36m{chat_response}\033[0m")

 async def main():


@@ -441,9 +441,9 @@ class LeannSearcher:
             use_server_if_available=recompute_embeddings,
             zmq_port=zmq_port,
         )
-        logger.info(f" Generated embedding shape: {query_embedding.shape}")
+        # logger.info(f" Generated embedding shape: {query_embedding.shape}")
         embedding_time = time.time() - start_time
-        logger.info(f" Embedding time: {embedding_time} seconds")
+        # logger.info(f" Embedding time: {embedding_time} seconds")

         start_time = time.time()
         results = self.backend_impl.search(
@@ -458,7 +458,7 @@ class LeannSearcher:
             **kwargs,
         )
         search_time = time.time() - start_time
-        logger.info(f" Search time: {search_time} seconds")
+        # logger.info(f" Search time: {search_time} seconds")
         logger.info(
             f" Backend returned: labels={len(results.get('labels', [[]])[0])} results"
         )
@@ -479,15 +479,25 @@ class LeannSearcher:
                         metadata=passage_data.get("metadata", {}),
                     )
                 )
+
+                # Color codes for better logging
+                GREEN = "\033[92m"
+                BLUE = "\033[94m"
+                YELLOW = "\033[93m"
+                RESET = "\033[0m"
+
+                # Truncate text for display (first 100 chars)
+                display_text = passage_data['text'][:100]
                 logger.info(
-                    f" {i + 1}. passage_id='{string_id}' -> SUCCESS: {passage_data['text']}..."
+                    f" {GREEN}{RESET} {BLUE}[{i + 1:2d}]{RESET} {YELLOW}ID:{RESET} '{string_id}' {YELLOW}Score:{RESET} {dist:.4f} {YELLOW}Text:{RESET} {display_text}"
                 )
             except KeyError:
+                RED = "\033[91m"
                 logger.error(
-                    f" {i + 1}. passage_id='{string_id}' -> ERROR: Passage not found in PassageManager!"
+                    f" {RED}{RESET} [{i + 1:2d}] ID: '{string_id}' -> {RED}ERROR: Passage not found!{RESET}"
                 )
-        logger.info(f" Final enriched results: {len(enriched_results)} passages")
+        logger.info(f" {GREEN} Final enriched results: {len(enriched_results)} passages{RESET}")
         return enriched_results
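Two things worth noting in this hunk: the ANSI constants are rebound on every loop iteration, and the final `logger.info` references `GREEN` after the loop, where it is bound only if at least one passage was enriched successfully. A sketch of the usual alternative, module-level constants plus a formatting helper (the helper name is hypothetical):

```python
# Sketch: hoist the ANSI constants out of the loop so they are defined
# exactly once and always available after it. format_result() is a
# hypothetical helper mirroring the log line in the diff.
GREEN = "\033[92m"
BLUE = "\033[94m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"

def format_result(i: int, string_id: str, dist: float, text: str) -> str:
    """Build the colored per-passage log line."""
    display_text = text[:100]  # truncate long passages for readable logs
    return (
        f"{BLUE}[{i + 1:2d}]{RESET} {YELLOW}ID:{RESET} '{string_id}' "
        f"{YELLOW}Score:{RESET} {dist:.4f} {YELLOW}Text:{RESET} {display_text}"
    )

print(format_result(0, "passage_0", 0.1234, "example passage text"))
```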
@@ -517,7 +527,7 @@ class LeannChat:
     ):
         if llm_kwargs is None:
             llm_kwargs = {}
+        search_time = time.time()
         results = self.searcher.search(
             question,
             top_k=top_k,
@@ -529,6 +539,8 @@ class LeannChat:
             expected_zmq_port=expected_zmq_port,
             **search_kwargs,
         )
+        search_time = time.time() - search_time
+        # logger.info(f" Search time: {search_time} seconds")
         context = "\n\n".join([r.text for r in results])
         prompt = (
             "Here is some retrieved context that might help answer your question:\n\n"

BIN  videos/google_clear.gif  (new file, 4.2 MiB; binary file not shown)

BIN  videos/mail_clear.gif  (new file, 1.9 MiB; binary file not shown)

BIN  videos/paper_clear.gif  (new file, 1.4 MiB; binary file not shown)

BIN  videos/wechat_clear.gif  (new file, 1.5 MiB; binary file not shown)