From ff1b622bddfb03f669519eca43b8f535a40a01ea Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Tue, 29 Jul 2025 12:39:36 -0700 Subject: [PATCH] refactor: Remove old example scripts and migration references - Delete old example scripts (mail_reader_leann.py, google_history_reader_leann.py, etc.) - Remove migration hints and backward compatibility - Update tests to use new unified examples directly - Clean up all references to old script names - Users now only see the new unified interface --- docs/normalized_embeddings.md | 2 +- examples/PARAMETER_CONSISTENCY.md | 64 ----- examples/google_history_reader_leann.py | 362 ------------------------ examples/mail_reader_leann.py | 342 ---------------------- examples/main_cli_example.py | 31 -- examples/wechat_history_reader_leann.py | 320 --------------------- tests/README.md | 2 +- tests/test_ci_minimal.py | 2 +- tests/test_document_rag.py | 12 - 9 files changed, 3 insertions(+), 1134 deletions(-) delete mode 100644 examples/PARAMETER_CONSISTENCY.md delete mode 100644 examples/google_history_reader_leann.py delete mode 100644 examples/mail_reader_leann.py delete mode 100644 examples/main_cli_example.py delete mode 100644 examples/wechat_history_reader_leann.py diff --git a/docs/normalized_embeddings.md b/docs/normalized_embeddings.md index 46213e5..777b218 100644 --- a/docs/normalized_embeddings.md +++ b/docs/normalized_embeddings.md @@ -72,4 +72,4 @@ Using the wrong distance metric with normalized embeddings can lead to: - **Incorrect ranking** of search results - **Suboptimal performance** compared to using the correct metric -For more details on why this happens, see our analysis of [OpenAI embeddings with MIPS](../examples/main_cli_example.py). +For more details on why this happens, see our analysis of [OpenAI embeddings with MIPS](../examples/document_rag.py). diff --git a/examples/PARAMETER_CONSISTENCY.md b/examples/PARAMETER_CONSISTENCY.md deleted file mode 100644 index 794fe2c..0000000 --- a/examples/PARAMETER_CONSISTENCY.md +++ /dev/null @@ -1,64 +0,0 @@ -# Parameter Consistency Guide - -This document ensures that the new unified interface maintains exact parameter compatibility with the original examples. - -## Parameter Mapping - -### Common Parameters (All Examples) - -| Parameter | Default Value | Notes | -|-----------|--------------|-------| -| `backend_name` | `"hnsw"` | All examples use HNSW backend | -| `graph_degree` | `32` | Consistent across all | -| `complexity` | `64` | Consistent across all | -| `is_compact` | `True` | NOT `compact_index` | -| `is_recompute` | `True` | NOT `use_recomputed_embeddings` | -| `num_threads` | `1` | Force single-threaded mode | -| `chunk_size` | `256` | Consistent across all | - -### Example-Specific Defaults - -#### document_rag.py (replaces main_cli_example.py) -- `index_dir`: `"./test_doc_files"` (matches original) -- `chunk_overlap`: `128` (matches original) -- `embedding_model`: `"facebook/contriever"` -- `embedding_mode`: `"sentence-transformers"` -- No max limit by default - -#### email_rag.py (replaces mail_reader_leann.py) -- `index_dir`: `"./mail_index"` (matches original) -- `max_items`: `1000` (was `max_emails`) -- `chunk_overlap`: `25` (matches original) -- `embedding_model`: `"facebook/contriever"` -- NO `embedding_mode` parameter in LeannBuilder (original doesn't have it) - -#### browser_rag.py (replaces google_history_reader_leann.py) -- `index_dir`: `"./google_history_index"` (matches original) -- `max_items`: `1000` (was `max_entries`) -- `chunk_overlap`: `25` (primary value in original) -- `embedding_model`: `"facebook/contriever"` -- `embedding_mode`: `"sentence-transformers"` - -#### wechat_rag.py (replaces wechat_history_reader_leann.py) -- `index_dir`: `"./wechat_history_magic_test_11Debug_new"` (matches original) -- `max_items`: `50` (was `max_entries`, much lower default) -- `chunk_overlap`: `25` (matches original) -- `embedding_model`: `"Qwen/Qwen3-Embedding-0.6B"` (special model for Chinese) -- NO `embedding_mode` parameter in LeannBuilder (original doesn't have it) - -## Implementation Notes - -1. **Parameter Names**: The original files use `is_compact` and `is_recompute`, not the newer names. - -2. **Chunk Overlap**: Most examples use `25` except for documents which uses `128`. - -3. **Embedding Mode**: Only `google_history_reader_leann.py` and `main_cli_example.py` have this parameter. - -4. **Max Items**: Each example has different defaults: - - Email/Browser: 1000 - - WeChat: 50 - - Documents: unlimited - -5. **Special Cases**: - - WeChat uses a specific Chinese embedding model - - Email reader includes HTML processing option diff --git a/examples/google_history_reader_leann.py b/examples/google_history_reader_leann.py deleted file mode 100644 index 62781a4..0000000 --- a/examples/google_history_reader_leann.py +++ /dev/null @@ -1,362 +0,0 @@ -import argparse -import asyncio -import os - -try: - import dotenv - - dotenv.load_dotenv() -except ModuleNotFoundError: - # python-dotenv is not installed; skip loading environment variables - dotenv = None -from pathlib import Path - -from leann.api import LeannBuilder, LeannChat -from llama_index.core.node_parser import SentenceSplitter - -# dotenv.load_dotenv() # handled above if python-dotenv is available - -# Default Chrome profile path -DEFAULT_CHROME_PROFILE = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default") - - -def create_leann_index_from_multiple_chrome_profiles( - profile_dirs: list[Path], - index_path: str = "chrome_history_index.leann", - max_count: int = -1, - embedding_model: str = "facebook/contriever", - embedding_mode: str = "sentence-transformers", -): - """ - Create LEANN index from multiple Chrome profile data sources. - - Args: - profile_dirs: List of Path objects pointing to Chrome profile directories - index_path: Path to save the LEANN index - max_count: Maximum number of history entries to process per profile - embedding_model: The embedding model to use - embedding_mode: The embedding backend mode - """ - print("Creating LEANN index from multiple Chrome profile data sources...") - - # Load documents using ChromeHistoryReader from history_data - from history_data.history import ChromeHistoryReader - - reader = ChromeHistoryReader() - - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - all_documents = [] - total_processed = 0 - - # Process each Chrome profile directory - for i, profile_dir in enumerate(profile_dirs): - print(f"\nProcessing Chrome profile {i + 1}/{len(profile_dirs)}: {profile_dir}") - - try: - documents = reader.load_data( - chrome_profile_path=str(profile_dir), max_count=max_count - ) - if documents: - print(f"Loaded {len(documents)} history documents from {profile_dir}") - all_documents.extend(documents) - total_processed += len(documents) - - # Check if we've reached the max count - if max_count > 0 and total_processed >= max_count: - print(f"Reached max count of {max_count} documents") - break - else: - print(f"No documents loaded from {profile_dir}") - except Exception as e: - print(f"Error processing {profile_dir}: {e}") - continue - - if not all_documents: - print("No documents loaded from any source. Exiting.") - # highlight info that you need to close all chrome browser before running this script and high light the instruction!! - print( - "\033[91mYou need to close or quit all chrome browser before running this script\033[0m" - ) - return None - - print( - f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles" - ) - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in all_documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - text = node.get_content() - # text = '[Title] ' + doc.metadata["title"] + '\n' + text - all_texts.append(text) - - print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents") - - # Create LEANN index directory - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric - builder = LeannBuilder( - backend_name="hnsw", - embedding_model=embedding_model, - embedding_mode=embedding_mode, - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} history chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -def create_leann_index( - profile_path: str | None = None, - index_path: str = "chrome_history_index.leann", - max_count: int = 1000, - embedding_model: str = "facebook/contriever", - embedding_mode: str = "sentence-transformers", -): - """ - Create LEANN index from Chrome history data. - - Args: - profile_path: Path to the Chrome profile directory (optional, uses default if None) - index_path: Path to save the LEANN index - max_count: Maximum number of history entries to process - embedding_model: The embedding model to use - embedding_mode: The embedding backend mode - """ - print("Creating LEANN index from Chrome history data...") - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Load documents using ChromeHistoryReader from history_data - from history_data.history import ChromeHistoryReader - - reader = ChromeHistoryReader() - - documents = reader.load_data(chrome_profile_path=profile_path, max_count=max_count) - - if not documents: - print("No documents loaded. Exiting.") - return None - - print(f"Loaded {len(documents)} history documents") - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - all_texts.append(node.get_content()) - - print(f"Created {len(all_texts)} text chunks from {len(documents)} documents") - - # Create LEANN index directory - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric - builder = LeannBuilder( - backend_name="hnsw", - embedding_model=embedding_model, - embedding_mode=embedding_mode, - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} history chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -async def query_leann_index(index_path: str, query: str): - """ - Query the LEANN index. - - Args: - index_path: Path to the LEANN index - query: The query string - """ - print("\n[PHASE 2] Starting Leann chat session...") - chat = LeannChat(index_path=index_path) - - print(f"You: {query}") - chat_response = chat.ask( - query, - top_k=10, - recompute_beighbor_embeddings=True, - complexity=32, - beam_width=1, - llm_config={ - "type": "openai", - "model": "gpt-4o", - "api_key": os.getenv("OPENAI_API_KEY"), - }, - llm_kwargs={"temperature": 0.0, "max_tokens": 1000}, - ) - - print(f"Leann chat response: \033[36m{chat_response}\033[0m") - - -async def main(): - # Parse command line arguments - parser = argparse.ArgumentParser( - description="LEANN Chrome History Reader - Create and query browser history index" - ) - parser.add_argument( - "--chrome-profile", - type=str, - default=DEFAULT_CHROME_PROFILE, - help=f"Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this", - ) - parser.add_argument( - "--index-dir", - type=str, - default="./google_history_index", - help="Directory to store the LEANN index (default: ./chrome_history_index_leann_test)", - ) - parser.add_argument( - "--max-entries", - type=int, - default=1000, - help="Maximum number of history entries to process (default: 1000)", - ) - parser.add_argument( - "--query", - type=str, - default=None, - help="Single query to run (default: runs example queries)", - ) - parser.add_argument( - "--auto-find-profiles", - action="store_true", - default=True, - help="Automatically find all Chrome profiles (default: True)", - ) - parser.add_argument( - "--embedding-model", - type=str, - default="facebook/contriever", - help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small')", - ) - parser.add_argument( - "--embedding-mode", - type=str, - default="sentence-transformers", - choices=["sentence-transformers", "openai", "mlx"], - help="The embedding backend mode", - ) - parser.add_argument( - "--use-existing-index", - action="store_true", - help="Use existing index without rebuilding", - ) - - args = parser.parse_args() - - INDEX_DIR = Path(args.index_dir) - INDEX_PATH = str(INDEX_DIR / "chrome_history.leann") - - print(f"Using Chrome profile: {args.chrome_profile}") - print(f"Index directory: {INDEX_DIR}") - print(f"Max entries: {args.max_entries}") - - if args.use_existing_index: - # Use existing index without rebuilding - if not Path(INDEX_PATH).exists(): - print(f"Error: Index file not found at {INDEX_PATH}") - return - print(f"Using existing index at {INDEX_PATH}") - index_path = INDEX_PATH - else: - # Find Chrome profile directories - from history_data.history import ChromeHistoryReader - - if args.auto_find_profiles: - profile_dirs = ChromeHistoryReader.find_chrome_profiles() - if not profile_dirs: - print("No Chrome profiles found automatically. Exiting.") - return - else: - # Use single specified profile - profile_path = Path(args.chrome_profile) - if not profile_path.exists(): - print(f"Chrome profile not found: {profile_path}") - return - profile_dirs = [profile_path] - - # Create or load the LEANN index from all sources - index_path = create_leann_index_from_multiple_chrome_profiles( - profile_dirs, INDEX_PATH, args.max_entries, args.embedding_model, args.embedding_mode - ) - - if index_path: - if args.query: - # Run single query - await query_leann_index(index_path, args.query) - else: - # Example queries - queries = [ - "What websites did I visit about machine learning?", - "Find my search history about programming", - ] - - for query in queries: - print("\n" + "=" * 60) - await query_leann_index(index_path, query) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/mail_reader_leann.py b/examples/mail_reader_leann.py deleted file mode 100644 index 6aa7536..0000000 --- a/examples/mail_reader_leann.py +++ /dev/null @@ -1,342 +0,0 @@ -import argparse -import asyncio -import os -import sys -from pathlib import Path - -import dotenv - -# Add the project root to Python path so we can import from examples -project_root = Path(__file__).parent.parent -sys.path.insert(0, str(project_root)) - -from leann.api import LeannBuilder, LeannChat -from llama_index.core.node_parser import SentenceSplitter - -dotenv.load_dotenv() - - -# Auto-detect user's mail path -def get_mail_path(): - """Get the mail path for the current user""" - home_dir = os.path.expanduser("~") - return os.path.join(home_dir, "Library", "Mail") - - -# Default mail path for macOS -DEFAULT_MAIL_PATH = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data" - - -def create_leann_index_from_multiple_sources( - messages_dirs: list[Path], - index_path: str = "mail_index.leann", - max_count: int = -1, - include_html: bool = False, - embedding_model: str = "facebook/contriever", -): - """ - Create LEANN index from multiple mail data sources. - - Args: - messages_dirs: List of Path objects pointing to Messages directories - index_path: Path to save the LEANN index - max_count: Maximum number of emails to process per directory - include_html: Whether to include HTML content in email processing - """ - print("Creating LEANN index from multiple mail data sources...") - - # Load documents using EmlxReader from LEANN_email_reader - from examples.email_data.LEANN_email_reader import EmlxReader - - reader = EmlxReader(include_html=include_html) - # from email_data.email import EmlxMboxReader - # from pathlib import Path - # reader = EmlxMboxReader() - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - all_documents = [] - total_processed = 0 - - # Process each Messages directory - for i, messages_dir in enumerate(messages_dirs): - print(f"\nProcessing Messages directory {i + 1}/{len(messages_dirs)}: {messages_dir}") - - try: - documents = reader.load_data(messages_dir) - if documents: - print(f"Loaded {len(documents)} email documents from {messages_dir}") - all_documents.extend(documents) - total_processed += len(documents) - - # Check if we've reached the max count - if max_count > 0 and total_processed >= max_count: - print(f"Reached max count of {max_count} documents") - break - else: - print(f"No documents loaded from {messages_dir}") - except Exception as e: - print(f"Error processing {messages_dir}: {e}") - continue - - if not all_documents: - print("No documents loaded from any source. Exiting.") - return None - - print( - f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories and starting to split them into chunks" - ) - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in all_documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - text = node.get_content() - # text = '[subject] ' + doc.metadata["subject"] + '\n' + text - all_texts.append(text) - - print( - f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks" - ) - - # Create LEANN index directory - - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - builder = LeannBuilder( - backend_name="hnsw", - embedding_model=embedding_model, - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} email chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -def create_leann_index( - mail_path: str, - index_path: str = "mail_index.leann", - max_count: int = 1000, - include_html: bool = False, - embedding_model: str = "facebook/contriever", -): - """ - Create LEANN index from mail data. - - Args: - mail_path: Path to the mail directory - index_path: Path to save the LEANN index - max_count: Maximum number of emails to process - include_html: Whether to include HTML content in email processing - """ - print("Creating LEANN index from mail data...") - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Load documents using EmlxReader from LEANN_email_reader - from examples.email_data.LEANN_email_reader import EmlxReader - - reader = EmlxReader(include_html=include_html) - # from email_data.email import EmlxMboxReader - # from pathlib import Path - # reader = EmlxMboxReader() - documents = reader.load_data(Path(mail_path)) - - if not documents: - print("No documents loaded. Exiting.") - return None - - print(f"Loaded {len(documents)} email documents") - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - all_texts.append(node.get_content()) - - print(f"Created {len(all_texts)} text chunks from {len(documents)} documents") - - # Create LEANN index directory - - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - builder = LeannBuilder( - backend_name="hnsw", - embedding_model=embedding_model, - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} email chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -async def query_leann_index(index_path: str, query: str): - """ - Query the LEANN index. - - Args: - index_path: Path to the LEANN index - query: The query string - """ - print("\n[PHASE 2] Starting Leann chat session...") - chat = LeannChat(index_path=index_path, llm_config={"type": "openai", "model": "gpt-4o"}) - - print(f"You: {query}") - import time - - time.time() - chat_response = chat.ask( - query, - top_k=20, - recompute_beighbor_embeddings=True, - complexity=32, - beam_width=1, - ) - time.time() - # print(f"Time taken: {end_time - start_time} seconds") - # highlight the answer - print(f"Leann chat response: \033[36m{chat_response}\033[0m") - - -async def main(): - # Parse command line arguments - parser = argparse.ArgumentParser(description="LEANN Mail Reader - Create and query email index") - # Remove --mail-path argument and auto-detect all Messages directories - # Remove DEFAULT_MAIL_PATH - parser.add_argument( - "--index-dir", - type=str, - default="./mail_index", - help="Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)", - ) - parser.add_argument( - "--max-emails", - type=int, - default=1000, - help="Maximum number of emails to process (-1 means all)", - ) - parser.add_argument( - "--query", - type=str, - default="Give me some funny advertisement about apple or other companies", - help="Single query to run (default: runs example queries)", - ) - parser.add_argument( - "--include-html", - action="store_true", - default=False, - help="Include HTML content in email processing (default: False)", - ) - parser.add_argument( - "--embedding-model", - type=str, - default="facebook/contriever", - help="Embedding model to use (default: facebook/contriever)", - ) - - args = parser.parse_args() - - print(f"args: {args}") - - # Automatically find all Messages directories under the current user's Mail directory - from examples.email_data.LEANN_email_reader import find_all_messages_directories - - mail_path = get_mail_path() - print(f"Searching for email data in: {mail_path}") - messages_dirs = find_all_messages_directories(mail_path) - # messages_dirs = find_all_messages_directories(DEFAULT_MAIL_PATH) - # messages_dirs = [DEFAULT_MAIL_PATH] - # messages_dirs = messages_dirs[:1] - - print("len(messages_dirs): ", len(messages_dirs)) - - if not messages_dirs: - print("No Messages directories found. Exiting.") - return - - INDEX_DIR = Path(args.index_dir) - INDEX_PATH = str(INDEX_DIR / "mail_documents.leann") - print(f"Index directory: {INDEX_DIR}") - print(f"Found {len(messages_dirs)} Messages directories.") - - # Create or load the LEANN index from all sources - index_path = create_leann_index_from_multiple_sources( - messages_dirs, - INDEX_PATH, - args.max_emails, - args.include_html, - args.embedding_model, - ) - - if index_path: - if args.query: - # Run single query - await query_leann_index(index_path, args.query) - else: - # Example queries - queries = [ - "Hows Berkeley Graduate Student Instructor", - "how's the icloud related advertisement saying", - "Whats the number of class recommend to take per semester for incoming EECS students", - ] - for query in queries: - print("\n" + "=" * 60) - await query_leann_index(index_path, query) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/main_cli_example.py b/examples/main_cli_example.py deleted file mode 100644 index 8bdd83e..0000000 --- a/examples/main_cli_example.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 -""" -This script has been replaced by document_rag.py with a unified interface. -This file is kept for backward compatibility. -""" - -import sys - -print("=" * 70) -print("NOTICE: This script has been replaced!") -print("=" * 70) -print("\nThe examples have been refactored with a unified interface.") -print("Please use the new script instead:\n") -print(" python examples/document_rag.py") -print("\nThe new script provides:") -print(" ✓ Consistent parameters across all examples") -print(" ✓ Better error handling") -print(" ✓ Interactive mode support") -print(" ✓ More customization options") -print("\nExample usage:") -print(' python examples/document_rag.py --query "What are the main techniques?"') -print(" python examples/document_rag.py # For interactive mode") -print("\nSee README.md for full documentation.") -print("=" * 70) - -# If user passed arguments, show how to use them with new script -if len(sys.argv) > 1: - print("\nTo use your arguments with the new script:") - print(f" python examples/document_rag.py {' '.join(sys.argv[1:])}") - -sys.exit(1) diff --git a/examples/wechat_history_reader_leann.py b/examples/wechat_history_reader_leann.py deleted file mode 100644 index ef68c6b..0000000 --- a/examples/wechat_history_reader_leann.py +++ /dev/null @@ -1,320 +0,0 @@ -import argparse -import asyncio -import os -from pathlib import Path - -import dotenv -from leann.api import LeannBuilder, LeannChat -from llama_index.core.node_parser import SentenceSplitter - -dotenv.load_dotenv() - -# Default WeChat export directory -DEFAULT_WECHAT_EXPORT_DIR = "./wechat_export_direct" - - -def create_leann_index_from_multiple_wechat_exports( - export_dirs: list[Path], - index_path: str = "wechat_history_index.leann", - max_count: int = -1, -): - """ - Create LEANN index from multiple WeChat export data sources. - - Args: - export_dirs: List of Path objects pointing to WeChat export directories - index_path: Path to save the LEANN index - max_count: Maximum number of chat entries to process per export - """ - print("Creating LEANN index from multiple WeChat export data sources...") - - # Load documents using WeChatHistoryReader from history_data - from history_data.wechat_history import WeChatHistoryReader - - reader = WeChatHistoryReader() - - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - all_documents = [] - total_processed = 0 - - # Process each WeChat export directory - for i, export_dir in enumerate(export_dirs): - print(f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}") - - try: - documents = reader.load_data( - wechat_export_dir=str(export_dir), - max_count=max_count, - concatenate_messages=True, # Disable concatenation - one message per document - ) - if documents: - print(f"Loaded {len(documents)} chat documents from {export_dir}") - all_documents.extend(documents) - total_processed += len(documents) - - # Check if we've reached the max count - if max_count > 0 and total_processed >= max_count: - print(f"Reached max count of {max_count} documents") - break - else: - print(f"No documents loaded from {export_dir}") - except Exception as e: - print(f"Error processing {export_dir}: {e}") - continue - - if not all_documents: - print("No documents loaded from any source. Exiting.") - return None - - print( - f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports and starting to split them into chunks" - ) - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=192, chunk_overlap=64) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in all_documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - text = ( - "[Contact] means the message is from: " - + doc.metadata["contact_name"] - + "\n" - + node.get_content() - ) - all_texts.append(text) - - print( - f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks" - ) - - # Create LEANN index directory - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - builder = LeannBuilder( - backend_name="hnsw", - embedding_model="Qwen/Qwen3-Embedding-0.6B", - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} chat chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -def create_leann_index( - export_dir: str | None = None, - index_path: str = "wechat_history_index.leann", - max_count: int = 1000, -): - """ - Create LEANN index from WeChat chat history data. - - Args: - export_dir: Path to the WeChat export directory (optional, uses default if None) - index_path: Path to save the LEANN index - max_count: Maximum number of chat entries to process - """ - print("Creating LEANN index from WeChat chat history data...") - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Load documents using WeChatHistoryReader from history_data - from history_data.wechat_history import WeChatHistoryReader - - reader = WeChatHistoryReader() - - documents = reader.load_data( - wechat_export_dir=export_dir, - max_count=max_count, - concatenate_messages=False, # Disable concatenation - one message per document - ) - - if not documents: - print("No documents loaded. Exiting.") - return None - - print(f"Loaded {len(documents)} chat documents") - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - all_texts.append(node.get_content()) - - print(f"Created {len(all_texts)} text chunks from {len(documents)} documents") - - # Create LEANN index directory - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - builder = LeannBuilder( - backend_name="hnsw", - embedding_model="mlx-community/Qwen3-Embedding-0.6B-4bit-DWQ", # MLX-optimized model - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} chat chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -async def query_leann_index(index_path: str, query: str): - """ - Query the LEANN index. - - Args: - index_path: Path to the LEANN index - query: The query string - """ - print("\n[PHASE 2] Starting Leann chat session...") - chat = LeannChat(index_path=index_path) - - print(f"You: {query}") - chat_response = chat.ask( - query, - top_k=20, - recompute_beighbor_embeddings=True, - complexity=16, - beam_width=1, - llm_config={ - "type": "openai", - "model": "gpt-4o", - "api_key": os.getenv("OPENAI_API_KEY"), - }, - llm_kwargs={"temperature": 0.0, "max_tokens": 1000}, - ) - print(f"Leann chat response: \033[36m{chat_response}\033[0m") - - -async def main(): - """Main function with integrated WeChat export functionality.""" - - # Parse command line arguments - parser = argparse.ArgumentParser( - description="LEANN WeChat History Reader - Create and query WeChat chat history index" - ) - parser.add_argument( - "--export-dir", - type=str, - default=DEFAULT_WECHAT_EXPORT_DIR, - help=f"Directory to store WeChat exports (default: {DEFAULT_WECHAT_EXPORT_DIR})", - ) - parser.add_argument( - "--index-dir", - type=str, - default="./wechat_history_magic_test_11Debug_new", - help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)", - ) - parser.add_argument( - "--max-entries", - type=int, - default=50, - help="Maximum number of chat entries to process (default: 5000)", - ) - parser.add_argument( - "--query", - type=str, - default=None, - help="Single query to run (default: runs example queries)", - ) - parser.add_argument( - "--force-export", - action="store_true", - default=False, - help="Force re-export of WeChat data even if exports exist", - ) - - args = parser.parse_args() - - INDEX_DIR = Path(args.index_dir) - INDEX_PATH = str(INDEX_DIR / "wechat_history.leann") - - print(f"Using WeChat export directory: {args.export_dir}") - print(f"Index directory: {INDEX_DIR}") - print(f"Max entries: {args.max_entries}") - - # Initialize WeChat reader with export capabilities - from history_data.wechat_history import WeChatHistoryReader - - reader = WeChatHistoryReader() - - # Find existing exports or create new ones using the centralized method - export_dirs = reader.find_or_export_wechat_data(args.export_dir) - if not export_dirs: - print("Failed to find or export WeChat data. Exiting.") - return - - # Create or load the LEANN index from all sources - index_path = create_leann_index_from_multiple_wechat_exports( - export_dirs, INDEX_PATH, max_count=args.max_entries - ) - - if index_path: - if args.query: - # Run single query - await query_leann_index(index_path, args.query) - else: - # Example queries - queries = [ - "我想买魔术师约翰逊的球衣,给我一些对应聊天记录?", - ] - - for query in queries: - print("\n" + "=" * 60) - await query_leann_index(index_path, query) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/tests/README.md b/tests/README.md index f73eb21..22822bd 100644 --- a/tests/README.md +++ b/tests/README.md @@ -19,7 +19,7 @@ Basic functionality tests that verify: - Uses parametrized tests to test both backends ### `test_document_rag.py` -Tests the document RAG example functionality (formerly main_cli_example): +Tests the document RAG example functionality: - Tests with facebook/contriever embeddings - Tests with OpenAI embeddings (if API key is available) - Tests error handling with invalid parameters diff --git a/tests/test_ci_minimal.py b/tests/test_ci_minimal.py index 4207802..072123c 100644 --- a/tests/test_ci_minimal.py +++ b/tests/test_ci_minimal.py @@ -20,7 +20,7 @@ def test_package_imports(): def test_cli_help(): """Test that CLI example shows help.""" result = subprocess.run( - [sys.executable, "examples/main_cli_example.py", "--help"], capture_output=True, text=True + [sys.executable, "examples/document_rag.py", "--help"], capture_output=True, text=True ) assert result.returncode == 0 diff --git a/tests/test_document_rag.py b/tests/test_document_rag.py index 3ce64f4..26007a8 100644 --- a/tests/test_document_rag.py +++ b/tests/test_document_rag.py @@ -118,15 +118,3 @@ def test_document_rag_error_handling(test_data_dir): # Should fail with invalid LLM type assert result.returncode != 0 assert "invalid choice" in result.stderr or "invalid_llm_type" in result.stderr - - -def test_main_cli_backward_compatibility(): - """Test that main_cli_example.py shows migration message.""" - cmd = [sys.executable, "examples/main_cli_example.py", "--help"] - - result = subprocess.run(cmd, capture_output=True, text=True, timeout=60) - - # Should exit with error code and show migration message - assert result.returncode != 0 - assert "This script has been replaced" in result.stdout - assert "document_rag.py" in result.stdout