From 88997349528acfe26b96d30f67214a933623fe38 Mon Sep 17 00:00:00 2001
From: Andy Lee
-### 📦 Prerequisites: Install uv (if you don't have it)
+### 📦 Prerequisites: Install uv
-Install uv first if you don't have it:
+[Install uv](https://docs.astral.sh/uv/getting-started/installation/#installation-methods) first if you don't have it. Typically, you can install it with:
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```
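After installation, restart your shell (or source your profile) and confirm `uv` is on your PATH:

```bash
uv --version
```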
-📖 [Detailed uv installation methods →](https://docs.astral.sh/uv/getting-started/installation/#installation-methods)
+### 🚀 Quick Install
-
+🔧 Build from Source (Recommended for development)
+
+
+
```bash
-git clone git@github.com:yichuan-w/LEANN.git leann
+git clone https://github.com/yichuan-w/LEANN.git leann
cd leann
git submodule update --init --recursive
```
@@ -91,14 +91,14 @@ sudo apt-get install libomp-dev libboost-all-dev protobuf-compiler libabsl-dev l
uv sync
```
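Once `uv sync` finishes, a quick way to sanity-check the build is the bundled demo (renamed from `simple_demo.py` in this patch):

```bash
uv run python examples/basic_demo.py
```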
-
+🔑 OpenAI API Setup (Default)
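The example apps call `dotenv.load_dotenv()` on startup, so the key can come from your shell or from a `.env` file in the repo root:

```bash
export OPENAI_API_KEY="sk-..."  # or put OPENAI_API_KEY=... in .env
```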
@@ -166,7 +166,49 @@ ollama pull llama3.2:1b
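If you would rather stay fully local, the same apps can target Ollama instead. A minimal sketch (model name from the pull command above, flags from the parameter list below):

```bash
ollama pull llama3.2:1b
python -m apps.document_rag --llm ollama --llm-model llama3.2:1b
```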
📖 Click to expand: Common Parameters (Available in All Examples)
+
+All RAG examples share these common parameters. **Interactive mode** is available in every example: simply run without `--query` to start a continuous Q&A session where you can ask multiple questions, then type 'quit' to exit.
+
+```bash
+# Core Parameters (General preprocessing for all examples)
+--index-dir DIR # Directory to store the index (default: current directory)
+--query "YOUR QUESTION" # Single-query mode. Omit to chat with your index interactively (type 'quit' to exit)
+--max-items N # Maximum number of items to index (default: -1, process all data)
+--force-rebuild # Force rebuild index even if it exists
+
+# Embedding Parameters
+--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small or mlx-community/multilingual-e5-base-mlx
+--embedding-mode MODE # sentence-transformers, openai, or mlx
+
+# LLM Parameters (Text generation models)
+--llm TYPE # LLM backend: openai, ollama, or hf (default: openai)
+--llm-model MODEL # Model name (default: gpt-4o) e.g., gpt-4o-mini, llama3.2:1b, Qwen/Qwen2.5-1.5B-Instruct
+
+# Search Parameters
+--top-k N # Number of results to retrieve (default: 20)
+--search-complexity N # Search complexity for graph traversal (default: 32)
+
+# Chunking Parameters
+--chunk-size N # Size of text chunks (default varies by source: 256 for most, 192 for WeChat)
+--chunk-overlap N # Overlap between chunks (default varies: 25-128 depending on source)
+
+# Index Building Parameters
+--backend-name NAME # Backend to use: hnsw or diskann (default: hnsw)
+--graph-degree N # Graph degree for index construction (default: 32)
+--build-complexity N # Build complexity for index construction (default: 64)
+--no-compact # Disable compact index storage (compact storage is enabled by default to save space)
+--no-recompute # Disable embedding recomputation (recomputation is enabled by default to save storage)
+```
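For instance, a typical end-to-end run combining the flags above might look like this (index path, model choice, and question are illustrative):

```bash
python -m apps.document_rag \
  --index-dir ./my_index \
  --llm openai --llm-model gpt-4o-mini \
  --top-k 10 \
  --query "What are the main techniques LEANN uses?"

# The same command without --query drops into interactive chat
python -m apps.document_rag --index-dir ./my_index
```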
+
+
-**Note:** You need to grant full disk access to your terminal/VS Code in System Preferences → Privacy & Security → Full Disk Access.
+Before running the example below, you need to grant Full Disk Access to your terminal/VS Code in System Preferences → Privacy & Security → Full Disk Access.
+
```bash
-python examples/mail_reader_leann.py --query "What's the food I ordered by DoorDash or Uber Eats mostly?"
+python -m apps.email_rag --query "What's the food I ordered by DoorDash or Uber Eats mostly?"
```
**780K email chunks → 78MB storage.** Finally, search your email like you search Google.
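The same entry point works without `--query` when you want to keep digging:

```bash
python -m apps.email_rag  # interactive chat over your mailbox; type 'quit' to exit
```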
diff --git a/apps/__init__.py b/apps/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apps/base_rag_example.py b/apps/base_rag_example.py
new file mode 100644
index 0000000..a135625
--- /dev/null
+++ b/apps/base_rag_example.py
@@ -0,0 +1,296 @@
+"""
+Base class for the unified RAG examples interface.
+Provides common parameters and functionality for all RAG examples.
+"""
+
+import argparse
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any
+
+import dotenv
+from leann.api import LeannBuilder, LeannChat
+from llama_index.core.node_parser import SentenceSplitter
+
+dotenv.load_dotenv()
+
+
+class BaseRAGExample(ABC):
+    """Base class for all RAG examples with unified interface."""
+
+    def __init__(
+        self,
+        name: str,
+        description: str,
+        default_index_name: str,
+    ):
+        self.name = name
+        self.description = description
+        self.default_index_name = default_index_name
+        self.parser = self._create_parser()
+
+    def _create_parser(self) -> argparse.ArgumentParser:
+        """Create argument parser with common parameters."""
+        parser = argparse.ArgumentParser(
+            description=self.description, formatter_class=argparse.RawDescriptionHelpFormatter
+        )
+
+        # Core parameters (all examples share these)
+        core_group = parser.add_argument_group("Core Parameters")
+        core_group.add_argument(
+            "--index-dir",
+            type=str,
+            default=f"./{self.default_index_name}",
+            help=f"Directory to store the index (default: ./{self.default_index_name})",
+        )
+        core_group.add_argument(
+            "--query",
+            type=str,
+            default=None,
+            help="Query to run (if not provided, will run in interactive mode)",
+        )
+        # Allow subclasses to override default max_items
+        max_items_default = getattr(self, "max_items_default", -1)
+        core_group.add_argument(
+            "--max-items",
+            type=int,
+            default=max_items_default,
+            help="Maximum number of items to process (default: -1, index all documents; set a smaller number for a first try on a large dataset)",
+        )
+        core_group.add_argument(
+            "--force-rebuild", action="store_true", help="Force rebuild index even if it exists"
+        )
+
+        # Embedding parameters
+        embedding_group = parser.add_argument_group("Embedding Parameters")
+        # Allow subclasses to override default embedding_model
+        embedding_model_default = getattr(self, "embedding_model_default", "facebook/contriever")
+        embedding_group.add_argument(
+            "--embedding-model",
+            type=str,
+            default=embedding_model_default,
+            help=f"Embedding model to use (default: {embedding_model_default})",
+        )
+        embedding_group.add_argument(
+            "--embedding-mode",
+            type=str,
+            default="sentence-transformers",
+            choices=["sentence-transformers", "openai", "mlx"],
+            help="Embedding backend mode (default: sentence-transformers)",
+        )
+
+        # LLM parameters
+        llm_group = parser.add_argument_group("LLM Parameters")
+        llm_group.add_argument(
+            "--llm",
+            type=str,
+            default="openai",
+            choices=["openai", "ollama", "hf"],
+            help="LLM backend to use (default: openai)",
+        )
+        llm_group.add_argument(
+            "--llm-model",
+            type=str,
+            default=None,
+            help="LLM model name (default: gpt-4o for openai, llama3.2:1b for ollama)",
+        )
+        llm_group.add_argument(
+            "--llm-host",
+            type=str,
+            default="http://localhost:11434",
+            help="Host for Ollama API (default: http://localhost:11434)",
+        )
+
+        # Search parameters
+        search_group = parser.add_argument_group("Search Parameters")
+        search_group.add_argument(
+            "--top-k", type=int, default=20, help="Number of results to retrieve (default: 20)"
+        )
+        search_group.add_argument(
+            "--search-complexity",
+            type=int,
+            default=32,
+            help="Search complexity for graph traversal (default: 32)",
+        )
+
+        # Index building parameters
+        index_group = parser.add_argument_group("Index Building Parameters")
+        index_group.add_argument(
+            "--backend-name",
+            type=str,
+            default="hnsw",
+            choices=["hnsw", "diskann"],
+            help="Backend to use for index (default: hnsw)",
+        )
+        index_group.add_argument(
+            "--graph-degree",
+            type=int,
+            default=32,
+            help="Graph degree for index construction (default: 32)",
+        )
+        index_group.add_argument(
+            "--build-complexity",
+            type=int,
+            default=64,
+            help="Build complexity for index construction (default: 64)",
+        )
+        index_group.add_argument(
+            "--no-compact",
+            action="store_true",
+            help="Disable compact index storage",
+        )
+        index_group.add_argument(
+            "--no-recompute",
+            action="store_true",
+            help="Disable embedding recomputation",
+        )
+
+        # Add source-specific parameters
+        self._add_specific_arguments(parser)
+
+        return parser
+
+    @abstractmethod
+    def _add_specific_arguments(self, parser: argparse.ArgumentParser):
+        """Add source-specific arguments. Override in subclasses."""
+        pass
+
+    @abstractmethod
+    async def load_data(self, args) -> list[str]:
+        """Load data from the source. Returns list of text chunks."""
+        pass
+
+    def get_llm_config(self, args) -> dict[str, Any]:
+        """Get LLM configuration based on arguments."""
+        config = {"type": args.llm}
+
+        if args.llm == "openai":
+            config["model"] = args.llm_model or "gpt-4o"
+        elif args.llm == "ollama":
+            config["model"] = args.llm_model or "llama3.2:1b"
+            config["host"] = args.llm_host
+        elif args.llm == "hf":
+            config["model"] = args.llm_model or "Qwen/Qwen2.5-1.5B-Instruct"
+
+        return config
+
+    async def build_index(self, args, texts: list[str]) -> str:
+        """Build LEANN index from texts."""
+        index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
+
+        print(f"\n[Building Index] Creating {self.name} index...")
+        print(f"Total text chunks: {len(texts)}")
+
+        builder = LeannBuilder(
+            backend_name=args.backend_name,
+            embedding_model=args.embedding_model,
+            embedding_mode=args.embedding_mode,
+            graph_degree=args.graph_degree,
+            complexity=args.build_complexity,
+            is_compact=not args.no_compact,
+            is_recompute=not args.no_recompute,
+            num_threads=1,  # Force single-threaded mode
+        )
+
+        # Add texts in batches for better progress tracking
+        batch_size = 1000
+        for i in range(0, len(texts), batch_size):
+            batch = texts[i : i + batch_size]
+            for text in batch:
+                builder.add_text(text)
+            print(f"Added {min(i + batch_size, len(texts))}/{len(texts)} texts...")
+
+        print("Building index structure...")
+        builder.build_index(index_path)
+        print(f"Index saved to: {index_path}")
+
+        return index_path
+
+    async def run_interactive_chat(self, args, index_path: str):
+        """Run interactive chat with the index."""
+        chat = LeannChat(
+            index_path,
+            llm_config=self.get_llm_config(args),
+            system_prompt=f"You are a helpful assistant that answers questions about {self.name} data.",
+            complexity=args.search_complexity,
+        )
+
+        print(f"\n[Interactive Mode] Chat with your {self.name} data!")
+        print("Type 'quit' or 'exit' to stop.\n")
+
+        while True:
+            try:
+                query = input("You: ").strip()
+                if query.lower() in ["quit", "exit", "q"]:
+                    print("Goodbye!")
+                    break
+
+                if not query:
+                    continue
+
+                response = chat.ask(query, top_k=args.top_k, complexity=args.search_complexity)
+                print(f"\nAssistant: {response}\n")
+
+            except KeyboardInterrupt:
+                print("\nGoodbye!")
+                break
+            except Exception as e:
+                print(f"Error: {e}")
+
+    async def run_single_query(self, args, index_path: str, query: str):
+        """Run a single query against the index."""
+        chat = LeannChat(
+            index_path,
+            llm_config=self.get_llm_config(args),
+            system_prompt=f"You are a helpful assistant that answers questions about {self.name} data.",
+            complexity=args.search_complexity,
+        )
+
+        print(f"\n[Query]: \033[36m{query}\033[0m")
+        response = chat.ask(query, top_k=args.top_k, complexity=args.search_complexity)
+        print(f"\n[Response]: \033[36m{response}\033[0m")
+
+    async def run(self):
+        """Main entry point for the example."""
+        args = self.parser.parse_args()
+
+        # Check if index exists
+        index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
+        index_exists = Path(args.index_dir).exists()
+
+        if not index_exists or args.force_rebuild:
+            # Load data and build index
+            print(f"\n{'Rebuilding' if index_exists else 'Building'} index...")
+            texts = await self.load_data(args)
+
+            if not texts:
+                print("No data found to index!")
+                return
+
+            index_path = await self.build_index(args, texts)
+        else:
+            print(f"\nUsing existing index in {args.index_dir}")
+
+        # Run query or interactive mode
+        if args.query:
+            await self.run_single_query(args, index_path, args.query)
+        else:
+            await self.run_interactive_chat(args, index_path)
+
+
+def create_text_chunks(documents, chunk_size=256, chunk_overlap=25) -> list[str]:
+    """Helper function to create text chunks from documents."""
+    node_parser = SentenceSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        separator=" ",
+        paragraph_separator="\n\n",
+    )
+
+    all_texts = []
+    for doc in documents:
+        nodes = node_parser.get_nodes_from_documents([doc])
+        if nodes:
+            all_texts.extend(node.get_content() for node in nodes)
+
+    return all_texts
diff --git a/apps/browser_rag.py b/apps/browser_rag.py
new file mode 100644
index 0000000..d115510
--- /dev/null
+++ b/apps/browser_rag.py
@@ -0,0 +1,170 @@
+"""
+Browser History RAG example using the unified interface.
+Supports Chrome browser history.
+""" + +import os +import sys +from pathlib import Path + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from base_rag_example import BaseRAGExample, create_text_chunks + +from .history_data.history import ChromeHistoryReader + + +class BrowserRAG(BaseRAGExample): + """RAG example for Chrome browser history.""" + + def __init__(self): + # Set default values BEFORE calling super().__init__ + self.embedding_model_default = ( + "sentence-transformers/all-MiniLM-L6-v2" # Fast 384-dim model + ) + + super().__init__( + name="Browser History", + description="Process and query Chrome browser history with LEANN", + default_index_name="google_history_index", + ) + + def _add_specific_arguments(self, parser): + """Add browser-specific arguments.""" + browser_group = parser.add_argument_group("Browser Parameters") + browser_group.add_argument( + "--chrome-profile", + type=str, + default=None, + help="Path to Chrome profile directory (auto-detected if not specified)", + ) + browser_group.add_argument( + "--auto-find-profiles", + action="store_true", + default=True, + help="Automatically find all Chrome profiles (default: True)", + ) + browser_group.add_argument( + "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)" + ) + browser_group.add_argument( + "--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)" + ) + + def _get_chrome_base_path(self) -> Path: + """Get the base Chrome profile path based on OS.""" + if sys.platform == "darwin": + return Path.home() / "Library" / "Application Support" / "Google" / "Chrome" + elif sys.platform.startswith("linux"): + return Path.home() / ".config" / "google-chrome" + elif sys.platform == "win32": + return Path(os.environ["LOCALAPPDATA"]) / "Google" / "Chrome" / "User Data" + else: + raise ValueError(f"Unsupported platform: {sys.platform}") + + def _find_chrome_profiles(self) -> list[Path]: + """Auto-detect all Chrome profiles.""" + base_path = self._get_chrome_base_path() + if not base_path.exists(): + return [] + + profiles = [] + + # Check Default profile + default_profile = base_path / "Default" + if default_profile.exists() and (default_profile / "History").exists(): + profiles.append(default_profile) + + # Check numbered profiles + for item in base_path.iterdir(): + if item.is_dir() and item.name.startswith("Profile "): + if (item / "History").exists(): + profiles.append(item) + + return profiles + + async def load_data(self, args) -> list[str]: + """Load browser history and convert to text chunks.""" + # Determine Chrome profiles + if args.chrome_profile and not args.auto_find_profiles: + profile_dirs = [Path(args.chrome_profile)] + else: + print("Auto-detecting Chrome profiles...") + profile_dirs = self._find_chrome_profiles() + + # If specific profile given, filter to just that one + if args.chrome_profile: + profile_path = Path(args.chrome_profile) + profile_dirs = [p for p in profile_dirs if p == profile_path] + + if not profile_dirs: + print("No Chrome profiles found!") + print("Please specify --chrome-profile manually") + return [] + + print(f"Found {len(profile_dirs)} Chrome profiles") + + # Create reader + reader = ChromeHistoryReader() + + # Process each profile + all_documents = [] + total_processed = 0 + + for i, profile_dir in enumerate(profile_dirs): + print(f"\nProcessing profile {i + 1}/{len(profile_dirs)}: {profile_dir.name}") + + try: + # Apply max_items limit per profile + max_per_profile = -1 + if args.max_items > 0: + remaining = 
args.max_items - total_processed + if remaining <= 0: + break + max_per_profile = remaining + + # Load history + documents = reader.load_data( + chrome_profile_path=str(profile_dir), + max_count=max_per_profile, + ) + + if documents: + all_documents.extend(documents) + total_processed += len(documents) + print(f"Processed {len(documents)} history entries from this profile") + + except Exception as e: + print(f"Error processing {profile_dir}: {e}") + continue + + if not all_documents: + print("No browser history found to process!") + return [] + + print(f"\nTotal history entries processed: {len(all_documents)}") + + # Convert to text chunks + all_texts = create_text_chunks( + all_documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) + + return all_texts + + +if __name__ == "__main__": + import asyncio + + # Example queries for browser history RAG + print("\n๐ Browser History RAG Example") + print("=" * 50) + print("\nExample queries you can try:") + print("- 'What websites did I visit about machine learning?'") + print("- 'Find my search history about programming'") + print("- 'What YouTube videos did I watch recently?'") + print("- 'Show me websites about travel planning'") + print("\nNote: Make sure Chrome is closed before running\n") + + rag = BrowserRAG() + asyncio.run(rag.run()) diff --git a/apps/document_rag.py b/apps/document_rag.py new file mode 100644 index 0000000..02c954a --- /dev/null +++ b/apps/document_rag.py @@ -0,0 +1,106 @@ +""" +Document RAG example using the unified interface. +Supports PDF, TXT, MD, and other document formats. +""" + +import sys +from pathlib import Path + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from base_rag_example import BaseRAGExample, create_text_chunks +from llama_index.core import SimpleDirectoryReader + + +class DocumentRAG(BaseRAGExample): + """RAG example for document processing (PDF, TXT, MD, etc.).""" + + def __init__(self): + super().__init__( + name="Document", + description="Process and query documents (PDF, TXT, MD, etc.) with LEANN", + default_index_name="test_doc_files", + ) + + def _add_specific_arguments(self, parser): + """Add document-specific arguments.""" + doc_group = parser.add_argument_group("Document Parameters") + doc_group.add_argument( + "--data-dir", + type=str, + default="data", + help="Directory containing documents to index (default: data)", + ) + doc_group.add_argument( + "--file-types", + nargs="+", + default=None, + help="Filter by file types (e.g., .pdf .txt .md). 
If not specified, all supported types are processed", + ) + doc_group.add_argument( + "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)" + ) + doc_group.add_argument( + "--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)" + ) + + async def load_data(self, args) -> list[str]: + """Load documents and convert to text chunks.""" + print(f"Loading documents from: {args.data_dir}") + if args.file_types: + print(f"Filtering by file types: {args.file_types}") + else: + print("Processing all supported file types") + + # Check if data directory exists + data_path = Path(args.data_dir) + if not data_path.exists(): + raise ValueError(f"Data directory not found: {args.data_dir}") + + # Load documents + reader_kwargs = { + "recursive": True, + "encoding": "utf-8", + } + if args.file_types: + reader_kwargs["required_exts"] = args.file_types + + documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data( + show_progress=True + ) + + if not documents: + print(f"No documents found in {args.data_dir} with extensions {args.file_types}") + return [] + + print(f"Loaded {len(documents)} documents") + + # Convert to text chunks + all_texts = create_text_chunks( + documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) + + # Apply max_items limit if specified + if args.max_items > 0 and len(all_texts) > args.max_items: + print(f"Limiting to {args.max_items} chunks (from {len(all_texts)})") + all_texts = all_texts[: args.max_items] + + return all_texts + + +if __name__ == "__main__": + import asyncio + + # Example queries for document RAG + print("\n๐ Document RAG Example") + print("=" * 50) + print("\nExample queries you can try:") + print("- 'What are the main techniques LEANN uses?'") + print("- 'What is the technique DLPM?'") + print("- 'Who does Elizabeth Bennet marry?'") + print("- 'What is the problem of developing pan gu model? 
(็ๅคๅคงๆจกๅๅผๅไธญ้ๅฐไปไน้ฎ้ข?)'") + print("\nOr run without --query for interactive mode\n") + + rag = DocumentRAG() + asyncio.run(rag.run()) diff --git a/examples/email_data/LEANN_email_reader.py b/apps/email_data/LEANN_email_reader.py similarity index 58% rename from examples/email_data/LEANN_email_reader.py rename to apps/email_data/LEANN_email_reader.py index 393daf6..407e2ae 100644 --- a/examples/email_data/LEANN_email_reader.py +++ b/apps/email_data/LEANN_email_reader.py @@ -52,6 +52,11 @@ class EmlxReader(BaseReader): docs: list[Document] = [] max_count = load_kwargs.get("max_count", 1000) count = 0 + total_files = 0 + successful_files = 0 + failed_files = 0 + + print(f"Starting to process directory: {input_dir}") # Walk through the directory recursively for dirpath, dirnames, filenames in os.walk(input_dir): @@ -59,10 +64,12 @@ class EmlxReader(BaseReader): dirnames[:] = [d for d in dirnames if not d.startswith(".")] for filename in filenames: - if count >= max_count: + # Check if we've reached the max count (skip if max_count == -1) + if max_count > 0 and count >= max_count: break if filename.endswith(".emlx"): + total_files += 1 filepath = os.path.join(dirpath, filename) try: # Read the .emlx file @@ -98,17 +105,26 @@ class EmlxReader(BaseReader): and not self.include_html ): continue - body += part.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - # break + try: + payload = part.get_payload(decode=True) + if payload: + body += payload.decode("utf-8", errors="ignore") + except Exception as e: + print(f"Error decoding payload: {e}") + continue else: - body = msg.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) + try: + payload = msg.get_payload(decode=True) + if payload: + body = payload.decode("utf-8", errors="ignore") + except Exception as e: + print(f"Error decoding single part payload: {e}") + body = "" - # Create document content with metadata embedded in text - doc_content = f""" + # Only create document if we have some content + if body.strip() or subject != "No Subject": + # Create document content with metadata embedded in text + doc_content = f""" [File]: {filename} [From]: {from_addr} [To]: {to_addr} @@ -118,18 +134,34 @@ class EmlxReader(BaseReader): {body} """ - # No separate metadata - everything is in the text - doc = Document(text=doc_content, metadata={}) - docs.append(doc) - count += 1 + # No separate metadata - everything is in the text + doc = Document(text=doc_content, metadata={}) + docs.append(doc) + count += 1 + successful_files += 1 + + # Print first few successful files for debugging + if successful_files <= 3: + print( + f"Successfully loaded: {filename} - Subject: {subject[:50]}..." 
+ ) except Exception as e: - print(f"Error parsing email from {filepath}: {e}") + failed_files += 1 + if failed_files <= 5: # Only print first few errors + print(f"Error parsing email from {filepath}: {e}") continue except Exception as e: - print(f"Error reading file {filepath}: {e}") + failed_files += 1 + if failed_files <= 5: # Only print first few errors + print(f"Error reading file {filepath}: {e}") continue - print(f"Loaded {len(docs)} email documents") + print("Processing summary:") + print(f" Total .emlx files found: {total_files}") + print(f" Successfully loaded: {successful_files}") + print(f" Failed to load: {failed_files}") + print(f" Final documents: {len(docs)}") + return docs diff --git a/examples/email_data/email.py b/apps/email_data/email.py similarity index 100% rename from examples/email_data/email.py rename to apps/email_data/email.py diff --git a/apps/email_rag.py b/apps/email_rag.py new file mode 100644 index 0000000..75a6cc8 --- /dev/null +++ b/apps/email_rag.py @@ -0,0 +1,156 @@ +""" +Email RAG example using the unified interface. +Supports Apple Mail on macOS. +""" + +import sys +from pathlib import Path + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from base_rag_example import BaseRAGExample, create_text_chunks + +from .email_data.LEANN_email_reader import EmlxReader + + +class EmailRAG(BaseRAGExample): + """RAG example for Apple Mail processing.""" + + def __init__(self): + # Set default values BEFORE calling super().__init__ + self.max_items_default = -1 # Process all emails by default + self.embedding_model_default = ( + "sentence-transformers/all-MiniLM-L6-v2" # Fast 384-dim model + ) + + super().__init__( + name="Email", + description="Process and query Apple Mail emails with LEANN", + default_index_name="mail_index", + ) + + def _add_specific_arguments(self, parser): + """Add email-specific arguments.""" + email_group = parser.add_argument_group("Email Parameters") + email_group.add_argument( + "--mail-path", + type=str, + default=None, + help="Path to Apple Mail directory (auto-detected if not specified)", + ) + email_group.add_argument( + "--include-html", action="store_true", help="Include HTML content in email processing" + ) + email_group.add_argument( + "--chunk-size", type=int, default=256, help="Text chunk size (default: 256)" + ) + email_group.add_argument( + "--chunk-overlap", type=int, default=25, help="Text chunk overlap (default: 25)" + ) + + def _find_mail_directories(self) -> list[Path]: + """Auto-detect all Apple Mail directories.""" + mail_base = Path.home() / "Library" / "Mail" + if not mail_base.exists(): + return [] + + # Find all Messages directories + messages_dirs = [] + for item in mail_base.rglob("Messages"): + if item.is_dir(): + messages_dirs.append(item) + + return messages_dirs + + async def load_data(self, args) -> list[str]: + """Load emails and convert to text chunks.""" + # Determine mail directories + if args.mail_path: + messages_dirs = [Path(args.mail_path)] + else: + print("Auto-detecting Apple Mail directories...") + messages_dirs = self._find_mail_directories() + + if not messages_dirs: + print("No Apple Mail directories found!") + print("Please specify --mail-path manually") + return [] + + print(f"Found {len(messages_dirs)} mail directories") + + # Create reader + reader = EmlxReader(include_html=args.include_html) + + # Process each directory + all_documents = [] + total_processed = 0 + + for i, messages_dir in enumerate(messages_dirs): + print(f"\nProcessing 
directory {i + 1}/{len(messages_dirs)}: {messages_dir}") + + try: + # Count emlx files + emlx_files = list(messages_dir.glob("*.emlx")) + print(f"Found {len(emlx_files)} email files") + + # Apply max_items limit per directory + max_per_dir = -1 # Default to process all + if args.max_items > 0: + remaining = args.max_items - total_processed + if remaining <= 0: + break + max_per_dir = remaining + # If args.max_items == -1, max_per_dir stays -1 (process all) + + # Load emails - fix the parameter passing + documents = reader.load_data( + input_dir=str(messages_dir), + max_count=max_per_dir, + ) + + if documents: + all_documents.extend(documents) + total_processed += len(documents) + print(f"Processed {len(documents)} emails from this directory") + + except Exception as e: + print(f"Error processing {messages_dir}: {e}") + continue + + if not all_documents: + print("No emails found to process!") + return [] + + print(f"\nTotal emails processed: {len(all_documents)}") + print("now starting to split into text chunks ... take some time") + + # Convert to text chunks + # Email reader uses chunk_overlap=25 as in original + all_texts = create_text_chunks( + all_documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) + + return all_texts + + +if __name__ == "__main__": + import asyncio + + # Check platform + if sys.platform != "darwin": + print("\nโ ๏ธ Warning: This example is designed for macOS (Apple Mail)") + print(" Windows/Linux support coming soon!\n") + + # Example queries for email RAG + print("\n๐ง Email RAG Example") + print("=" * 50) + print("\nExample queries you can try:") + print("- 'What did my boss say about deadlines?'") + print("- 'Find emails about travel expenses'") + print("- 'Show me emails from last month about the project'") + print("- 'What food did I order from DoorDash?'") + print("\nNote: You may need to grant Full Disk Access to your terminal\n") + + rag = EmailRAG() + asyncio.run(rag.run()) diff --git a/examples/history_data/__init__.py b/apps/history_data/__init__.py similarity index 100% rename from examples/history_data/__init__.py rename to apps/history_data/__init__.py diff --git a/examples/history_data/history.py b/apps/history_data/history.py similarity index 95% rename from examples/history_data/history.py rename to apps/history_data/history.py index 4125244..bb2eac1 100644 --- a/examples/history_data/history.py +++ b/apps/history_data/history.py @@ -97,6 +97,11 @@ class ChromeHistoryReader(BaseReader): except Exception as e: print(f"Error reading Chrome history: {e}") + # add you may need to close your browser to make the database file available + # also highlight in red + print( + "\033[91mYou may need to close your browser to make the database file available\033[0m" + ) return docs return docs diff --git a/examples/history_data/wechat_history.py b/apps/history_data/wechat_history.py similarity index 98% rename from examples/history_data/wechat_history.py rename to apps/history_data/wechat_history.py index 4106321..e985bd4 100644 --- a/examples/history_data/wechat_history.py +++ b/apps/history_data/wechat_history.py @@ -411,8 +411,8 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars): wechat_export_dir = load_kwargs.get("wechat_export_dir", None) include_non_text = load_kwargs.get("include_non_text", False) concatenate_messages = load_kwargs.get("concatenate_messages", False) - load_kwargs.get("max_length", 1000) - load_kwargs.get("time_window_minutes", 30) + max_length = load_kwargs.get("max_length", 1000) + 
time_window_minutes = load_kwargs.get("time_window_minutes", 30) # Default WeChat export path if wechat_export_dir is None: @@ -460,9 +460,9 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars): # Concatenate messages based on rules message_groups = self._concatenate_messages( readable_messages, - max_length=-1, - time_window_minutes=-1, - overlap_messages=0, # Keep 2 messages overlap between groups + max_length=max_length, + time_window_minutes=time_window_minutes, + overlap_messages=0, # No overlap between groups ) # Create documents from concatenated groups @@ -532,7 +532,9 @@ Message: {readable_text if readable_text else message_text} """ # Create document with embedded metadata - doc = Document(text=doc_content, metadata={}) + doc = Document( + text=doc_content, metadata={"contact_name": contact_name} + ) docs.append(doc) count += 1 @@ -560,8 +562,8 @@ Message: {readable_text if readable_text else message_text} # Look for common export directory names possible_dirs = [ - Path("./wechat_export_test"), Path("./wechat_export"), + Path("./wechat_export_direct"), Path("./wechat_chat_history"), Path("./chat_export"), ] diff --git a/apps/wechat_rag.py b/apps/wechat_rag.py new file mode 100644 index 0000000..7355c6f --- /dev/null +++ b/apps/wechat_rag.py @@ -0,0 +1,189 @@ +""" +WeChat History RAG example using the unified interface. +Supports WeChat chat history export and search. +""" + +import subprocess +import sys +from pathlib import Path + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from base_rag_example import BaseRAGExample + +from .history_data.wechat_history import WeChatHistoryReader + + +class WeChatRAG(BaseRAGExample): + """RAG example for WeChat chat history.""" + + def __init__(self): + # Set default values BEFORE calling super().__init__ + self.max_items_default = -1 # Match original default + self.embedding_model_default = ( + "sentence-transformers/all-MiniLM-L6-v2" # Fast 384-dim model + ) + + super().__init__( + name="WeChat History", + description="Process and query WeChat chat history with LEANN", + default_index_name="wechat_history_magic_test_11Debug_new", + ) + + def _add_specific_arguments(self, parser): + """Add WeChat-specific arguments.""" + wechat_group = parser.add_argument_group("WeChat Parameters") + wechat_group.add_argument( + "--export-dir", + type=str, + default="./wechat_export", + help="Directory to store WeChat exports (default: ./wechat_export)", + ) + wechat_group.add_argument( + "--force-export", + action="store_true", + help="Force re-export of WeChat data even if exports exist", + ) + wechat_group.add_argument( + "--chunk-size", type=int, default=192, help="Text chunk size (default: 192)" + ) + wechat_group.add_argument( + "--chunk-overlap", type=int, default=64, help="Text chunk overlap (default: 64)" + ) + + def _export_wechat_data(self, export_dir: Path) -> bool: + """Export WeChat data using wechattweak-cli.""" + print("Exporting WeChat data...") + + # Check if WeChat is running + try: + result = subprocess.run(["pgrep", "WeChat"], capture_output=True, text=True) + if result.returncode != 0: + print("WeChat is not running. 
Please start WeChat first.") + return False + except Exception: + pass # pgrep might not be available on all systems + + # Create export directory + export_dir.mkdir(parents=True, exist_ok=True) + + # Run export command + cmd = ["packages/wechat-exporter/wechattweak-cli", "export", str(export_dir)] + + try: + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode == 0: + print("WeChat data exported successfully!") + return True + else: + print(f"Export failed: {result.stderr}") + return False + + except FileNotFoundError: + print("\nError: wechattweak-cli not found!") + print("Please install it first:") + print(" sudo packages/wechat-exporter/wechattweak-cli install") + return False + except Exception as e: + print(f"Export error: {e}") + return False + + async def load_data(self, args) -> list[str]: + """Load WeChat history and convert to text chunks.""" + # Initialize WeChat reader with export capabilities + reader = WeChatHistoryReader() + + # Find existing exports or create new ones using the centralized method + export_dirs = reader.find_or_export_wechat_data(args.export_dir) + if not export_dirs: + print("Failed to find or export WeChat data. Trying to find any existing exports...") + # Try to find any existing exports in common locations + export_dirs = reader.find_wechat_export_dirs() + if not export_dirs: + print("No WeChat data found. Please ensure WeChat exports exist.") + return [] + + # Load documents from all found export directories + all_documents = [] + total_processed = 0 + + for i, export_dir in enumerate(export_dirs): + print(f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}") + + try: + # Apply max_items limit per export + max_per_export = -1 + if args.max_items > 0: + remaining = args.max_items - total_processed + if remaining <= 0: + break + max_per_export = remaining + + documents = reader.load_data( + wechat_export_dir=str(export_dir), + max_count=max_per_export, + concatenate_messages=True, # Enable message concatenation for better context + ) + + if documents: + print(f"Loaded {len(documents)} chat documents from {export_dir}") + all_documents.extend(documents) + total_processed += len(documents) + else: + print(f"No documents loaded from {export_dir}") + + except Exception as e: + print(f"Error processing {export_dir}: {e}") + continue + + if not all_documents: + print("No documents loaded from any source. Exiting.") + return [] + + print(f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports") + print("now starting to split into text chunks ... 
take some time") + + # Convert to text chunks with contact information + all_texts = [] + for doc in all_documents: + # Split the document into chunks + from llama_index.core.node_parser import SentenceSplitter + + text_splitter = SentenceSplitter( + chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) + nodes = text_splitter.get_nodes_from_documents([doc]) + + for node in nodes: + # Add contact information to each chunk + contact_name = doc.metadata.get("contact_name", "Unknown") + text = f"[Contact] means the message is from: {contact_name}\n" + node.get_content() + all_texts.append(text) + + print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents") + return all_texts + + +if __name__ == "__main__": + import asyncio + + # Check platform + if sys.platform != "darwin": + print("\nโ ๏ธ Warning: WeChat export is only supported on macOS") + print(" You can still query existing exports on other platforms\n") + + # Example queries for WeChat RAG + print("\n๐ฌ WeChat History RAG Example") + print("=" * 50) + print("\nExample queries you can try:") + print("- 'Show me conversations about travel plans'") + print("- 'Find group chats about weekend activities'") + print("- 'ๆๆณไนฐ้ญๆฏๅธ็บฆ็ฟฐ้็็่กฃ,็ปๆไธไบๅฏนๅบ่ๅคฉ่ฎฐๅฝ?'") + print("- 'What did we discuss about the project last month?'") + print("\nNote: WeChat must be running for export to work\n") + + rag = WeChatRAG() + asyncio.run(rag.run()) diff --git a/test/sanity_checks/README.md b/benchmarks/README.md similarity index 100% rename from test/sanity_checks/README.md rename to benchmarks/README.md diff --git a/test/sanity_checks/benchmark_embeddings.py b/benchmarks/benchmark_embeddings.py similarity index 100% rename from test/sanity_checks/benchmark_embeddings.py rename to benchmarks/benchmark_embeddings.py diff --git a/examples/compare_faiss_vs_leann.py b/benchmarks/compare_faiss_vs_leann.py similarity index 99% rename from examples/compare_faiss_vs_leann.py rename to benchmarks/compare_faiss_vs_leann.py index ea714f1..03cf508 100644 --- a/examples/compare_faiss_vs_leann.py +++ b/benchmarks/compare_faiss_vs_leann.py @@ -62,7 +62,7 @@ def test_faiss_hnsw(): try: result = subprocess.run( - [sys.executable, "examples/faiss_only.py"], + [sys.executable, "benchmarks/faiss_only.py"], capture_output=True, text=True, timeout=300, @@ -115,7 +115,7 @@ def test_leann_hnsw(): # Load and parse documents documents = SimpleDirectoryReader( - "examples/data", + "data", recursive=True, encoding="utf-8", required_exts=[".pdf", ".txt", ".md"], diff --git a/examples/faiss_only.py b/benchmarks/faiss_only.py similarity index 99% rename from examples/faiss_only.py rename to benchmarks/faiss_only.py index 227e14a..c501c10 100644 --- a/examples/faiss_only.py +++ b/benchmarks/faiss_only.py @@ -65,7 +65,7 @@ def main(): tracker.checkpoint("After Faiss index creation") documents = SimpleDirectoryReader( - "examples/data", + "data", recursive=True, encoding="utf-8", required_exts=[".pdf", ".txt", ".md"], diff --git a/test/micro_tpt.py b/benchmarks/micro_tpt.py similarity index 100% rename from test/micro_tpt.py rename to benchmarks/micro_tpt.py diff --git a/examples/run_evaluation.py b/benchmarks/run_evaluation.py similarity index 97% rename from examples/run_evaluation.py rename to benchmarks/run_evaluation.py index 0e55178..2fae210 100644 --- a/examples/run_evaluation.py +++ b/benchmarks/run_evaluation.py @@ -200,10 +200,10 @@ def main(): args = parser.parse_args() # --- Path Configuration --- - # Assumes a project structure where 
the script is in 'examples/' - # and data is in 'data/' at the project root. - project_root = Path(__file__).resolve().parent.parent - data_root = project_root / "data" + # Assumes a project structure where the script is in 'benchmarks/' + # and evaluation data is in 'benchmarks/data/'. + script_dir = Path(__file__).resolve().parent + data_root = script_dir / "data" # Download data based on mode if args.mode == "build": @@ -279,7 +279,9 @@ def main(): if not args.index_path: print("No indices found. The data download should have included pre-built indices.") - print("Please check the data/indices/ directory or provide --index-path manually.") + print( + "Please check the benchmarks/data/indices/ directory or provide --index-path manually." + ) sys.exit(1) # Detect dataset type from index path to select the correct ground truth diff --git a/test/simple_mac_tpt_test.py b/benchmarks/simple_mac_tpt_test.py similarity index 100% rename from test/simple_mac_tpt_test.py rename to benchmarks/simple_mac_tpt_test.py diff --git a/examples/data/2501.14312v1 (1).pdf b/data/2501.14312v1 (1).pdf similarity index 100% rename from examples/data/2501.14312v1 (1).pdf rename to data/2501.14312v1 (1).pdf diff --git a/examples/data/2506.08276v1.pdf b/data/2506.08276v1.pdf similarity index 100% rename from examples/data/2506.08276v1.pdf rename to data/2506.08276v1.pdf diff --git a/examples/data/PrideandPrejudice.txt b/data/PrideandPrejudice.txt similarity index 100% rename from examples/data/PrideandPrejudice.txt rename to data/PrideandPrejudice.txt diff --git a/data/README.md b/data/README.md index bf4af69..0dd35da 100644 --- a/data/README.md +++ b/data/README.md @@ -1,44 +1,82 @@ ---- -license: mit +# ็ๅคไนๆฎ๏ผๅไธบ่ฏบไบ็ๅคๅคงๆจกๅ็ ๅๅ็จ็ๅฟ้ ธไธ้ปๆ + +ๅไฝๅฅฝ๏ผ + +ๆๆฏไธๅ็ๅคๅคงๆจกๅๅข้๏ผๅไธบ่ฏบไบๆน่ๅฎ้ชๅฎค็ๅๅทฅใ + +้ฆๅ ไธบ่ช่ฏ่บซไปฝ๏ผๅไธพไธไบ็ป่๏ผ + +1. ็ฐ่ฏบไบไธปไปป๏ผๅ็ฎๆณๅบ็จ้จ้จ้ฟ๏ผๅๆนๅไธบๅฐๆจกๅๅฎ้ชๅฎค็ไธปไปป็ไบ้นคใๅ่ฏบไบไธปไปป๏ผๅง้ช๏ผๅคงๅฎถ็งฐๅง่ๅธ๏ผใๅ ไธชๅฎ้ชๅฎคไธปไปป๏ผๅ็ฟๆ๏ผๆๅฅ๏ผๆ้๏ผๅทฒ็ฆป่๏ผ๏ผๅฐๅฉๅณฐ๏ผๅผ ็ปด๏ผ็ปดๅฅ๏ผ๏ผ้ๅปบไธ๏ผ้่ๅธ๏ผ๏ผๅๆญฆ้พ๏ผ็งฐๅผไธบๆญฆ้พๆ๏ผ็ญใๅ ถไป้ชจๅนฒๆๅๅไธๅฎถ้็ปญๆๅพๅคไบบ็ฆป่ใ +2. ๆไปฌ้ถๅฑไบโๅ้โ่ฟไธช็ป็ปใๅ้ไธๅฑๆ่ฎธๅค็บต้๏ผๅบ็ก่ฏญ่จๅคงๆจกๅๆฏๅ็บตใ็ไบ้นค็ๅฐๆจกๅๆฏๅๅ ญ็บต้ใๆไปฌๅๅ ่ฟ่ๅท็้็ป๏ผๆๅ็งๆไปฝ็ๆถ้ด่็นใๅจ่ๅทๆปๅ ณไผ้ขๅไปปๅกไปค๏ผ้่ฆๅจ่็นๅ่พพๆ็ฎๆ ใ่ๅท้็ปไผๆๅๅฐ็ไบบๅ้ฝ้ไธญๅจ่ๅท็ ็ฉถๆ๏ผๅนณๅธธไฝๅฎพ้ฆ๏ผๆฏๅฆๅจ็ช็ด็้ ๅบ๏ผไธๅฎถไบบๅญฉๅญๅคฉๅไธๆนใ +3. ๅจ่ๅท้็ป็ๆถๅๅจๅ ญ้ป่ฎคไธ็ญ๏ผ้ๅธธ่พ่ฆ๏ผไธ่ฟๅจๅ ญๆไธๅ่ถ๏ผๆไธๆฌก่ฟๆๅฐ้พ่พใๅจ่ๅท็ ็ฉถๆ็ๅทฅไฝๆฌ่ฟ่ฟไธๆฌก๏ผไปไธๆ ๆฅผๆขๅฐไบๅฆไธๆ ใ่ๅท็ ็ฉถๆๆฅผๆ ้ฝๆฏๆฌงๅผ่ฃ ไฟฎ๏ผ้จๅฃๆๅคงๅก๏ผ้้ขๆฏ่ฒๅพไธ้ใๅป่ๅท้็ปไธ่ฌ่ณๅฐ่ฆๅปไธๅจ๏ผ็่ณๆดไน ๏ผๅค็ไบบ็่ณไธไธคไธชๆ้ฝๅไธไบๅฎถใ +4. ่ฏบไบๆพ็ปไผ ่ฏดๆฏ็ ็ฉถๅ็๏ผไฝๆฏๆฅไบไนๅๅ ไธบๅจๅ้ๅๅคงๆจกๅ้กน็ฎ๏ผ้กน็ฎๆๅๅฎๅ จๅๆไบไบคไปๅ็๏ผไธๅ ๆปกไบไพไผ๏ผ่ฏๅฎก๏ผๆฑๆฅใๅพๅคๆถๅๅๅฎ้ช้ฝ่ฆ็ณ่ฏทใๅข้้่ฆๅฏนๆฅ็ป็ซฏๅฐ่บ๏ผๅไธบไบ๏ผICT็ญ่ฏธๅคไธๅก็บฟ๏ผไบคไปๅๅไธๅฐใ +5. 
่ฏบไบ็ ๅ็็ๅคๆจกๅๆฉๆๅ ้จไปฃๅทๅซๅโ็ๅคๆบๅญโ๏ผไธๅผๅงๅชๆๅ ้จ้่ฆ็ณ่ฏท่ฏ็จ็็ฝ้กต็๏ผๅฐๅ็ปญ่ฟซไบๅๅๅจwelinkไธๆฅๅ ฅๅๅ ฌๆตๅผๆพใ + +่ฟไบๅคฉๅ็ๅ ณไบ่ดจ็็ๅคๅคงๆจกๅๆ่ขญๅ้ฎ็ไบๆ ้น็ๆฒธๆฒธๆฌๆฌใไฝไธบไธไธช็ๅคๅข้็ๆๅ๏ผๆๆ่ฟๅคๅค่พ่ฝฌๅไพง๏ผ้พไปฅๅ ฅ็ ใ็ๅค็ๅ็ๅๅฐๅฆๆญคๅคง็ๅฝฑๅ๏ผไธๆน้ข๏ผๆ่ช็ง็ไธบๆ็่ไธๅๅฑๆ ๅฟง๏ผไนไธบ่ชๅทฑ่ฟๅป็ๅชๅๅทฅไฝๆๅฐไธๅผใๅฆไธๆน้ข๏ผ็ฑไบๆไบบๅผๅงๆญ้ฒ่ฟไบไบๆ ๆๅ ๅฟๅๆๅฐๅคงๅฟซไบบๅฟใๅจๅคๅฐไธชๆฅๆฅๅคๅค๏ผๆไปฌๅฏนๅ ้จๆไบไบบไธๆฌกๆฌก้ ็้ ๅ่ๅ่ทๅพไบๆ ๆฐๅฉ็็่กไธบๅฌ็ๅ้ฝฟ่ๅๆ ่ฝไธบๅใ่ฟ็งๅๆๅ็พ่พฑไน้ๆธๆถ็ฃจไบๆๅฏนๅไธบ็ๆๆ ๏ผ่ฎฉๆๅจ่ฟ้็ๆถๆฅ้ๆธๆตๆตๅฉๅฉ๏ผ่ฟท่ซๆ ๆช๏ผๆถๅธธๆ็่ชๅทฑ็ไบบ็ๅ่ชๆไปทๅผใ + +ๆๆฟ่ฎคๆๆฏไธไธชๆฆๅผฑ็ไบบ๏ผไฝไธบไธไธชๅฐๅฐ็ๆๅทฅไบบ๏ผๆไธไป ไธๆขๅ็ไบ้นค็ญๅ ้จๆ็ผ้ๅคฉ็ไบบๅๅฏน๏ผๆดไธๆขๅๅไธบ่ฟๆ ท็ๅบ็ถๅคง็ฉๅๅฏนใๆๅพๆๅคฑๅปๆ็ๅทฅไฝ๏ผๆฏ็ซๆไนๆๅฎถไบบๅๅญฉๅญ๏ผๆไปฅๆๆๅฟ็ผ้ๅพไฝฉๆๆญ้ฒ่ ใไฝๆฏ๏ผ็ๅฐๅ ้จ่ฟๅจ่ฏๅพๆดๅฐๆฉ็ไบๅฎ๏ผ่่ฝๅ ฌไผ็ๆถๅ๏ผๆๅฎๅจไธ่ฝๅฎนๅฟไบใๆไนๅธๆๅๆขไธๆฌก๏ผ้กบไป่ชๅทฑๆฌๅฟใๅฐฑ็ฎ่ชๆๅ ซ็พ๏ผๆไนๅธๆ่ฝไผคๆไธๅใๆๅณๅฎๆๆๅจ่ฟ้็ๆ่งๆ้ป๏ผ้จๅๆฅ่ชไบๅไบๅฃ่ฟฐ๏ผๅ ฌๅธๅบๆฅ๏ผๅ ณไบ็ๅคๅคงๆจกๅ็โไผ ๅฅๆ ไบโ๏ผ + +ๅไธบ็กฎๅฎไธป่ฆๅจๆ่ พๅกไธ่ฎญ็ปๅคงๆจกๅ๏ผๅฐๆจกๅๅฎ้ชๅฎคๆไธๅฐ่ฑไผ่พพ็ๅก๏ผไปไปฌไนๅไนไผ็จๆฅ่ฎญ็ป๏ผๅ้ข่ฝฌ็งปๅฐๆ่ พ๏ผใๆพ็ปๆ่ขซๅไธบโๆ้ ไธ็็ฌฌไบ้ๆฉโ็ๅณๅฟ่ๆๆ๏ผๆๆฌ่บซไนๆพ็ปๅฏนๅไธบๆๆทฑๅ็ๆๆ ใๆไปฌ้ช็ๆ่ พไธๆญฅๆญฅๆธ็ฌๆปๆ๏ผไปๅ ๆปกbugๅฐ็ฐๅจ่ฝ่ฎญๅบๆจกๅ๏ผไปๅบไบๅทจๅคง็ๅฟ่กๅไปฃไปทใ + +ๆๅๆไปฌ็็ฎๅ้ๅธธๆ้๏ผๅจ910Aไธ่ฎญ็ปๆจกๅใ้ฃไผๅชๆฏๆfp16๏ผ่ฎญ็ป็็จณๅฎๆง่ฟไธๅฆbf16ใ็ๅค็moeๅผๅงๅพๆฉ๏ผ23ๅนดๅฐฑไธป่ฆๆฏ่ฎญ็ป38Bmoeๆจกๅๅๅ็ปญ็71B denseๆจกๅใ71B็denseๆจกๅ้่ฟๆฉๅขๅๆไบ็ฌฌไธไปฃ็135Bdenseๆจกๅ๏ผๅ้ขไธปๅๆจกๅไน้ๆธๅจ910Bไธ่ฎญ็ปใ + +71Bๅ135Bๆจกๅ้ฝๆไธไธชๅทจๅคง็็กฌไผคๅฐฑๆฏtokenizerใๅฝๆถไฝฟ็จ็tokenizer็ผ็ ๆ็ๆไฝ๏ผๆฏไธชๅไธช็็ฌฆๅท๏ผๆฐๅญ๏ผ็ฉบๆ ผ๏ผไน่ณๆฑๅญ้ฝไผๅ ็จไธไธชtokenใๅฏๆณ่็ฅ่ฟไผ้ๅธธๆตช่ดน็ฎๅ๏ผไธไฝฟๅพๆจกๅ็ๆๆๅพๅทฎใ่ฟๆถๅๅฐๆจกๅๅฎ้ชๅฎคๆญฃๅฅฝๆไธช่ชๅทฑ่ฎญ็่ฏ่กจใๅง่ๅธๅฝๆถๆ็ๆฏไธๆฏๆจกๅ็tokenizerไธๅฅฝ๏ผ่ฝ็ถไบๅๆฅ็๏ผไป็ๆ็ๆฏๆ ็ๆญฃ็กฎ็๏ผ๏ผไบๆฏๅฐฑๅณๅฎ๏ผ่ฎฉ71Bๅ135Bๆขtokenizer๏ผๅ ไธบๅฐๆจกๅๅฎ้ชๅฎคๆพ็ปๅฐ่ฏ่ฟใๅข้็ผๅไบไธคไธชtokenizer๏ผๅผๅงไบtokenizer็ๆดๆขใ71Bๆจกๅ็ๆดๆขๅคฑ่ดฅไบ๏ผ่135Bๅ ไธบ้็จไบๆด็ฒพ็ป็embeddingๅๅงๅ็ญ็ฅ๏ผ็ปญ่ฎญไบ่ณๅฐ1T็ๆฐๆฎๅ่ฏ่กจๆป็ฎๆดๆขๆๅ๏ผไฝๅฏๆณ่็ฅ๏ผๆๆๅนถไธไผๅๅฅฝใ + +ไบๆญคๅๆ๏ผ้ฟ้ๅๆบ่ฐฑ็ญๅฝๅ ๅ ถไปๅ ฌๅธๅจGPUไธ่ฎญ็ป๏ผไธๅทฒ็ปๆธ็ดขๅบไบๆญฃ็กฎ็ๆนๆณ๏ผ็ๅคๅ็ซๅ็ๅทฎ่ท่ถๆฅ่ถๅคงใๅ ้จไธไธช230Bไปๅคด่ฎญ็ป็denseๆจกๅๅๅ ไธบๅ็งๅๅ ่ฎญ็ปๅคฑ่ดฅ๏ผๅฏผ่ด้กน็ฎ็็ถๅตๅ ไน้ทๅ ฅ็ปๅขใ้ขไธดๅ ไธช่็น็ๅๅไปฅๅๅ ้จๅฏน็ๅค็ๅผบ็่ดจ็ๆถ๏ผๅข้็ๅฃซๆฐไฝ่ฟทๅฐไบๆ็นใๅข้ๅจ็ฎๅๆๅ ถๆ้็ๆถๅ๏ผๅๅบไบๅพๅคๅชๅๅๆฃๆใๆฏๅฆ๏ผๅข้ๅถ็ถๅ็ฐๅฝๆถ็38B moeๅนถๆฒกๆ้ขๆmoe็ๆๆใไบๆฏๅปๆไบmoeๅๆฐ๏ผ่ฟๅไธบไบ13B็denseๆจกๅใ็ฑไบ38B็moeๆบ่ชๅพๆฉ็pangu alpha 13B๏ผๆถๆ็ธๅฏน่ฝๅ๏ผๅข้่ฟ่กไบไธ็ณปๅ็ๆไฝ๏ผๆฏๅฆๅๆข็ปๅฏนไฝ็ฝฎ็ผ็ ๅฐrope๏ผๅปๆbias๏ผๅๆขไธบrmsnormใๅๆถ้ดไบtokenizer็ไธไบๅคฑ่ดฅๅๆข่ฏ่กจ็็ป้ช๏ผ่ฟไธชๆจกๅ็่ฏ่กจไนๆดๆขไธบไบ็ไบ้นค็ๅฐๆจกๅๅฎ้ชๅฎค7Bๆจกๅๆไฝฟ็จ็่ฏ่กจใๅ้ข่ฟไธช13Bๆจกๅ่ฟ่กไบๆฉๅข็ปญ่ฎญ๏ผๅๆไบ็ฌฌไบไปฃ38B denseๆจกๅ๏ผๅจๅ ไธชๆๅ ่ฟไธชๆจกๅ้ฝๆฏไธป่ฆ็็ๅคไธญๆกฃไฝๆจกๅ๏ผ๏ผๆพ็ปๅ ทๆไธๅฎ็็ซไบๅใไฝๆฏ๏ผ็ฑไบๆดๅคง็135Bๆจกๅๆถๆ่ฝๅ๏ผไธๆดๆข่ฏ่กจๆจกๅๆไผคๅทจๅคง๏ผๅ็ปญๅๆๅ็ฐๅฝๆถๆดๆข็็ผๅ่ฏ่กจๆๆดไธฅ้็bug๏ผ๏ผ็ปญ่ฎญๅไนไธๅ้ฎ็ญๅฝๆถๅฝๅ ้ขๅ ๆจกๅๅญๅจๅพๅคงๅทฎ่ทใ่ฟๆถ็ฑไบๅ ้จ็่ดจ็ๅฃฐๅ้ขๅฏผ็ๅๅไน่ถๆฅ่ถๅคงใๅข้็็ถๆๅ ไน้ทๅ ฅไบ็ปๅขใ + +ๅจ่ฟ็งๆ ๅตไธ๏ผ็ไบ้นคๅไป็ๅฐๆจกๅๅฎ้ชๅฎคๅบๆไบใไปไปฌๅฃฐ็งฐๆฏไปๆง็135Bๅๆฐ็ปงๆฟๆน้ ่ๆฅ๏ผ้่ฟ่ฎญ็ป็ญ็ญ็ๅ ็พBๆฐๆฎ๏ผๅ้กนๆๆ ๅนณๅๆๅไบๅไธช็นๅทฆๅณใๅฎ้ ไธ๏ผ่ฟๅฐฑๆฏไปไปฌๅฅๅฃณๅบ็จๅฐๅคงๆจกๅ็็ฌฌไธๆฌกๆฐไฝใๅไธบ็ๅค่ก้ขๅฏผๅ ่ก๏ผไฝฟๅพ้ขๅฏผๅฎๅ จๅฏนไบ่ฟ็งๆฏๆทก็ไบๆ ๆฒกๆๆฆๅฟต๏ผไปไปฌๅชไผ่งๅพ่ฏๅฎๆฏๆไปไน็ฎๆณๅๆฐใ็ป่ฟๅ ้จ็ๅๆ๏ผไปไปฌๅฎ้ ไธๆฏไฝฟ็จQwen 1.5 110B็ปญ่ฎญ่ๆฅ๏ผ้่ฟๅ ๅฑ๏ผๆฉๅขffn็ปดๅบฆ๏ผๆทปๅ ็ๅคpi่ฎบๆ็ไธไบๆบๅถๅพๆฅ๏ผๅๅคไบๅคงๆฆ135B็ๅๆฐใๅฎ้ ไธ๏ผๆง็135Bๆ107ๅฑ๏ผ่่ฟไธชๆจกๅๅชๆ82ๅฑ๏ผๅ็ง้ ็ฝฎไน้ฝไธไธๆ ทใๆฐ็ๆฅ่ทฏไธๆ็135B่ฎญ็ปๅฎๅพๅคๅๆฐ็ๅๅธไนๅQwen 110Bๅ ไนไธๆจกไธๆ ทใ่ฟๆจกๅไปฃ็ ็็ฑปๅๅฝๆถ้ฝๆฏQwen๏ผ็่ณๆๅพๆนๅใๅ็ปญ่ฟไธชๆจกๅๅฐฑๆฏๆ่ฐ็135B 
V2ใ่่ฟไธชๆจกๅๅฝๆถไนๆไพ็ปไบๅพๅคไธๆธธ๏ผ็่ณๅ ๆฌๅค้จๅฎขๆทใ + +่ฟไปถไบๅฏนไบๆไปฌ่ฟไบ่ฎค็่ฏๅฎๅไบ็ๅไบไปฌๅธฆๆฅไบๅทจๅคง็ๅฒๅป๏ผๅ ้จๅพๅคไบบๅ ถๅฎ้ฝ็ฅ้่ฟไปถไบ๏ผ็่ณๅ ๆฌ็ป็ซฏๅๅไธบไบใๆไปฌ้ฝๆ็งฐไปฅๅๅซๅซ็ๅคๆจกๅไบ๏ผๅซๅๅคๅงใๅฝๆถๅข้ๆๅๅฐฑๆณๅbcgไธพๆฅไบ๏ผๆฏ็ซ่ฟๅทฒ็ปๆฏ้ๅคง็ไธๅก้ ๅไบใไฝๆฏๅ้ขๆฎ่ฏด่ขซ้ขๅฏผๆฆไบไธๆฅ๏ผๅ ไธบๆด้ซ็บงๅซ็้ขๅฏผ๏ผๆฏๅฆๅง่ๅธ๏ผไปฅๅๅฏ่ฝ็ๆปๅๆฅ่๏ผๅ ถๅฎๅ้ขไน็ฅ้ไบ๏ผไฝๆฏๅนถไธ็ฎก๏ผๅ ไธบ้่ฟๅฅๅฃณๆฟๅบๅฅฝ็็ปๆ๏ผๅฏนไปไปฌไนๆฏๆๅฉ็ใ่ฟไปถไบไฝฟๅพๅฝๆถๅข้ๅ ไฝๆๅผบ็ๅไบๅผๅงๅฟ็ฐๆๅท๏ผ็ฆป่่ท่ทฏไน้ๆธๆไธบๆๅจๅด่พน็ไบใ + +ๆญคๆถ๏ผ็ๅคไผผไน่ฟๆฅไบ่ฝฌๆบใ็ฑไบๅ้ขๆ่ฟฐ็่ฟไบ็ๅคๆจกๅๅบๆฌ้ฝๆฏ็ปญ่ฎญๅๆน้ ่ๆฅ๏ผๅฝๆถ่ฏบไบๅฎๅ จๆฒกๆๆๆกไปๅคด่ฎญ็ป็ๆๆฏ๏ผไฝๅต่ฟๆฏๅจๆ่ พ็NPUไธ่ฟ่ก่ฎญ็ปใๅจๅฝๆถๅข้็ๆ ธๅฟๆๅ็ๆๅไบๅไธ๏ผ็ๅคๅผๅงไบ็ฌฌไธไปฃๆจกๅ็่ฎญ็ป๏ผไปๅบไบๅทจๅคง็ๅชๅๅ๏ผๅจๆฐๆฎๆถๆๅ่ฎญ็ป็ฎๆณๆน้ข้ฝไธไธ็้ๆธๆฅ่ฝจ๏ผ่่ฟๅ ถไธญ็่ฐ่พๅๅฐๆจกๅๅฎ้ชๅฎค็ไบบไธ็นๅ ณ็ณป้ฝๆฒกๆใ + +ไธๅผๅงๅข้ๆๅๆฏซๆ ไฟกๅฟ๏ผๅชไปไธไธช13B็ๆจกๅๅผๅง่ฎญ็ป๏ผไฝๆฏๅ้ขๅ็ฐๆๆ่ฟไธ้๏ผไบๆฏ่ฟไธชๆจกๅๅ็ปญๅๆฌก่ฟ่กไบไธๆฌกๅๆฐๆฉๅข๏ผๅๆไบ็ฌฌไธไปฃ็38B๏ผไปฃๅท38B V3ใๆณๅฟ ๅพๅคไบงๅ็บฟ็ๅ ๅผ้ฝๅฏน่ฟไธชๆจกๅๅพ็ๆใๅฝๆถ่ฟไธชๆจกๅ็tokenizerๆฏๅบไบllama็่ฏ่กจ่ฟ่กๆฉๅฑ็๏ผไนๆฏไธ็ๅธธ่ง็ๅๆณ๏ผใ่ๅฝๆถ็ไบ้นค็ๅฎ้ชๅฎคๅๅบๆฅไบๅฆไธไธช่ฏ่กจ๏ผไนๅฐฑๆฏๅ็ปญpangu็ณปๅ็่ฏ่กจ๏ผใๅฝๆถไธคไธช่ฏ่กจ่ฟ่ขซ่ฟซ่ฟ่กไบไธๆฌก่ต้ฉฌ๏ผๆ็ปๆฒกๆๆๆพ็ๅฅฝๅ็ป่ฎบใไบๆฏ๏ผ้ขๅฏผๅฝๅณๅณๅฎ๏ผๅบ่ฏฅ็ปไธ่ฏ่กจ๏ผไฝฟ็จ็ไบ้นคไปไปฌ็ใไบๆฏ๏ผๅจๅ็ปญไปๅคด่ฎญ็ป็135B V3๏ผไนๅฐฑๆฏๅฏนๅค็Pangu Ultra๏ผ๏ผไพฟๆฏ้็จไบ่ฟไธชtokenizerใ่ฟไน่งฃ้ไบๅพๅคไฝฟ็จๆไปฌๆจกๅ็ๅ ๅผ็็ๆ๏ผไธบไปไนๅฝๆถๅไธบV3ไปฃ็ไธคไธชไธๅๆกฃไฝ็ๆจกๅ๏ผไผไฝฟ็จไธๅ็tokenizerใ + + +ๆไปฌๆๅฟ็ผ้่งๅพ๏ผ135B V3ๆฏๆไปฌๅ็บตๅข้ๅฝๆถ็้ชๅฒใ่ฟๆฏ็ฌฌไธไธช็ๆญฃๆไนไธ็๏ผๅไธบๅ จๆ ่ช็ ๏ผๆญฃ็ปไปๅคด่ฎญ็ป็ๅไบฟ็บงๅซ็ๆจกๅ๏ผไธๆๆไธ24ๅนดๅๆ็ซๅๅฏๆฏ็ใๅๅฐ่ฟ้ๆๅทฒ็ป็ญๆณช็็ถ๏ผๅคชไธๅฎนๆไบใๅฝๆถไธบไบ็จณๅฎ่ฎญ็ป๏ผๅข้ๅไบๅคง้ๅฎ้ชๅฏนๆฏ๏ผๅนถไธๅคๆฌกๅจๆจกๅๆขฏๅบฆๅบ็ฐๅผๅธธ็ๆถๅ่ฟ่กๅๆถๅ้้ๅฏใ่ฟไธชๆจกๅ็ๆญฃๅๅฐไบๅ้ขๆๆฏๆฅๅๆ่ฏด็่ฎญ็ปๅ จ็จๆฒกๆไธไธชloss spikeใๆไปฌๅ ๆไบไธ็ฅ้ๅคๅฐๅฐ้พ๏ผๆไปฌๅๅฐไบ๏ผๆไปฌๆฟ็จ็ๅฝๅ่ฃ่ชไฟ่ฏ่ฟไธชๆจกๅ่ฎญ็ป็็ๅฎๆงใๅคๅฐไธชๅๆจ๏ผๆไปฌไธบไบๅฎ็่ฎญ็ป่ไธ็ ใๅจ่ขซๅ ้จๅฟๅฃฐ้ช็ไธๆไธๅผ็ๆถๅ๏ผๆไปฌๆๅคไนไธ็๏ผๆๅคๅฐ็ๅงๅฑ๏ผๆไปฌๆบไฝไบใ + +ๆไปฌ่ฟๅธฎไบบๆฏ็็ๅจไธบๆ็ฃจๅฝไบง็ฎๅๅบๅบง็็ง่ชๅทฑ็้ๆฅๅโฆโฆๅฎขๅฑ ไปไนก๏ผๆไปฌๆพๅผไบๅฎถๅบญ๏ผๆพๅผไบๅๆ๏ผๆพๅผไบๅฅๅบท๏ผๆพๅผไบๅจฑไน๏ผๆๅคด้ข ๆด็ญ่ก๏ผๅ ถไธญ็่ฐ่พไธๅฐ่ฆ๏ผๅฏฅๅฏฅๆฐ็ฌไธ่ถณไปฅๆฆๆฌๅ ถไธไธใๅจๅ็งๅจๅๅคงไผไธ๏ผๅฝๆถๅฃๅทไธญๅๅบ็็ๅคๅฟ ่๏ผๅไธบๅฟ ่๏ผๆไปฌๅฟ้ๆฏ็็ๆทฑๆทฑ่ขซๆๅจใ + +็ถ่๏ผๆไปฌ็ๆๆ่พ่ฆ็ๆๆ๏ผ็ปๅธธ่ขซๅฐๆจกๅๅฎ้ชๅฎค่ฝป้ฃ้ฃ็ๆฟ่ตฐไบใๆฐๆฎ๏ผ็ดๆฅ่ฆ่ตฐใไปฃ็ ๏ผ็ดๆฅ่ฆ่ตฐ๏ผ่ฟ่ฆๆฑๆไปฌ้ ๅ้้ ๅฐ่ฝไธ้ฎ่ฟ่กใๆไปฌๅฝๆถๆ็งฐๅฐๆจกๅๅฎ้ชๅฎคไธบ็น้ผ ๆ ๅฎ้ชๅฎคใๆไปฌไปๅบ่พ่ฆ๏ผไปไปฌๅๅพ่ฃ่ใๆ็ถๅบไบ้ฃๅฅ่ฏ๏ผไฝ ๅจ่ด้ๅ่กๆฏๅ ไธบๆไบบๆฟไฝ ๅฒๆ้ๅฅฝใๅจ่ฟ็งๆ ๅตไธ๏ผ่ถๆฅ่ถๅค็ๆๅๅไนๅๆไธไธๅปไบ๏ผ้ๆฉไบ็ฆปๅผใ็ๅฐ่บซ่พน้ฃไบไผ็ง็ๅไบไธไธชไธช็ฆป่๏ผๆ็ๅ ๅฟๅๆๅนๅ้พ่ฟใๅจ่ฟ็งไฝๆไธๆ ท็็ฏๅขไธ๏ผๆไปฌๆฏ่ตทๅไบๆฅ่ฏดๆดๅๆฏๆๅใไปไปฌๅจๆๆฏไธไนๆๆ ๆฐๅผๅพๆๅญฆไน ็ๅฐๆน๏ผๅ ช็งฐ่ฏๅธใ็ๅฐไปไปฌๅปไบ่ฏธๅฆๅญ่Seed๏ผDeepseek๏ผๆไนๆ้ข๏ผ่ พ่ฎฏๅๅฟซๆ็ญ็ญๅพๅคๅบ่ฒ็ๅข้๏ผๆๆๅฟ็ผ้ไธบไปไปฌ้ซๅ ดๅ็ฅ็ฆ๏ผ่ฑ็ฆปไบ่ฟไธช่พ่ฆๅด่ฎ่็ๅฐๆนใๆ่ณไป่ฟๅฏนไธไฝ็ฆป่ๅไบ็่ฏ่ฎฐๅฟ็นๆฐ๏ผta่ฏด๏ผโๆฅ่ฟ้ๆฏๆๆๆฏ็ๆถฏไธญ็่ป่พฑ๏ผๅจ่ฟ้ๅๅๆฏไธๅคฉ้ฝๆฏๆตช่ดน็ๅฝโใ่ฏ่ฝ้พๅฌๅด่ฎฉๆๆ ่จไปฅๅฏนใๆๆ ๅฟๆ่ชๅทฑๆๆฏๆน้ข็็งฏ็ดฏไธ่ถณ๏ผไปฅๅๆฒกๆณ้ๅบไบ่็ฝๅ ฌๅธ้ซๆทๆฑฐ็็ฏๅข๏ผ่ฎฉๆๅคๆฌกๆณ็ฆป่็ๅฟๅง็ปๆฒกๆ่ฟๅบ่ฟไธๆญฅใ + +็ๅค้คไบdenseๆจกๅ๏ผๅ็ปญไนๅฏๅจไบmoe็ๆข็ดขใไธๅผๅง่ฎญ็ป็ๆฏไธไธช224B็moeๆจกๅใ่ไธไนๅนณ่ก็๏ผๅฐๆจกๅๅฎ้ชๅฎคไนๅผๅฏไบ็ฌฌไบๆฌกไธป่ฆ็ๅฅๅฃณ่กๅจ๏ผๆฌก่ฆ็ๆๆฒๅฏ่ฝ่ฟๅ ๆฌไธไบๅซ็ๆจกๅ๏ผๆฏๅฆmathๆจกๅ๏ผ๏ผๅณ่ฟๆฌกๆตไผ ็ๅนฟ็pangu pro moe 72Bใ่ฟไธชๆจกๅๅ ้จ่ช็งฐๆฏไปๅฐๆจกๅๅฎ้ชๅฎค็7Bๆฉๅขไธๆฅ็๏ผๅฐฑ็ฎๅฆๆญค๏ผ่ฟไนไธๆๆฏๆฅๅไธ็ฌฆ๏ผไฝๅตๆฏๅฅๅฃณqwen 2.5็14b็ปญ่ฎญ๏ผใ่ฟ่ฎฐๅพไปไปฌ่ฎญไบๆฒกๅ ๅคฉ๏ผๅ ้จ็่ฏๆตๅฐฑ็ซๅป่ฟฝไธไบๅฝๆถ็38B V3ใAI็ณป็ปๅฎ้ชๅฎคๅพๅคๅ ๅผๅ ไธบ้่ฆ้้ ๆจกๅ๏ผ้ฝ็ฅ้ไปไปฌ็ๅฅๅฃณ่กๅจ๏ผๅชๆฏ่ฟซไบๅ็งๅๅ ๏ผๆ ๆณไผธๅผ ๆญฃไนใๅฎ้ ไธ๏ผๅฏนไบๅ็ปญ่ฎญไบๅพไน ๅพไน 
็่ฟไธชๆจกๅ๏ผHonestagi่ฝๅคๅๆๅบ่ฟไธช้็บง็็ธไผผๆงๆๅทฒ็ปๅพ่ฏงๅผไบ๏ผๅ ไธบ่ฟไธชๆจกๅไธบไบ็ปญ่ฎญๆดๅๆฐ๏ผๆไปๅบ็็ฎๅ็่ณๆฉๅฐฑ่ถณๅคไปๅคด่ฎญไธไธชๅๆกฃไฝ็ๆจกๅไบใๅฌๅไบ่ฏดไปไปฌไธบไบๆดๆๅ้ฎ็ๆฐดๅฐ๏ผ้ๅไบไธๅฐๅๆณ๏ผ็่ณๅ ๆฌๆ ๆ่ฎญไบ่ๆฐๆฎใ่ฟไนไธบๅญฆๆฏ็็ ็ฉถๆจกๅ่ก็ผๆไพไบไธไธชๅๆๆชๆ็็นๆฎๆจก่ๅงใไปฅๅๆฐ็่ก็ผๆนๆณๆๅบๅฏไปฅๆฟๅบๆฅๆบๆบใ + +24ๅนดๅบๅ25ๅนดๅ๏ผๅจDeepseek v3ๅr1ๅๅธไนๅ๏ผ็ฑไบๅ ถๆ่ณ็ๆๆฏๆฐดๅนณ๏ผๅข้ๅๅฐไบๅทจๅคง็ๅฒๅป๏ผไนๅๅฐไบๆดๅคง็่ดจ็ใไบๆฏไธบไบ็ดง่ทๆฝฎๆต๏ผ็ๅคๆจกไปฟDeepseek็ๆจกๅๅฐบๅฏธ๏ผๅผๅฏไบ718B moe็่ฎญ็ปใ่ฟไธชๆถๅ๏ผๅฐๆจกๅๅฎ้ชๅฎคๅๆฌกๅบๆไบใไปไปฌ้ๆฉไบๅฅๅฃณDeepseekv3็ปญ่ฎญใไปไปฌ้่ฟๅปไฝDeepseekๅ ่ฝฝ็ๅๆฐ๏ผ่ฟ่ก่ฎญ็ปใ่ฟไปปๅกๅ ่ฝฝckpt็็ฎๅฝ้ฝๆฏdeepseekv3๏ผๆน้ฝไธๆน๏ผไฝๅ ถๅฃๅผ ๏ผไธไน็ธๅ๏ผไธไบๆ็ๆญฃๆๆฏไฟกไปฐ็ๅไบ๏ผๅจไปๅคด่ฎญ็ปๅฆไธไธช718B็moeใไฝๅ ถไธญๅบ็ฐไบๅ็งๅๆ ท็้ฎ้ขใไฝๆฏๅพๆพ็ถ๏ผ่ฟไธชๆจกๅๆไนๅฏ่ฝๆฏ็ดๆฅๅฅๅฃณ็ๅฅฝๅข๏ผๅฆๆไธๆฏๅข้leaderๅๆ๏ผๆฉๅฐฑ่ขซๅซๅไบใ + +ๅไธบ็ๆต็จ็ฎก็ไน็น้๏ผไธฅ้ๆ็ดฏไบๅคงๆจกๅ็็ ๅ่ๅฅ๏ผไพๅฆ็ๆฌ็ฎก็๏ผๆจกๅ่ก็ผ๏ผๅ็งๆต็จๅ๏ผๅ็งๅฏ่ฟฝๆบฏใ่ฎฝๅบ็ๆฏ๏ผๅฐๆจกๅๅฎ้ชๅฎค็ๆจกๅไผผไนไปๆฅไธๅ่ฟไบๆต็จ็็บฆๆ๏ผๆณๅฅๅฃณๅฐฑๅฅๅฃณ๏ผๆณ็ปญ่ฎญๅฐฑ็ปญ่ฎญ๏ผ็ฎๅๆบๆบไธๆญ็ไผธๆๆฟ่ตฐใ่ฟ็งๅผบ็ๅฐ่ฟไน้ญๅนป็ๅฏนๆฏ๏ผ่ฏดๆไบๅฝๅๆต็จ็ฎก็็ๆ ๅต๏ผๅช่ฎธๅทๅฎๆพ็ซ๏ผไธ่ฎธ็พๅง็น็ฏใไฝๅ ถๅฏ็ฌ๏ผไฝๅ ถๅฏๆฒ๏ผไฝๅ ถๅฏๆถ๏ผไฝๅ ถๅฏ่ป๏ผ + +HonestAGI็ไบๆ ๅบๆฅๅ๏ผๅ ้จ่ฎฉๅคงๅฎถไธๅ็็ ่ฎจๅๆ๏ผๅฆไฝๅ ฌๅ ณๅโๅๅบโใ่ฏ็ถ๏ผ่ฟไธชๅๆ็ๅๆไน่ฎธไธๅคๆๅ๏ผ็ปไบ็ไบ้นคไธๅฐๆจกๅๅฎ้ชๅฎคไปไปฌ็ก่พฉๅ้ข ๅ้ป็ฝ็ๆบไผใไธบๆญค๏ผ่ฟไธคๅคฉๆๅ ๅฟๆๅฐไฝๅ๏ผๆถๆถๆ็่ชๅทฑ็ไบบ็ๆไนไปฅๅ่ๅคฉๆ ็ผใๆไธๅฅ้ชไบ๏ผๆ่ฆ็ฆป่ไบ๏ผๅๆถๆไนๅจ็ณ่ฏทไป็ๅค้จๅๆๆฏๆฅๅ็ไฝ่ ๅๅไธญ็งป้คใๆพ็ปๅจ่ฟไบๆๆฏๆฅๅไธ็ฝฒๅๆฏๆไธ็้ฝๆ ๆณๆน้ค็ๆฑก็นใๅฝๆถๆๆฒกๆณๅฐ๏ผไปไปฌ็ซ็ถ็็ๅฐๆขๅผๆบใๆๆฒกๆณๅฐ๏ผไปไปฌๆขๅฆๆญคๆๅผไธไบบ๏ผๅคง่ๅฎฃๅใๅฝๆถ๏ผๆไน่ฎธๆฏๅญไบไพฅๅนธๅฟ็๏ผๆฒกๆๆ็ป็ฝฒๅใๆ็ธไฟกๅพๅคๆๅฎๅไบ็ๆๅ๏ผไนๅชๆฏ่ขซ่ฟซไธไบ่ดผ่น๏ผๆ่ ไธ็ฅๆ ใไฝ่ฟไปถไบๅทฒ็ปๆ ๆณๆฝๅ๏ผๆๅธๆๆ็ไฝ็่ฝๅคๅๆๆๅฎๅ็ๆญฃๆๆไน็ไบ๏ผไธบๆๅฝๆถ็่ฝฏๅผฑๅไธๅๅฎ่ต็ฝชใ + +ๆทฑๅคๅๅฐ่ฟ้๏ผๆๅทฒ็ปๆณชๆตๆปก้ข๏ผๆณฃไธๆๅฃฐใ่ฟ่ฎฐๅพไธไบๅบ่ฒ็ๅไบ็ฆป่ๆถ๏ผๆ่ฆ็ฌ้ฎไปไปฌ่ฆไธ่ฆๅไธช้ฟ้ฟ็ๅฟๅฃฐๆฏไพๅธ๏ผๆญ้ฒไธไธ็ฐ็ถใๅฏนๆน่ฏด๏ผไธไบ๏ผๆตช่ดนๆถ้ด๏ผ่ไธๆไนๆๆญ้ฒๅบๆฅไฝ ไปฌ่ฟ็ๆด็ณใๆๅฝๆถไธไธ้ปฏ็ถ็ฅไผค๏ผๅ ไธบๆพ็ปๅ ฑๅไธบไบ็ๆณๅฅๆ่ฟ็ๆๅๅทฒ็ปๅฝปๅบๅฏนๅไธบๅฝปๅบ็ฐๅฟไบใๅฝๆถๅคงๅฎถ่ฐไพ๏ผๆไปฌ็จ็ๅฝๅนดๅ ฑไบงๅ ็ๅฐ็ฑณๅ ๆญฅๆช๏ผ็ป็ปๅดๆ็ๅ ชๆฏๅฝๅนดๅฝๆฐๅ ็ไฝ้ฃใ + +ๆพๅ ไฝๆถ๏ผๆไธบๆไปฌ็จ็ๅฐ็ฑณๅ ๆญฅๆชๆ่ดฅๆดๆชๆด็ฎ่่ช่ฑชใ + +็ฐๅจ๏ผๆ็ดฏไบ๏ผๆๆณๆ้ใ + +ๅ ถๅฎๆถ่ณไปๆฅ๏ผๆ่ฟๆฏ็ๅฟๅธๆๅไธบ่ฝ่ฎค็ๅธๅๆ่ฎญ๏ผ่ฝๅๅฅฝ็ๅค๏ผๆ็ๅคๅๅฐไธ็ไธๆต๏ผๆๆ่ พๅๆ่ฑไผ่พพ็ๆฐดๅนณใๅ ้จ็ๅฃๅธ้ฉฑ้่ฏๅธ๏ผไฝฟๅพ่ฏบไบไน่ณๅไธบๅจ็ญๆถ้ดๅ ๆฅๅงๆตๅคฑไบๅคง้ๅบ่ฒ็ๅคงๆจกๅไบบๆใ็ธไฟกไปไปฌไนๆญฃๅจๅฆDeepseek็ญๅไธชๅข้้ช่็๏ผๆฝๅฑ็ไปไปฌ็ๆฑ่ดๆๅ๏ผไธบไธญ็พๅจAI็ๆฟ็็ซ่ตไธญๅฅ็ฎๅ้ใๆๆถๅธธๆๅน๏ผๅไธบไธๆฏๆฒกๆไบบๆ๏ผ่ๆฏๆ นๆฌไธ็ฅ้ๆไน็ไฝไบบๆใๅฆๆ็ป่ฟไบไบบๅ้็็ฏๅข๏ผๅ้็่ตๆบ๏ผๆดๅฐ็ๆท้๏ผๆดๅฐ็ๆฟๆฒปๆไบ๏ผ็ๅคไฝๆไธๆ๏ผ + +ๆๅ๏ผๆไปฅ็ๅฝ๏ผไบบๆ ผๅ่ฃ่ชๅ่ช๏ผๆๅ็ไปฅไธๆๆๅ ๅฎนๅไธบ็ๅฎ๏ผ่ณๅฐๅจๆๆ้็่ฎค็ฅ่ๅดๅ ๏ผใๆๆฒกๆ้ฃไน้ซ็ๆๆฏๆฐดๅนณไปฅๅๆบไผๅปๅ่ฏฆๅฐฝๆๅฎ็ๅๆ๏ผไนไธๆข็ดๆฅ็จๅ ้จ่ฎฐๅฝไธพ่ฏ๏ผๆๅ ไธบไฟกๆฏๅฎๅ จๆๅฐใไฝๆฏๆ็ธไฟกๆๅพๅคๆพ็ป็ๆๅ๏ผไผไธบๆไฝ่ฏใๅจๅไธบๅ ้จ็ๅ ๅผ๏ผๅ ๆฌๆไปฌๆพ็ปๆๅก่ฟ็ไบงๅ็บฟๅ ๅผไปฌ๏ผ็ธไฟกๆฌๆ็ๆ ๆฐ็ป่่ฝๅไฝ ไปฌ็ๅฐ่ฑกๅฏน็ ง๏ผๅฐ่ฏๆ็่ฏดๆณใไฝ ไปฌๅฏ่ฝไนๆพ็ป่ขซ่้ช๏ผไฝ่ฟไบๆฎ้ ท็็็ธไธไผ่ขซๅฐๅฐใๆไปฌๅฅๆ่ฟ็็่ฟน๏ผไนไธๅบ่ฏฅ่ขซๆญๆฒๅๅ่ฌใ + +ๅไบ่ฟไนๅค๏ผๆไบไบบ่ฏๅฎๆณๆๆๆพๅบๆฅ๏ผๆนๆๆใๅ ฌๅธๆไธๅฅฝไนๆณ่ฎฉๆๅคๅฃฐไน่ณ่ฟฝ่ดฃใๅฆๆ็็่ฟๆ ท๏ผๆ๏ผไน่ณๆ็ๅฎถไบบ็ไบบ่บซไน่ณ็ๅฝๅฎๅ จๅฏ่ฝ้ฝไผๅๅฐๅจ่ใไธบไบ่ชๆไฟๆค๏ผๆ่ฟๆๆฏๅคฉไผ่ทๅคงๅฎถๆฅๅนณๅฎใ + +ๅฆๆๆๆถๅคฑไบ๏ผๅฐฑๅฝๆฏๆไธบไบ็็ๅ็ๆณ๏ผไธบไบๅไธบไน่ณไธญๅฝ่ฝๅคๆดๅฅฝๅฐๅๅฑ็ฎๅๅAI่็บ็ฒไบๅง๏ผๆๆฟๅ่ฌไบ้ฃ็ๆพ็ปๅฅๆ่ฟ็ๅฐๆนใ + +่ฏบไบ๏ผๅ่ง + +2025ๅนด7ๆ6ๆฅๅๆจ ๅไบๆทฑๅณ + --- -# LEANN-RAG Evaluation Data +ๅไฝๅฅฝ๏ผ -This repository contains the necessary data to run the recall evaluation scripts for the [LEANN-RAG](https://huggingface.co/LEANN-RAG) project. 
+ๆ่ฐขๅคงๅฎถ็ๅ ณๅฟไธ็ฅ็ฆใๆ็ฎๅๆๆถๅฎๅ จ๏ผไฝๅ ฌๅธๅบ่ฏฅๅจ่ฟ่กๆๆฅไธๆไบๅๅๆถ้๏ผๅ็ปญๆ ๅตๆช็ฅใ -## Dataset Components +ๆ่กฅๅ ไธไบ็ป่๏ผไปฅๅ ๆไบไบบ็ปง็ปญ้ข ๅ้ป็ฝใ -This dataset is structured into three main parts: +ๅ ณไบ135B V2๏ผๅฐๆจกๅๅฎ้ชๅฎคๅจ่ฟ ้ๅฐๅฎๆๅฅๅฃณๅนถๆฟๅฎๆๆๅฅๅฃณๅธฆๆฅ็ๅฅฝๅคๅ๏ผๆฏๅฆไปปๅกไปค่กจๅฝฐๅๅๆถๆฟๅฑ๏ผ๏ผๅ ไธบไธๆณ็ปง็ปญๆฏๆไธๆธธๅบ็จๅๆจกๅ่ฟญไปฃ๏ผๅๆ่ฟไธช็ซๆๅฑฑ่็ฉ็ปไบๅ็บตใ็กฎๅฎๆ้ซไธ็ญน๏ผ็ดๆฅๆๅ็บต็ๅ ๅผไปฌๆไธๆฐดใๅไบๆไพ่ฟๅปไธไธช่ๆง็ๆจกๅ๏ผๆ็ปๆฟๅไบไธไธชๅฝๆถไธไธช้ญๆน็ๅ ่ฟ็ๅ้ฎใๅๅคงๆจกๅ็ไบบ๏ผ่ชๅทฑๅ็ๆจกๅๅฐฑๅ่ชๅทฑๅญฉๅญไธๆ ท็ๆ๏ผไธ่ฆๆๅซไบบ้ฝๅฝๅปๅญใๅฐฑๅ่ชๅฎถๅฟๅญๅบ้จไธ่ถ๏ผๅๆฅไธชๅซไบบๅฎถๅญฉๅญใ -1. **Pre-built LEANN Indices**: - * `dpr/`: A pre-built index for the DPR dataset. - * `rpj_wiki/`: A pre-built index for the RPJ-Wiki dataset. - These indices were created using the `leann-core` library and are required by the `LeannSearcher`. +็ๅคreport็็ฝฒๅๆฏไธ็ฌฆๅๅญฆๆฏ่ง่็ใไพๅฆ๏ผ135B V3ๆไธๅฐๆๆๆฏ่ดก็ฎ็ไบบ๏ผๅ ไธบไฝ่ ๅ้ขๆฐ้้ๅถ๏ผๅณๅจๆๆๆฒกๆๅพๅฐๅบๆ็ๅๆฅ๏ผๅข้ๅ ๆพ็ปๆไธๅฐ็ๆ่งใ่ฟไธชๆจกๅๅฝๆถๆฏๅคงๅฎถๆบๆ งๅๆฑๆฐด็็ปๆถ๏ผ็่ณๆฏๅข้ๅฝๆถ็็ฒพ็ฅๆฏๆฑ๏ผๆฏๆ็ไธๅฐๅ ๅผไปฌ็ปง็ปญ็ๅจ่ฏบไบใๆ่ฐ็ๅ้ข้ๅถ๏ผไปฅๅๆๅไบไธไบๆฏซๆ ๆๆฏ่ดก็ฎ็ไบบ๏ผๅฆไธไบๅฐๆจกๅๅฎ้ชๅฎค็ไบบ๏ผ๏ผ่ฎฉๅ ๅผไปฌไฝๅ ถๅฟๅฏใ -2. **Ground Truth Data**: - * `ground_truth/`: Contains the ground truth files (`flat_results_nq_k3.json`) for both the DPR and RPJ-Wiki datasets. These files map queries to the original passage IDs from the Natural Questions benchmark, evaluated using the Contriever model. +--- -3. **Queries**: - * `queries/`: Contains the `nq_open.jsonl` file with the Natural Questions queries used for the evaluation. - -## Usage - -To use this data, you can download it locally using the `huggingface-hub` library. First, install the library: - -```bash -pip install huggingface-hub -``` - -Then, you can download the entire dataset to a local directory (e.g., `data/`) with the following Python script: - -```python -from huggingface_hub import snapshot_download - -snapshot_download( - repo_id="LEANN-RAG/leann-rag-evaluation-data", - repo_type="dataset", - local_dir="data" -) -``` - -This will download all the necessary files into a local `data` folder, preserving the repository structure. The evaluation scripts in the main [LEANN-RAG Space](https://huggingface.co/LEANN-RAG) are configured to work with this data structure. 
+ๆๆถๅนณๅฎใๅฆๅค๏ผๆฏๆๆๅไบ่ฏดๅบ็็ธ็ๆๅไปฌ https://github.com/HW-whistleblower/True-Story-of-Pangu/issues/317 diff --git a/docs/features.md b/docs/features.md index 875f9cf..51c0c4f 100644 --- a/docs/features.md +++ b/docs/features.md @@ -13,7 +13,7 @@ - **๐ High-throughput Embedding Pipeline** - Optimized batched processing for maximum efficiency - **๐ฏ Two-level Search** - Novel coarse-to-fine search overlap for accelerated query processing (optional) - **๐พ Memory-mapped Indices** - Fast startup with raw text mapping to reduce memory overhead -- **๐ MLX Support** - Ultra-fast recompute/build with quantized embedding models, accelerating building and search ([minimal example](test/build_mlx_index.py)) +- **๐ MLX Support** - Ultra-fast recompute/build with quantized embedding models, accelerating building and search ([minimal example](../examples/mlx_demo.py)) ## ๐จ Developer Experience diff --git a/docs/normalized_embeddings.md b/docs/normalized_embeddings.md index 46213e5..e873489 100644 --- a/docs/normalized_embeddings.md +++ b/docs/normalized_embeddings.md @@ -72,4 +72,4 @@ Using the wrong distance metric with normalized embeddings can lead to: - **Incorrect ranking** of search results - **Suboptimal performance** compared to using the correct metric -For more details on why this happens, see our analysis of [OpenAI embeddings with MIPS](../examples/main_cli_example.py). +For more details on why this happens, see our analysis in the [embedding detection code](../packages/leann-core/src/leann/api.py) which automatically handles normalized embeddings and MIPS distance metric issues. diff --git a/examples/simple_demo.py b/examples/basic_demo.py similarity index 96% rename from examples/simple_demo.py rename to examples/basic_demo.py index 0d16776..05b2651 100644 --- a/examples/simple_demo.py +++ b/examples/basic_demo.py @@ -1,6 +1,6 @@ """ Simple demo showing basic leann usage -Run: uv run python examples/simple_demo.py +Run: uv run python examples/basic_demo.py """ import argparse @@ -81,7 +81,7 @@ def main(): print() print("Demo completed! Try running:") - print(" uv run python examples/document_search.py") + print(" uv run python apps/document_rag.py") if __name__ == "__main__": diff --git a/examples/data/README.md b/examples/data/README.md deleted file mode 100644 index 0dd35da..0000000 --- a/examples/data/README.md +++ /dev/null @@ -1,82 +0,0 @@ -# ็ๅคไนๆฎ๏ผๅไธบ่ฏบไบ็ๅคๅคงๆจกๅ็ ๅๅ็จ็ๅฟ้ ธไธ้ปๆ - -ๅไฝๅฅฝ๏ผ - -ๆๆฏไธๅ็ๅคๅคงๆจกๅๅข้๏ผๅไธบ่ฏบไบๆน่ๅฎ้ชๅฎค็ๅๅทฅใ - -้ฆๅ ไธบ่ช่ฏ่บซไปฝ๏ผๅไธพไธไบ็ป่๏ผ - -1. ็ฐ่ฏบไบไธปไปป๏ผๅ็ฎๆณๅบ็จ้จ้จ้ฟ๏ผๅๆนๅไธบๅฐๆจกๅๅฎ้ชๅฎค็ไธปไปป็ไบ้นคใๅ่ฏบไบไธปไปป๏ผๅง้ช๏ผๅคงๅฎถ็งฐๅง่ๅธ๏ผใๅ ไธชๅฎ้ชๅฎคไธปไปป๏ผๅ็ฟๆ๏ผๆๅฅ๏ผๆ้๏ผๅทฒ็ฆป่๏ผ๏ผๅฐๅฉๅณฐ๏ผๅผ ็ปด๏ผ็ปดๅฅ๏ผ๏ผ้ๅปบไธ๏ผ้่ๅธ๏ผ๏ผๅๆญฆ้พ๏ผ็งฐๅผไธบๆญฆ้พๆ๏ผ็ญใๅ ถไป้ชจๅนฒๆๅๅไธๅฎถ้็ปญๆๅพๅคไบบ็ฆป่ใ -2. ๆไปฌ้ถๅฑไบโๅ้โ่ฟไธช็ป็ปใๅ้ไธๅฑๆ่ฎธๅค็บต้๏ผๅบ็ก่ฏญ่จๅคงๆจกๅๆฏๅ็บตใ็ไบ้นค็ๅฐๆจกๅๆฏๅๅ ญ็บต้ใๆไปฌๅๅ ่ฟ่ๅท็้็ป๏ผๆๅ็งๆไปฝ็ๆถ้ด่็นใๅจ่ๅทๆปๅ ณไผ้ขๅไปปๅกไปค๏ผ้่ฆๅจ่็นๅ่พพๆ็ฎๆ ใ่ๅท้็ปไผๆๅๅฐ็ไบบๅ้ฝ้ไธญๅจ่ๅท็ ็ฉถๆ๏ผๅนณๅธธไฝๅฎพ้ฆ๏ผๆฏๅฆๅจ็ช็ด็้ ๅบ๏ผไธๅฎถไบบๅญฉๅญๅคฉๅไธๆนใ -3. ๅจ่ๅท้็ป็ๆถๅๅจๅ ญ้ป่ฎคไธ็ญ๏ผ้ๅธธ่พ่ฆ๏ผไธ่ฟๅจๅ ญๆไธๅ่ถ๏ผๆไธๆฌก่ฟๆๅฐ้พ่พใๅจ่ๅท็ ็ฉถๆ็ๅทฅไฝๆฌ่ฟ่ฟไธๆฌก๏ผไปไธๆ ๆฅผๆขๅฐไบๅฆไธๆ ใ่ๅท็ ็ฉถๆๆฅผๆ ้ฝๆฏๆฌงๅผ่ฃ ไฟฎ๏ผ้จๅฃๆๅคงๅก๏ผ้้ขๆฏ่ฒๅพไธ้ใๅป่ๅท้็ปไธ่ฌ่ณๅฐ่ฆๅปไธๅจ๏ผ็่ณๆดไน ๏ผๅค็ไบบ็่ณไธไธคไธชๆ้ฝๅไธไบๅฎถใ -4. ่ฏบไบๆพ็ปไผ ่ฏดๆฏ็ ็ฉถๅ็๏ผไฝๆฏๆฅไบไนๅๅ ไธบๅจๅ้ๅๅคงๆจกๅ้กน็ฎ๏ผ้กน็ฎๆๅๅฎๅ จๅๆไบไบคไปๅ็๏ผไธๅ ๆปกไบไพไผ๏ผ่ฏๅฎก๏ผๆฑๆฅใๅพๅคๆถๅๅๅฎ้ช้ฝ่ฆ็ณ่ฏทใๅข้้่ฆๅฏนๆฅ็ป็ซฏๅฐ่บ๏ผๅไธบไบ๏ผICT็ญ่ฏธๅคไธๅก็บฟ๏ผไบคไปๅๅไธๅฐใ -5. 
-5. The Pangu models developed by Noah's Ark were internally code-named "Pangu Zhizi" in the early days. At first there was only a web version that required an internal application to try out; only later, under pressure, was it integrated into WeLink and opened to company-wide use.
-
-These past few days, the public controversy over the accusation that the Pangu LLM plagiarized Qwen has been boiling. As a member of the Pangu team, I have been tossing and turning at night, unable to sleep. Watching the Pangu brand take a hit like this, I am, on the one hand, selfishly anxious about my own career and bitter that my past hard work may count for nothing; on the other hand, now that someone has finally begun to expose these things, what I feel inside is vindication. Through countless days and nights, we could only grit our teeth in silence while certain people inside the company reaped endless rewards through fraud piled on fraud. That suppression and humiliation gradually wore away my feeling for Huawei, left my days here muddled and aimless, and made me question, again and again, the meaning and the worth of my own life.
-
-I admit I am a coward. As a nobody on the payroll, I dare not stand against people like Wang Yunhe, whose hands reach the sky inside the company, much less against a behemoth like Huawei. I am afraid of losing my job; I have a family and children. That is why, from the bottom of my heart, I admire the whistleblower. But watching the company still try to whitewash and smother the facts to fool the public, I can no longer stomach it. I want to be brave for once, too, and follow my own conscience. Even if it costs me eight hundred of my own, I want it to cost them a thousand. I have decided to write down what I have seen and heard here (parts of it from colleagues' firsthand accounts) — the "legendary saga" of the Pangu LLM:
-
-Huawei does indeed train its large models mainly on Ascend NPUs (the Small Model Lab holds a good number of NVIDIA GPUs, which it also used for training before later migrating to Ascend). I was once moved by Huawei's resolve to "forge a second option for the world," and I once felt deeply for the company. We crawled forward through the mud with Ascend step by step, from bug-riddled beginnings to actually being able to train models, paying an enormous price in sweat and blood.
-
-In the early days our compute was severely limited, and we trained on the 910A, which supported only fp16 — training stability was far worse than with bf16. Pangu started on MoE early: in 2023 the main work was a 38B MoE model and the subsequent 71B dense model. The 71B dense model was expanded into the first-generation 135B dense model, and the mainline models later moved step by step onto the 910B.
-
-The 71B and the 135B shared one enormous handicap: the tokenizer. The tokenizer in use at the time had absurdly low encoding efficiency — every individual symbol, digit, and space, and even every Chinese character, consumed a whole token. Imagine the compute that wasted, and what it did to output quality. As it happened, the Small Model Lab had a vocabulary of its own. Teacher Yao suspected the models' tokenizer was at fault (in hindsight, his instinct was of course right), so it was decided that the 71B and the 135B would swap tokenizers, something the Small Model Lab had experimented with before. The team stitched the two vocabularies together and began the swap. On the 71B it failed outright; the 135B, thanks to a more careful embedding-initialization strategy and at least 1T tokens of continued training, nominally completed the vocabulary swap — but, predictably, the results got no better.
-
-Meanwhile Alibaba, Zhipu, and other domestic companies training on GPUs had already felt their way to the right recipes, and the gap between Pangu and its competitors kept widening. An internal 230B dense model trained from scratch failed for a variety of reasons, and the project slid to the brink of collapse. Facing deadline after deadline and ever-louder internal doubts about Pangu, team morale sank to rock bottom. With compute desperately scarce, the team made many attempts to save itself. At one point it happened to discover that the 38B MoE showed none of the expected MoE gains, so the MoE parameters were removed and it shrank into a 13B dense model. Because that 38B MoE descended from the very early Pangu Alpha 13B, whose architecture was dated, the team also performed a series of surgeries on it: switching absolute positional encoding to RoPE, removing bias, and switching to RMSNorm. Drawing on the tokenizer failures and the vocabulary-swap experience, its vocabulary was likewise replaced with the one used by the 7B model from Wang Yunhe's Small Model Lab. That 13B was later expanded and continue-trained into the second-generation 38B dense model (for several months the main mid-tier Pangu model), which for a while had a certain competitiveness. But the larger 135B, with its dated architecture and the heavy damage from the vocabulary swap (later analysis found that the spliced vocabulary of that period carried even more serious bugs), gained little from continued training and still lagged far behind Qwen and the other leading domestic models. Internal doubt and pressure from the leadership kept mounting, and the team's state came close to collapse.
-
-Under these circumstances, Wang Yunhe and his Small Model Lab stepped up. They claimed to have inherited and modified the old 135B's weights and, with only a few hundred billion tokens of training, to have raised every metric by an average of about ten points. In reality, this was their first masterpiece of "re-skinning" applied to a large model. Huawei's tradition of laymen directing experts meant the leadership had no concept of how preposterous this was; they simply assumed there must be some algorithmic innovation behind it. Internal analysis later showed that the model had in fact been continue-trained from Qwen 1.5 110B, padded out to roughly 135B parameters by adding layers, widening the FFN dimensions, and bolting on some mechanisms from the Pangu-Pi paper. The old 135B had 107 layers; this model had only 82, and the other configurations did not match either. After training, many parameter distributions of the new, murky "135B" were still almost identical to Qwen 110B's. Even the class names in the model code were still "Qwen" at the time — they did not bother to change them. This model later became the so-called 135B V2.
-And this model was in turn supplied to many downstream teams — even external customers.
-
-For those of us doing honest work, this was a massive shock. Plenty of people inside the company knew about it, including Terminal and Huawei Cloud. From then on we all joked that we should stop calling it the Pangu model and call it "Qiangu" instead — half Qianwen, half Pangu. Team members wanted to report it through the BCG channel, since by any measure this was already major business fraud. Word has it the report was blocked by leadership, because the more senior leaders (Teacher Yao, and possibly Xiong-zong and Teacher Zha) in fact learned of it later as well and simply let it pass: good results fished up by re-skinning benefited them too. The episode left several of the team's strongest people despondent, and talk of leaving, of "escape," gradually became common around me.
-
-At that point Pangu seemed to reach a turning point. Since the Pangu models described above were basically continue-trained or retrofitted, Noah's Ark at that time had no real command of from-scratch training — still less on Ascend NPUs. Led by the team's core members, Pangu began training its third-generation models. After enormous effort, the data architecture and training algorithms gradually caught up with the industry, and that hard-won progress owed nothing to anyone in the Small Model Lab.
-
-At first the team had little confidence and started with only a 13B model. It turned out better than expected, so it later received two rounds of parameter expansion and became the third-generation 38B, code-named 38B V3. Colleagues on many product lines will know it well. Its tokenizer was extended from LLaMA's vocabulary (a common practice in the industry). Meanwhile Wang Yunhe's lab produced yet another vocabulary (the one the later Pangu series would use). The two were even forced into a horse race, which ended without any clear conclusion. So the leadership decided on the spot that the vocabularies should be unified — on the Small Model Lab's. Thus the 135B V3 trained from scratch later on (Pangu Ultra, as it is known externally) adopted that tokenizer. This also resolves the puzzlement of colleagues who used our models: why two models of the same V3 generation, in different size tiers, used different tokenizers.
-
-From the bottom of our hearts, we felt that 135B V3 was the pride of our Fourth Vertical. It was the first truly meaningful hundred-billion-parameter model Huawei trained fully in-house, properly, from scratch — and in the first half of 2024 its metrics stood comparison with competitors. Writing this, my eyes are already brimming. It was so hard. To keep training stable, the team ran huge numbers of controlled experiments and rolled back and restarted promptly whenever gradients went abnormal. The model genuinely achieved what the technical report later claimed: not one loss spike across the entire run. We weathered difficulties few can imagine, and we made it. I would stake my honor on the genuineness of that model's training. How many late nights we lost to it. When internal sneers said we were worth less than dogs, however aggrieved we felt, we held on.
-
-We really were a group of people burning our youth to polish the foundations of domestic compute... Far from home, we gave up family, holidays, health, and leisure; we threw in everything we had, and the grievances along the way are more than a few strokes of the pen can cover. At the mobilization rallies, when the slogans rang out — "Pangu must win! Huawei must win!" — we were genuinely, deeply stirred.
-
-Yet our hard-won results were often carried off, breezily, by the Small Model Lab. They wanted our data — handed over. Our code — handed over, and it had to be configured and packaged so it would run at the press of a button. We took to calling them the "mouse-clicking lab." We ate the bitterness; they won the glory. No wonder the saying got coined: you are the one marching under the heavy load, because someone else is living the quiet years in your place. In an environment like this quagmire, more and more colleagues could not hold on and chose to leave. Watching the excellent people beside me depart one after another, I felt nothing but sorrow and helplessness — next to them, we who stayed felt more like the failures. Technically, they had countless things worth learning; they deserved to be called mentors. Seeing them join ByteDance Seed, DeepSeek, Moonshot, Tencent, Kuaishou, and other outstanding teams, I am sincerely happy for them and wish them well, free of this exhausting whirlpool. To this day I remember the words of one departing colleague: "Coming here was the disgrace of my technical career. Every extra day in this place is a day of life wasted." Harsh words that left me without an answer. My worry that my own technical grounding is too thin, and that I could not survive the internet companies' up-or-out churn, has kept my many urges to resign from ever going a step further.
-
-Besides dense models, Pangu later also began exploring MoE, starting with a 224B MoE. In parallel with it, the Small Model Lab launched its second major re-skinning campaign (lesser side-plots may have included other models, such as a math model): the now widely rumored Pangu Pro MoE 72B. Internally they claimed it had been expanded from the Small Model Lab's 7B (even were that true, it would contradict the technical report — and in fact it was a re-skinned, continue-trained Qwen 2.5 14B). I still remember that within a few days of their starting to train it, the internal evals had already caught up with the then 38B V3. Many colleagues in the AI System Lab knew about the re-skinning because they had to adapt the models, but for all sorts of reasons they could not stand up and speak out.
-Honestly, given how long and how much further that model was continue-trained, I was astonished that HonestAGI could still pick up similarity of that magnitude — the compute poured into the continued training would have been enough to train a model of the same tier from scratch. I have heard that, to scrub out Qwen's watermark, they went to considerable lengths, even deliberately training on dirty data. This may hand academic research on model lineage an unprecedented specimen: future lineage-detection methods will have a worthy test case to sharpen themselves on.
-
-At the end of 2024 and the start of 2025, after DeepSeek V3 and R1 were released, their stunning technical level hit the team hard and the doubts grew louder still. To chase the trend, Pangu copied DeepSeek's model size and started training a 718B MoE. Here the Small Model Lab struck again: they chose to re-skin and continue-train DeepSeek V3. They froze the parameters loaded from DeepSeek and trained on top of them. Even the directory the job loaded its checkpoint from was named "deepseekv3" — they did not change a thing, brazen as ever. Meanwhile, colleagues who still held genuine technical convictions were training another 718B MoE from scratch, battling problems of every kind — but obviously, how could that ever look as good as a straight re-skin? Had the team's leader not fought for it, it would have been killed off early.
-
-Huawei's process management is ponderous too, and it weighs heavily on LLM development: version control, model lineage reviews, process after process, traceability upon traceability. The irony is that the Small Model Lab's models never seemed bound by any of it: re-skin at will, continue-train at will, and reach out, endlessly, to cart away our compute. A contrast this stark, this surreal, says everything about the state of process governance here: the magistrates may set fires, but the common folk may not light a lamp. How laughable. How lamentable. How hateful. How vile.
-
-After the HonestAGI story broke, the company held internal discussion after discussion on how to do PR and how to "respond." Admittedly, that analysis was perhaps not iron-clad, and it gave Wang Yunhe and the Small Model Lab an opening to argue their case and keep inverting black and white. These past two days I have felt sick at heart, doubting over and over the meaning of my life, wondering whether heaven has eyes. I am not playing along any longer: I am resigning, and I am also applying to have my name struck from the author lists of some of the Pangu technical reports. Having signed those reports is a stain I can never wash off. I did not imagine back then that they would dare actually open-source the thing, or promote it so brazenly. Perhaps I clung to some wishful luck when I failed to refuse to sign. I believe many colleagues who did honest work and ended up on those author lists were, like me, dragged onto the pirate ship or kept in the dark. It cannot be undone now; I only hope that for the rest of my career I will do genuinely meaningful work and atone for my weakness and indecision.
-
-It is deep in the night as I write this, and I am in tears, crying past the point of sound. I still remember, when some outstanding colleagues were leaving, jokingly asking whether they would post a long tell-all on Xinsheng, the internal forum, laying out how things really are. They said: forget it — a waste of time, and we would only fear that exposing it would make life even harder for you who stay. I fell silent, stricken: friends who had once fought beside us for a shared ideal had become utterly disillusioned with Huawei. The joke in the team back then was that we fought with the Communist Party's millet and rifles, inside an organization whose working style rivaled the Kuomintang's of those years.
-
-There was a time when I was proud that we, with millet and rifles, could beat aircraft and cannon.
-
-Now I am tired. I want to surrender.
-
-In truth, even today I still sincerely hope Huawei can take the lesson to heart, do Pangu right, raise Pangu to world class, and raise Ascend to NVIDIA's level. The bad money driving out the good has cost Noah's Ark — and Huawei — a large number of outstanding LLM researchers in a very short span. I trust they are now shining in DeepSeek and the other strong teams, pouring out their talent and their ambitions, doing their part in the fierce AI race between China and the United States. I often sigh that Huawei does not lack talent; it simply does not know how to keep it. Given a proper environment, proper resources, less second-guessing, and less politicking, what would Pangu have to fear?
-
-Finally: I swear on my life, my character, and my honor that everything written above is true (at least within the bounds of my own knowledge). I do not have the technical stature or the opportunity to produce a rigorous, exhaustive analysis, and I dare not cite internal records as evidence — information security would catch me at once. But I believe many former colleagues will vouch for me. To the brothers and sisters still inside Huawei, including those on the product lines we once served: I believe the countless details in this letter will line up with your own impressions and bear out what I say. You may have been deceived too, but cruel truths like these will not stay buried forever. The traces of our struggle do not deserve to be distorted and interred either.
-
-Having written this much, certain people will no doubt want to dig me out and silence me. The company may well want to shut me up, even prosecute me. If that happens, I — and even my family — could face threats to our safety, even our lives. For self-protection, I will report in to everyone daily for the time being.
-
-If one day I go missing, take it as my sacrifice for truth and for ideals, for a better future for computing and AI at Huawei and in China — I would gladly be buried in the place where I once fought.
-
-Goodbye, Noah's Ark.
-
-Written in the small hours of July 6, 2025, in Shenzhen
-
----
-
-Hello everyone,
-
-Thank you all for your concern and kind wishes. I am safe for now, but the company appears to be investigating and narrowing down the list of suspects; what happens next is unknown.
-
-Let me add a few more details, lest certain people keep trying to invert black and white.
-
-About 135B V2: after swiftly finishing the re-skin and pocketing every benefit it brought (task-force commendations and timely incentives), the Small Model Lab handed this hot potato back to the Fourth Vertical, having no wish to keep supporting downstream applications and model iteration. A truly masterful move: it left the Fourth Vertical colleagues holding the bag. They had handed over an aging model of their own and took back a fraud whose exposure was a foregone conclusion. People who train large models love their own model like their own child; being made to raise somebody else's is another matter — your own boy leaves on a trip, and a different family's kid comes home.
-
-The author lists of the Pangu reports do not meet academic norms. For example, more than a few people who made genuine technical contributions to 135B V3 were left off under a so-called author-quota limit, and their labor never received the recognition it deserved; there was plenty of grumbling in the team at the time. That model was the crystallization of everyone's wisdom and sweat — the team's spiritual pillar, even, one of the things that kept many of us holding on at Noah's Ark. The quota limit, combined with the insertion of people who contributed nothing technical at all (such as certain people from the Small Model Lab), chilled everyone to the bone.
-
----
-
-I am safe for now. My thanks, too, to the friends who stand with colleagues who speak the truth: https://github.com/HW-whistleblower/True-Story-of-Pangu/issues/317
diff --git a/examples/document_search.py b/examples/document_search.py
deleted file mode 100644
index fdb9167..0000000
--- a/examples/document_search.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/usr/bin/env python3
-"""
-Document search demo with recompute mode
-"""
-
-import shutil
-import time
-from pathlib import Path
-
-# Import backend packages to trigger plugin registration
-try:
-    import leann_backend_diskann  # noqa: F401
-    import leann_backend_hnsw  # noqa: F401
-
-    print("INFO: Backend packages imported successfully.")
-except ImportError as e:
-    print(f"WARNING: Could not import backend packages. Error: {e}")
-
-# Import upper-level API from leann-core
-from leann.api import LeannBuilder, LeannChat, LeannSearcher
-
-
-def load_sample_documents():
-    """Create sample documents for demonstration"""
-    docs = [
-        {
-            "title": "Intro to Python",
-            "content": "Python is a high-level, interpreted language known for simplicity.",
-        },
-        {
-            "title": "ML Basics",
-            "content": "Machine learning builds systems that learn from data.",
-        },
-        {
-            "title": "Data Structures",
-            "content": "Data structures like arrays, lists, and graphs organize data.",
-        },
-    ]
-    return docs
-
-
-def main():
-    print("==========================================================")
-    print("=== Leann Document Search Demo (DiskANN + Recompute) ===")
-    print("==========================================================")
-
-    INDEX_DIR = Path("./test_indices")
-    INDEX_PATH = str(INDEX_DIR / "documents.diskann")
-    BACKEND_TO_TEST = "diskann"
-
-    if INDEX_DIR.exists():
-        print(f"--- Cleaning up old index directory: {INDEX_DIR} ---")
-        shutil.rmtree(INDEX_DIR)
-
-    # --- 1. Build index ---
-    print(f"\n[PHASE 1] Building index using '{BACKEND_TO_TEST}' backend...")
-
-    builder = LeannBuilder(backend_name=BACKEND_TO_TEST, graph_degree=32, complexity=64)
-
-    documents = load_sample_documents()
-    print(f"Loaded {len(documents)} sample documents.")
-    for doc in documents:
-        builder.add_text(doc["content"], metadata={"title": doc["title"]})
-
-    builder.build_index(INDEX_PATH)
-    print("\nIndex built!")
-
-    # --- 2. Basic search demo ---
-    print(f"\n[PHASE 2] Basic search using '{BACKEND_TO_TEST}' backend...")
-    searcher = LeannSearcher(index_path=INDEX_PATH)
-
-    query = "What is machine learning?"
-    print(f"\nQuery: '{query}'")
-
-    print("\n--- Basic search mode (PQ computation) ---")
-    start_time = time.time()
-    results = searcher.search(query, top_k=2)
-    basic_time = time.time() - start_time
-
-    print(f"⏱️ Basic search time: {basic_time:.3f} seconds")
-    print(">>> Basic search results <<<")
-    for i, res in enumerate(results, 1):
-        print(
-            f"  {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}"
-        )
-
-    # --- 3. 
Recompute search demo --- - print("\n[PHASE 3] Recompute search using embedding server...") - - print("\n--- Recompute search mode (get real embeddings via network) ---") - - # Configure recompute parameters - recompute_params = { - "recompute_beighbor_embeddings": True, # Enable network recomputation - "USE_DEFERRED_FETCH": False, # Don't use deferred fetch - "skip_search_reorder": True, # Skip search reordering - "dedup_node_dis": True, # Enable node distance deduplication - "prune_ratio": 0.1, # Pruning ratio 10% - "batch_recompute": False, # Don't use batch recomputation - "global_pruning": False, # Don't use global pruning - "zmq_port": 5555, # ZMQ port - "embedding_model": "sentence-transformers/all-mpnet-base-v2", - } - - print("Recompute parameter configuration:") - for key, value in recompute_params.items(): - print(f" {key}: {value}") - - print("\n๐ Executing Recompute search...") - try: - start_time = time.time() - recompute_results = searcher.search(query, top_k=2, **recompute_params) - recompute_time = time.time() - start_time - - print(f"โฑ๏ธ Recompute search time: {recompute_time:.3f} seconds") - print(">>> Recompute search results <<<") - for i, res in enumerate(recompute_results, 1): - print( - f" {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}" - ) - - # Compare results - print("\n--- Result comparison ---") - print(f"Basic search time: {basic_time:.3f} seconds") - print(f"Recompute time: {recompute_time:.3f} seconds") - - print("\nBasic search vs Recompute results:") - for i in range(min(len(results), len(recompute_results))): - basic_score = results[i].score - recompute_score = recompute_results[i].score - score_diff = abs(basic_score - recompute_score) - print( - f" Position {i + 1}: PQ={basic_score:.4f}, Recompute={recompute_score:.4f}, Difference={score_diff:.4f}" - ) - - if recompute_time > basic_time: - print("โ Recompute mode working correctly (more accurate but slower)") - else: - print("i๏ธ Recompute time is unusually fast, network recomputation may not be enabled") - - except Exception as e: - print(f"โ Recompute search failed: {e}") - print("This usually indicates an embedding server connection issue") - - # --- 4. 
Chat demo --- - print("\n[PHASE 4] Starting chat session...") - chat = LeannChat(index_path=INDEX_PATH) - chat_response = chat.ask(query) - print(f"You: {query}") - print(f"Leann: {chat_response}") - - print("\n==========================================================") - print("โ Demo finished successfully!") - print("==========================================================") - - -if __name__ == "__main__": - main() diff --git a/examples/google_history_reader_leann.py b/examples/google_history_reader_leann.py deleted file mode 100644 index 62781a4..0000000 --- a/examples/google_history_reader_leann.py +++ /dev/null @@ -1,362 +0,0 @@ -import argparse -import asyncio -import os - -try: - import dotenv - - dotenv.load_dotenv() -except ModuleNotFoundError: - # python-dotenv is not installed; skip loading environment variables - dotenv = None -from pathlib import Path - -from leann.api import LeannBuilder, LeannChat -from llama_index.core.node_parser import SentenceSplitter - -# dotenv.load_dotenv() # handled above if python-dotenv is available - -# Default Chrome profile path -DEFAULT_CHROME_PROFILE = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default") - - -def create_leann_index_from_multiple_chrome_profiles( - profile_dirs: list[Path], - index_path: str = "chrome_history_index.leann", - max_count: int = -1, - embedding_model: str = "facebook/contriever", - embedding_mode: str = "sentence-transformers", -): - """ - Create LEANN index from multiple Chrome profile data sources. - - Args: - profile_dirs: List of Path objects pointing to Chrome profile directories - index_path: Path to save the LEANN index - max_count: Maximum number of history entries to process per profile - embedding_model: The embedding model to use - embedding_mode: The embedding backend mode - """ - print("Creating LEANN index from multiple Chrome profile data sources...") - - # Load documents using ChromeHistoryReader from history_data - from history_data.history import ChromeHistoryReader - - reader = ChromeHistoryReader() - - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - all_documents = [] - total_processed = 0 - - # Process each Chrome profile directory - for i, profile_dir in enumerate(profile_dirs): - print(f"\nProcessing Chrome profile {i + 1}/{len(profile_dirs)}: {profile_dir}") - - try: - documents = reader.load_data( - chrome_profile_path=str(profile_dir), max_count=max_count - ) - if documents: - print(f"Loaded {len(documents)} history documents from {profile_dir}") - all_documents.extend(documents) - total_processed += len(documents) - - # Check if we've reached the max count - if max_count > 0 and total_processed >= max_count: - print(f"Reached max count of {max_count} documents") - break - else: - print(f"No documents loaded from {profile_dir}") - except Exception as e: - print(f"Error processing {profile_dir}: {e}") - continue - - if not all_documents: - print("No documents loaded from any source. Exiting.") - # highlight info that you need to close all chrome browser before running this script and high light the instruction!! 
- print( - "\033[91mYou need to close or quit all chrome browser before running this script\033[0m" - ) - return None - - print( - f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles" - ) - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in all_documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - text = node.get_content() - # text = '[Title] ' + doc.metadata["title"] + '\n' + text - all_texts.append(text) - - print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents") - - # Create LEANN index directory - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric - builder = LeannBuilder( - backend_name="hnsw", - embedding_model=embedding_model, - embedding_mode=embedding_mode, - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} history chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -def create_leann_index( - profile_path: str | None = None, - index_path: str = "chrome_history_index.leann", - max_count: int = 1000, - embedding_model: str = "facebook/contriever", - embedding_mode: str = "sentence-transformers", -): - """ - Create LEANN index from Chrome history data. - - Args: - profile_path: Path to the Chrome profile directory (optional, uses default if None) - index_path: Path to save the LEANN index - max_count: Maximum number of history entries to process - embedding_model: The embedding model to use - embedding_mode: The embedding backend mode - """ - print("Creating LEANN index from Chrome history data...") - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Load documents using ChromeHistoryReader from history_data - from history_data.history import ChromeHistoryReader - - reader = ChromeHistoryReader() - - documents = reader.load_data(chrome_profile_path=profile_path, max_count=max_count) - - if not documents: - print("No documents loaded. 
Exiting.") - return None - - print(f"Loaded {len(documents)} history documents") - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - all_texts.append(node.get_content()) - - print(f"Created {len(all_texts)} text chunks from {len(documents)} documents") - - # Create LEANN index directory - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - # LeannBuilder will automatically detect normalized embeddings and set appropriate distance metric - builder = LeannBuilder( - backend_name="hnsw", - embedding_model=embedding_model, - embedding_mode=embedding_mode, - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} history chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -async def query_leann_index(index_path: str, query: str): - """ - Query the LEANN index. - - Args: - index_path: Path to the LEANN index - query: The query string - """ - print("\n[PHASE 2] Starting Leann chat session...") - chat = LeannChat(index_path=index_path) - - print(f"You: {query}") - chat_response = chat.ask( - query, - top_k=10, - recompute_beighbor_embeddings=True, - complexity=32, - beam_width=1, - llm_config={ - "type": "openai", - "model": "gpt-4o", - "api_key": os.getenv("OPENAI_API_KEY"), - }, - llm_kwargs={"temperature": 0.0, "max_tokens": 1000}, - ) - - print(f"Leann chat response: \033[36m{chat_response}\033[0m") - - -async def main(): - # Parse command line arguments - parser = argparse.ArgumentParser( - description="LEANN Chrome History Reader - Create and query browser history index" - ) - parser.add_argument( - "--chrome-profile", - type=str, - default=DEFAULT_CHROME_PROFILE, - help=f"Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this", - ) - parser.add_argument( - "--index-dir", - type=str, - default="./google_history_index", - help="Directory to store the LEANN index (default: ./chrome_history_index_leann_test)", - ) - parser.add_argument( - "--max-entries", - type=int, - default=1000, - help="Maximum number of history entries to process (default: 1000)", - ) - parser.add_argument( - "--query", - type=str, - default=None, - help="Single query to run (default: runs example queries)", - ) - parser.add_argument( - "--auto-find-profiles", - action="store_true", - default=True, - help="Automatically find all Chrome profiles (default: True)", - ) - parser.add_argument( - "--embedding-model", - type=str, - default="facebook/contriever", - help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small')", - ) - parser.add_argument( - "--embedding-mode", - type=str, - default="sentence-transformers", - choices=["sentence-transformers", "openai", "mlx"], - help="The embedding backend mode", - ) - parser.add_argument( - "--use-existing-index", - action="store_true", - 
help="Use existing index without rebuilding", - ) - - args = parser.parse_args() - - INDEX_DIR = Path(args.index_dir) - INDEX_PATH = str(INDEX_DIR / "chrome_history.leann") - - print(f"Using Chrome profile: {args.chrome_profile}") - print(f"Index directory: {INDEX_DIR}") - print(f"Max entries: {args.max_entries}") - - if args.use_existing_index: - # Use existing index without rebuilding - if not Path(INDEX_PATH).exists(): - print(f"Error: Index file not found at {INDEX_PATH}") - return - print(f"Using existing index at {INDEX_PATH}") - index_path = INDEX_PATH - else: - # Find Chrome profile directories - from history_data.history import ChromeHistoryReader - - if args.auto_find_profiles: - profile_dirs = ChromeHistoryReader.find_chrome_profiles() - if not profile_dirs: - print("No Chrome profiles found automatically. Exiting.") - return - else: - # Use single specified profile - profile_path = Path(args.chrome_profile) - if not profile_path.exists(): - print(f"Chrome profile not found: {profile_path}") - return - profile_dirs = [profile_path] - - # Create or load the LEANN index from all sources - index_path = create_leann_index_from_multiple_chrome_profiles( - profile_dirs, INDEX_PATH, args.max_entries, args.embedding_model, args.embedding_mode - ) - - if index_path: - if args.query: - # Run single query - await query_leann_index(index_path, args.query) - else: - # Example queries - queries = [ - "What websites did I visit about machine learning?", - "Find my search history about programming", - ] - - for query in queries: - print("\n" + "=" * 60) - await query_leann_index(index_path, query) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/mail_reader_leann.py b/examples/mail_reader_leann.py deleted file mode 100644 index 6aa7536..0000000 --- a/examples/mail_reader_leann.py +++ /dev/null @@ -1,342 +0,0 @@ -import argparse -import asyncio -import os -import sys -from pathlib import Path - -import dotenv - -# Add the project root to Python path so we can import from examples -project_root = Path(__file__).parent.parent -sys.path.insert(0, str(project_root)) - -from leann.api import LeannBuilder, LeannChat -from llama_index.core.node_parser import SentenceSplitter - -dotenv.load_dotenv() - - -# Auto-detect user's mail path -def get_mail_path(): - """Get the mail path for the current user""" - home_dir = os.path.expanduser("~") - return os.path.join(home_dir, "Library", "Mail") - - -# Default mail path for macOS -DEFAULT_MAIL_PATH = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data" - - -def create_leann_index_from_multiple_sources( - messages_dirs: list[Path], - index_path: str = "mail_index.leann", - max_count: int = -1, - include_html: bool = False, - embedding_model: str = "facebook/contriever", -): - """ - Create LEANN index from multiple mail data sources. 
- - Args: - messages_dirs: List of Path objects pointing to Messages directories - index_path: Path to save the LEANN index - max_count: Maximum number of emails to process per directory - include_html: Whether to include HTML content in email processing - """ - print("Creating LEANN index from multiple mail data sources...") - - # Load documents using EmlxReader from LEANN_email_reader - from examples.email_data.LEANN_email_reader import EmlxReader - - reader = EmlxReader(include_html=include_html) - # from email_data.email import EmlxMboxReader - # from pathlib import Path - # reader = EmlxMboxReader() - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - all_documents = [] - total_processed = 0 - - # Process each Messages directory - for i, messages_dir in enumerate(messages_dirs): - print(f"\nProcessing Messages directory {i + 1}/{len(messages_dirs)}: {messages_dir}") - - try: - documents = reader.load_data(messages_dir) - if documents: - print(f"Loaded {len(documents)} email documents from {messages_dir}") - all_documents.extend(documents) - total_processed += len(documents) - - # Check if we've reached the max count - if max_count > 0 and total_processed >= max_count: - print(f"Reached max count of {max_count} documents") - break - else: - print(f"No documents loaded from {messages_dir}") - except Exception as e: - print(f"Error processing {messages_dir}: {e}") - continue - - if not all_documents: - print("No documents loaded from any source. Exiting.") - return None - - print( - f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories and starting to split them into chunks" - ) - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in all_documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - text = node.get_content() - # text = '[subject] ' + doc.metadata["subject"] + '\n' + text - all_texts.append(text) - - print( - f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks" - ) - - # Create LEANN index directory - - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - builder = LeannBuilder( - backend_name="hnsw", - embedding_model=embedding_model, - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} email chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -def create_leann_index( - mail_path: str, - index_path: str = "mail_index.leann", - max_count: int = 1000, - include_html: bool = False, - embedding_model: str = "facebook/contriever", -): - """ - Create LEANN index from mail data. 
- - Args: - mail_path: Path to the mail directory - index_path: Path to save the LEANN index - max_count: Maximum number of emails to process - include_html: Whether to include HTML content in email processing - """ - print("Creating LEANN index from mail data...") - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Load documents using EmlxReader from LEANN_email_reader - from examples.email_data.LEANN_email_reader import EmlxReader - - reader = EmlxReader(include_html=include_html) - # from email_data.email import EmlxMboxReader - # from pathlib import Path - # reader = EmlxMboxReader() - documents = reader.load_data(Path(mail_path)) - - if not documents: - print("No documents loaded. Exiting.") - return None - - print(f"Loaded {len(documents)} email documents") - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - all_texts.append(node.get_content()) - - print(f"Created {len(all_texts)} text chunks from {len(documents)} documents") - - # Create LEANN index directory - - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - builder = LeannBuilder( - backend_name="hnsw", - embedding_model=embedding_model, - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} email chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -async def query_leann_index(index_path: str, query: str): - """ - Query the LEANN index. 
- - Args: - index_path: Path to the LEANN index - query: The query string - """ - print("\n[PHASE 2] Starting Leann chat session...") - chat = LeannChat(index_path=index_path, llm_config={"type": "openai", "model": "gpt-4o"}) - - print(f"You: {query}") - import time - - time.time() - chat_response = chat.ask( - query, - top_k=20, - recompute_beighbor_embeddings=True, - complexity=32, - beam_width=1, - ) - time.time() - # print(f"Time taken: {end_time - start_time} seconds") - # highlight the answer - print(f"Leann chat response: \033[36m{chat_response}\033[0m") - - -async def main(): - # Parse command line arguments - parser = argparse.ArgumentParser(description="LEANN Mail Reader - Create and query email index") - # Remove --mail-path argument and auto-detect all Messages directories - # Remove DEFAULT_MAIL_PATH - parser.add_argument( - "--index-dir", - type=str, - default="./mail_index", - help="Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)", - ) - parser.add_argument( - "--max-emails", - type=int, - default=1000, - help="Maximum number of emails to process (-1 means all)", - ) - parser.add_argument( - "--query", - type=str, - default="Give me some funny advertisement about apple or other companies", - help="Single query to run (default: runs example queries)", - ) - parser.add_argument( - "--include-html", - action="store_true", - default=False, - help="Include HTML content in email processing (default: False)", - ) - parser.add_argument( - "--embedding-model", - type=str, - default="facebook/contriever", - help="Embedding model to use (default: facebook/contriever)", - ) - - args = parser.parse_args() - - print(f"args: {args}") - - # Automatically find all Messages directories under the current user's Mail directory - from examples.email_data.LEANN_email_reader import find_all_messages_directories - - mail_path = get_mail_path() - print(f"Searching for email data in: {mail_path}") - messages_dirs = find_all_messages_directories(mail_path) - # messages_dirs = find_all_messages_directories(DEFAULT_MAIL_PATH) - # messages_dirs = [DEFAULT_MAIL_PATH] - # messages_dirs = messages_dirs[:1] - - print("len(messages_dirs): ", len(messages_dirs)) - - if not messages_dirs: - print("No Messages directories found. 
Exiting.") - return - - INDEX_DIR = Path(args.index_dir) - INDEX_PATH = str(INDEX_DIR / "mail_documents.leann") - print(f"Index directory: {INDEX_DIR}") - print(f"Found {len(messages_dirs)} Messages directories.") - - # Create or load the LEANN index from all sources - index_path = create_leann_index_from_multiple_sources( - messages_dirs, - INDEX_PATH, - args.max_emails, - args.include_html, - args.embedding_model, - ) - - if index_path: - if args.query: - # Run single query - await query_leann_index(index_path, args.query) - else: - # Example queries - queries = [ - "Hows Berkeley Graduate Student Instructor", - "how's the icloud related advertisement saying", - "Whats the number of class recommend to take per semester for incoming EECS students", - ] - for query in queries: - print("\n" + "=" * 60) - await query_leann_index(index_path, query) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/mail_reader_llamaindex.py b/examples/mail_reader_llamaindex.py deleted file mode 100644 index cfb6b82..0000000 --- a/examples/mail_reader_llamaindex.py +++ /dev/null @@ -1,135 +0,0 @@ -import argparse -import os -import sys -from pathlib import Path - -# Add the project root to Python path so we can import from examples -project_root = Path(__file__).parent.parent -sys.path.insert(0, str(project_root)) - -import torch -from llama_index.core import StorageContext, VectorStoreIndex -from llama_index.core.node_parser import SentenceSplitter - -# --- EMBEDDING MODEL --- -from llama_index.embeddings.huggingface import HuggingFaceEmbedding - -# --- END EMBEDDING MODEL --- -# Import EmlxReader from the new module -from examples.email_data.LEANN_email_reader import EmlxReader - - -def create_and_save_index( - mail_path: str, - save_dir: str = "mail_index_embedded", - max_count: int = 1000, - include_html: bool = False, -): - print("Creating index from mail data with embedded metadata...") - documents = EmlxReader(include_html=include_html).load_data(mail_path, max_count=max_count) - if not documents: - print("No documents loaded. 
Exiting.") - return None - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) - # Use facebook/contriever as the embedder - embed_model = HuggingFaceEmbedding(model_name="facebook/contriever") - # set on device - - if torch.cuda.is_available(): - embed_model._model.to("cuda") - # set mps - elif torch.backends.mps.is_available(): - embed_model._model.to("mps") - else: - embed_model._model.to("cpu") - index = VectorStoreIndex.from_documents( - documents, transformations=[text_splitter], embed_model=embed_model - ) - os.makedirs(save_dir, exist_ok=True) - index.storage_context.persist(persist_dir=save_dir) - print(f"Index saved to {save_dir}") - return index - - -def load_index(save_dir: str = "mail_index_embedded"): - try: - storage_context = StorageContext.from_defaults(persist_dir=save_dir) - index = VectorStoreIndex.from_vector_store( - storage_context.vector_store, storage_context=storage_context - ) - print(f"Index loaded from {save_dir}") - return index - except Exception as e: - print(f"Error loading index: {e}") - return None - - -def query_index(index, query: str): - if index is None: - print("No index available for querying.") - return - query_engine = index.as_query_engine() - response = query_engine.query(query) - print(f"Query: {query}") - print(f"Response: {response}") - - -def main(): - # Parse command line arguments - parser = argparse.ArgumentParser( - description="LlamaIndex Mail Reader - Create and query email index" - ) - parser.add_argument( - "--mail-path", - type=str, - default="/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages", - help="Path to mail data directory", - ) - parser.add_argument( - "--save-dir", - type=str, - default="mail_index_embedded", - help="Directory to store the index (default: mail_index_embedded)", - ) - parser.add_argument( - "--max-emails", - type=int, - default=10000, - help="Maximum number of emails to process", - ) - parser.add_argument( - "--include-html", - action="store_true", - default=False, - help="Include HTML content in email processing (default: False)", - ) - - args = parser.parse_args() - - mail_path = args.mail_path - save_dir = args.save_dir - - if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")): - print("Loading existing index...") - index = load_index(save_dir) - else: - print("Creating new index...") - index = create_and_save_index( - mail_path, - save_dir, - max_count=args.max_emails, - include_html=args.include_html, - ) - if index: - queries = [ - "Hows Berkeley Graduate Student Instructor", - "how's the icloud related advertisement saying", - "Whats the number of class recommend to take per semester for incoming EECS students", - ] - for query in queries: - print("\n" + "=" * 50) - query_index(index, query) - - -if __name__ == "__main__": - main() diff --git a/examples/main_cli_example.py b/examples/main_cli_example.py deleted file mode 100644 index 9b65725..0000000 --- a/examples/main_cli_example.py +++ /dev/null @@ -1,146 +0,0 @@ -import argparse -import asyncio -from pathlib import Path - -import dotenv -from leann.api import LeannBuilder, LeannChat -from llama_index.core import SimpleDirectoryReader -from llama_index.core.node_parser import SentenceSplitter - -dotenv.load_dotenv() - - -async def main(args): - INDEX_DIR = Path(args.index_dir) - INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann") - - if not INDEX_DIR.exists(): - node_parser = SentenceSplitter( - 
chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n" - ) - - print("Loading documents...") - documents = SimpleDirectoryReader( - args.data_dir, - recursive=True, - encoding="utf-8", - required_exts=[".pdf", ".txt", ".md"], - ).load_data(show_progress=True) - print("Documents loaded.") - all_texts = [] - for doc in documents: - nodes = node_parser.get_nodes_from_documents([doc]) - if nodes: - all_texts.extend(node.get_content() for node in nodes) - - print("--- Index directory not found, building new index ---") - - print("\n[PHASE 1] Building Leann index...") - - # LeannBuilder now automatically detects normalized embeddings and sets appropriate distance metric - print(f"Using {args.embedding_model} with {args.embedding_mode} mode") - - # Use HNSW backend for better macOS compatibility - builder = LeannBuilder( - backend_name="hnsw", - embedding_model=args.embedding_model, - embedding_mode=args.embedding_mode, - # distance_metric is automatically set based on embedding model - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Loaded {len(all_texts)} text chunks from documents.") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(INDEX_PATH) - print(f"\nLeann index built at {INDEX_PATH}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - print("\n[PHASE 2] Starting Leann chat session...") - - # Build llm_config based on command line arguments - if args.llm == "simulated": - llm_config = {"type": "simulated"} - elif args.llm == "ollama": - llm_config = {"type": "ollama", "model": args.model, "host": args.host} - elif args.llm == "hf": - llm_config = {"type": "hf", "model": args.model} - elif args.llm == "openai": - llm_config = {"type": "openai", "model": args.model} - else: - raise ValueError(f"Unknown LLM type: {args.llm}") - - print(f"Using LLM: {args.llm} with model: {args.model if args.llm != 'simulated' else 'N/A'}") - - chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config) - # query = ( - # "ไปไนๆฏ็ๅคๅคงๆจกๅไปฅๅ็ๅคๅผๅ่ฟ็จไธญ้ๅฐไบไปไน้ดๆ้ข,ไปปๅกไปคไธ่ฌๅจไปไนๅๅธ้ขๅ" - # ) - query = args.query - - print(f"You: {query}") - chat_response = chat.ask(query, top_k=20, recompute_embeddings=True, complexity=32) - print(f"Leann chat response: \033[36m{chat_response}\033[0m") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run Leann Chat with various LLM backends.") - parser.add_argument( - "--llm", - type=str, - default="openai", - choices=["simulated", "ollama", "hf", "openai"], - help="The LLM backend to use.", - ) - parser.add_argument( - "--model", - type=str, - default="gpt-4o", - help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).", - ) - parser.add_argument( - "--embedding-model", - type=str, - default="facebook/contriever", - help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small').", - ) - parser.add_argument( - "--embedding-mode", - type=str, - default="sentence-transformers", - choices=["sentence-transformers", "openai", "mlx"], - help="The embedding backend mode.", - ) - parser.add_argument( - "--host", - type=str, - default="http://localhost:11434", - help="The host for the Ollama API.", - ) - parser.add_argument( - "--index-dir", - type=str, - default="./test_doc_files", - help="Directory where the Leann index will be stored.", - ) - parser.add_argument( - "--data-dir", - 
type=str, - default="examples/data", - help="Directory containing documents to index (PDF, TXT, MD files).", - ) - parser.add_argument( - "--query", - type=str, - default="Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?", - help="The query to ask the Leann chat system.", - ) - args = parser.parse_args() - - asyncio.run(main(args)) diff --git a/test/build_mlx_index.py b/examples/mlx_demo.py similarity index 100% rename from test/build_mlx_index.py rename to examples/mlx_demo.py diff --git a/examples/multi_vector_aggregator.py b/examples/multi_vector_aggregator.py deleted file mode 100644 index 3ec376a..0000000 --- a/examples/multi_vector_aggregator.py +++ /dev/null @@ -1,360 +0,0 @@ -#!/usr/bin/env python3 -""" -Multi-Vector Aggregator for Fat Embeddings -========================================== - -This module implements aggregation strategies for multi-vector embeddings, -similar to ColPali's approach where multiple patch vectors represent a single document. - -Key features: -- MaxSim aggregation (take maximum similarity across patches) -- Voting-based aggregation (count patch matches) -- Weighted aggregation (attention-score weighted) -- Spatial clustering of matching patches -- Document-level result consolidation -""" - -from collections import defaultdict -from dataclasses import dataclass -from typing import Any - -import numpy as np - - -@dataclass -class PatchResult: - """Represents a single patch search result.""" - - patch_id: int - image_name: str - image_path: str - coordinates: tuple[int, int, int, int] # (x1, y1, x2, y2) - score: float - attention_score: float - scale: float - metadata: dict[str, Any] - - -@dataclass -class AggregatedResult: - """Represents an aggregated document-level result.""" - - image_name: str - image_path: str - doc_score: float - patch_count: int - best_patch: PatchResult - all_patches: list[PatchResult] - aggregation_method: str - spatial_clusters: list[list[PatchResult]] | None = None - - -class MultiVectorAggregator: - """ - Aggregates multiple patch-level results into document-level results. - """ - - def __init__( - self, - aggregation_method: str = "maxsim", - spatial_clustering: bool = True, - cluster_distance_threshold: float = 100.0, - ): - """ - Initialize the aggregator. - - Args: - aggregation_method: "maxsim", "voting", "weighted", or "mean" - spatial_clustering: Whether to cluster spatially close patches - cluster_distance_threshold: Distance threshold for spatial clustering - """ - self.aggregation_method = aggregation_method - self.spatial_clustering = spatial_clustering - self.cluster_distance_threshold = cluster_distance_threshold - - def aggregate_results( - self, search_results: list[dict[str, Any]], top_k: int = 10 - ) -> list[AggregatedResult]: - """ - Aggregate patch-level search results into document-level results. 
- - Args: - search_results: List of search results from LeannSearcher - top_k: Number of top documents to return - - Returns: - List of aggregated document results - """ - # Group results by image - image_groups = defaultdict(list) - - for result in search_results: - metadata = result.metadata - if "image_name" in metadata and "patch_id" in metadata: - patch_result = PatchResult( - patch_id=metadata["patch_id"], - image_name=metadata["image_name"], - image_path=metadata["image_path"], - coordinates=tuple(metadata["coordinates"]), - score=result.score, - attention_score=metadata.get("attention_score", 0.0), - scale=metadata.get("scale", 1.0), - metadata=metadata, - ) - image_groups[metadata["image_name"]].append(patch_result) - - # Aggregate each image group - aggregated_results = [] - for image_name, patches in image_groups.items(): - if len(patches) == 0: - continue - - agg_result = self._aggregate_image_patches(image_name, patches) - aggregated_results.append(agg_result) - - # Sort by aggregated score and return top-k - aggregated_results.sort(key=lambda x: x.doc_score, reverse=True) - return aggregated_results[:top_k] - - def _aggregate_image_patches( - self, image_name: str, patches: list[PatchResult] - ) -> AggregatedResult: - """Aggregate patches for a single image.""" - - if self.aggregation_method == "maxsim": - doc_score = max(patch.score for patch in patches) - best_patch = max(patches, key=lambda p: p.score) - - elif self.aggregation_method == "voting": - # Count patches above threshold - threshold = np.percentile([p.score for p in patches], 75) - doc_score = sum(1 for patch in patches if patch.score >= threshold) - best_patch = max(patches, key=lambda p: p.score) - - elif self.aggregation_method == "weighted": - # Weight by attention scores - total_weighted_score = sum(p.score * p.attention_score for p in patches) - total_weights = sum(p.attention_score for p in patches) - doc_score = total_weighted_score / max(total_weights, 1e-8) - best_patch = max(patches, key=lambda p: p.score * p.attention_score) - - elif self.aggregation_method == "mean": - doc_score = np.mean([patch.score for patch in patches]) - best_patch = max(patches, key=lambda p: p.score) - - else: - raise ValueError(f"Unknown aggregation method: {self.aggregation_method}") - - # Spatial clustering if enabled - spatial_clusters = None - if self.spatial_clustering: - spatial_clusters = self._cluster_patches_spatially(patches) - - return AggregatedResult( - image_name=image_name, - image_path=patches[0].image_path, - doc_score=float(doc_score), - patch_count=len(patches), - best_patch=best_patch, - all_patches=sorted(patches, key=lambda p: p.score, reverse=True), - aggregation_method=self.aggregation_method, - spatial_clusters=spatial_clusters, - ) - - def _cluster_patches_spatially(self, patches: list[PatchResult]) -> list[list[PatchResult]]: - """Cluster patches that are spatially close to each other.""" - if len(patches) <= 1: - return [patches] - - clusters = [] - remaining_patches = patches.copy() - - while remaining_patches: - # Start new cluster with highest scoring remaining patch - seed_patch = max(remaining_patches, key=lambda p: p.score) - current_cluster = [seed_patch] - remaining_patches.remove(seed_patch) - - # Add nearby patches to cluster - added_to_cluster = True - while added_to_cluster: - added_to_cluster = False - for patch in remaining_patches.copy(): - if self._is_patch_nearby(patch, current_cluster): - current_cluster.append(patch) - remaining_patches.remove(patch) - added_to_cluster = True - - 
clusters.append(current_cluster) - - return sorted(clusters, key=lambda cluster: max(p.score for p in cluster), reverse=True) - - def _is_patch_nearby(self, patch: PatchResult, cluster: list[PatchResult]) -> bool: - """Check if a patch is spatially close to any patch in the cluster.""" - patch_center = self._get_patch_center(patch.coordinates) - - for cluster_patch in cluster: - cluster_center = self._get_patch_center(cluster_patch.coordinates) - distance = np.sqrt( - (patch_center[0] - cluster_center[0]) ** 2 - + (patch_center[1] - cluster_center[1]) ** 2 - ) - - if distance <= self.cluster_distance_threshold: - return True - - return False - - def _get_patch_center(self, coordinates: tuple[int, int, int, int]) -> tuple[float, float]: - """Get center point of a patch.""" - x1, y1, x2, y2 = coordinates - return ((x1 + x2) / 2, (y1 + y2) / 2) - - def print_aggregated_results( - self, results: list[AggregatedResult], max_patches_per_doc: int = 3 - ): - """Pretty print aggregated results.""" - print(f"\n๐ Aggregated Results (method: {self.aggregation_method})") - print("=" * 80) - - for i, result in enumerate(results): - print(f"\n{i + 1}. {result.image_name}") - print(f" Doc Score: {result.doc_score:.4f} | Patches: {result.patch_count}") - print(f" Path: {result.image_path}") - - # Show best patch - best = result.best_patch - print( - f" ๐ Best Patch: #{best.patch_id} at {best.coordinates} (score: {best.score:.4f})" - ) - - # Show top patches - print(" ๐ Top Patches:") - for j, patch in enumerate(result.all_patches[:max_patches_per_doc]): - print( - f" {j + 1}. Patch #{patch.patch_id}: {patch.score:.4f} at {patch.coordinates}" - ) - - # Show spatial clusters if available - if result.spatial_clusters and len(result.spatial_clusters) > 1: - print(f" ๐๏ธ Spatial Clusters: {len(result.spatial_clusters)}") - for j, cluster in enumerate(result.spatial_clusters[:2]): # Show top 2 clusters - cluster_score = max(p.score for p in cluster) - print( - f" Cluster {j + 1}: {len(cluster)} patches (best: {cluster_score:.4f})" - ) - - -def demo_aggregation(): - """Demonstrate the multi-vector aggregation functionality.""" - print("=== Multi-Vector Aggregation Demo ===") - - # Simulate some patch-level search results - # In real usage, these would come from LeannSearcher.search() - - class MockResult: - def __init__(self, score, metadata): - self.score = score - self.metadata = metadata - - # Simulate results for 2 images with multiple patches each - mock_results = [ - # Image 1: cats_and_kitchen.jpg - 4 patches - MockResult( - 0.85, - { - "image_name": "cats_and_kitchen.jpg", - "image_path": "/path/to/cats_and_kitchen.jpg", - "patch_id": 3, - "coordinates": [100, 50, 224, 174], # Kitchen area - "attention_score": 0.92, - "scale": 1.0, - }, - ), - MockResult( - 0.78, - { - "image_name": "cats_and_kitchen.jpg", - "image_path": "/path/to/cats_and_kitchen.jpg", - "patch_id": 7, - "coordinates": [200, 300, 324, 424], # Cat area - "attention_score": 0.88, - "scale": 1.0, - }, - ), - MockResult( - 0.72, - { - "image_name": "cats_and_kitchen.jpg", - "image_path": "/path/to/cats_and_kitchen.jpg", - "patch_id": 12, - "coordinates": [150, 100, 274, 224], # Appliances - "attention_score": 0.75, - "scale": 1.0, - }, - ), - MockResult( - 0.65, - { - "image_name": "cats_and_kitchen.jpg", - "image_path": "/path/to/cats_and_kitchen.jpg", - "patch_id": 15, - "coordinates": [50, 250, 174, 374], # Furniture - "attention_score": 0.70, - "scale": 1.0, - }, - ), - # Image 2: city_street.jpg - 3 patches - MockResult( - 0.68, - { - 
"image_name": "city_street.jpg", - "image_path": "/path/to/city_street.jpg", - "patch_id": 2, - "coordinates": [300, 100, 424, 224], # Buildings - "attention_score": 0.80, - "scale": 1.0, - }, - ), - MockResult( - 0.62, - { - "image_name": "city_street.jpg", - "image_path": "/path/to/city_street.jpg", - "patch_id": 8, - "coordinates": [100, 350, 224, 474], # Street level - "attention_score": 0.75, - "scale": 1.0, - }, - ), - MockResult( - 0.55, - { - "image_name": "city_street.jpg", - "image_path": "/path/to/city_street.jpg", - "patch_id": 11, - "coordinates": [400, 200, 524, 324], # Sky area - "attention_score": 0.60, - "scale": 1.0, - }, - ), - ] - - # Test different aggregation methods - methods = ["maxsim", "voting", "weighted", "mean"] - - for method in methods: - print(f"\n{'=' * 20} {method.upper()} AGGREGATION {'=' * 20}") - - aggregator = MultiVectorAggregator( - aggregation_method=method, - spatial_clustering=True, - cluster_distance_threshold=100.0, - ) - - aggregated = aggregator.aggregate_results(mock_results, top_k=5) - aggregator.print_aggregated_results(aggregated) - - -if __name__ == "__main__": - demo_aggregation() diff --git a/examples/openai_hnsw_example.py b/examples/openai_hnsw_example.py deleted file mode 100644 index 9dcbbf8..0000000 --- a/examples/openai_hnsw_example.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python3 -""" -OpenAI Embedding Example - -Complete example showing how to build and search with OpenAI embeddings using HNSW backend. -""" - -import os -from pathlib import Path - -import dotenv -from leann.api import LeannBuilder, LeannSearcher - -# Load environment variables -dotenv.load_dotenv() - - -def main(): - # Check if OpenAI API key is available - api_key = os.getenv("OPENAI_API_KEY") - if not api_key: - print("ERROR: OPENAI_API_KEY environment variable not set") - return False - - print(f"โ OpenAI API key found: {api_key[:10]}...") - - # Sample texts - sample_texts = [ - "Machine learning is a powerful technology that enables computers to learn from data.", - "Natural language processing helps computers understand and generate human language.", - "Deep learning uses neural networks with multiple layers to solve complex problems.", - "Computer vision allows machines to interpret and understand visual information.", - "Reinforcement learning trains agents to make decisions through trial and error.", - "Data science combines statistics, math, and programming to extract insights from data.", - "Artificial intelligence aims to create machines that can perform human-like tasks.", - "Python is a popular programming language used extensively in data science and AI.", - "Neural networks are inspired by the structure and function of the human brain.", - "Big data refers to extremely large datasets that require special tools to process.", - ] - - INDEX_DIR = Path("./simple_openai_test_index") - INDEX_PATH = str(INDEX_DIR / "simple_test.leann") - - print("\n=== Building Index with OpenAI Embeddings ===") - print(f"Index path: {INDEX_PATH}") - - try: - # Use proper configuration for OpenAI embeddings - builder = LeannBuilder( - backend_name="hnsw", - embedding_model="text-embedding-3-small", - embedding_mode="openai", - # HNSW settings for OpenAI embeddings - M=16, # Smaller graph degree - efConstruction=64, # Smaller construction complexity - is_compact=True, # Enable compact storage for recompute - is_recompute=True, # MUST enable for OpenAI embeddings - num_threads=1, - ) - - print(f"Adding {len(sample_texts)} texts to the index...") - for i, text in 
enumerate(sample_texts): - metadata = {"id": f"doc_{i}", "topic": "AI"} - builder.add_text(text, metadata) - - print("Building index...") - builder.build_index(INDEX_PATH) - print("โ Index built successfully!") - - except Exception as e: - print(f"โ Error building index: {e}") - import traceback - - traceback.print_exc() - return False - - print("\n=== Testing Search ===") - - try: - searcher = LeannSearcher(INDEX_PATH) - - test_queries = [ - "What is machine learning?", - "How do neural networks work?", - "Programming languages for data science", - ] - - for query in test_queries: - print(f"\n๐ Query: '{query}'") - results = searcher.search(query, top_k=3) - - print(f" Found {len(results)} results:") - for i, result in enumerate(results): - print(f" {i + 1}. Score: {result.score:.4f}") - print(f" Text: {result.text[:80]}...") - - print("\nโ Search test completed successfully!") - return True - - except Exception as e: - print(f"โ Error during search: {e}") - import traceback - - traceback.print_exc() - return False - - -if __name__ == "__main__": - success = main() - if success: - print("\n๐ Simple OpenAI index test completed successfully!") - else: - print("\n๐ฅ Simple OpenAI index test failed!") diff --git a/examples/resue_index.py b/examples/resue_index.py deleted file mode 100644 index bec55ab..0000000 --- a/examples/resue_index.py +++ /dev/null @@ -1,23 +0,0 @@ -import asyncio -from pathlib import Path - -from leann.api import LeannChat - -INDEX_DIR = Path("./test_pdf_index_huawei") -INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann") - - -async def main(): - print("\n[PHASE 2] Starting Leann chat session...") - chat = LeannChat(index_path=INDEX_PATH) - query = "What is the main idea of RL and give me 5 exapmle of classic RL algorithms?" - query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?" - # query = "ไปไนๆฏ็ๅคๅคงๆจกๅไปฅๅ็ๅคๅผๅ่ฟ็จไธญ้ๅฐไบไปไน้ดๆ้ข,ไปปๅกไปคไธ่ฌๅจไปไนๅๅธ้ขๅ" - response = chat.ask( - query, top_k=20, recompute_beighbor_embeddings=True, complexity=32, beam_width=1 - ) - print(f"\n[PHASE 2] Response: {response}") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/wechat_history_reader_leann.py b/examples/wechat_history_reader_leann.py deleted file mode 100644 index ef68c6b..0000000 --- a/examples/wechat_history_reader_leann.py +++ /dev/null @@ -1,320 +0,0 @@ -import argparse -import asyncio -import os -from pathlib import Path - -import dotenv -from leann.api import LeannBuilder, LeannChat -from llama_index.core.node_parser import SentenceSplitter - -dotenv.load_dotenv() - -# Default WeChat export directory -DEFAULT_WECHAT_EXPORT_DIR = "./wechat_export_direct" - - -def create_leann_index_from_multiple_wechat_exports( - export_dirs: list[Path], - index_path: str = "wechat_history_index.leann", - max_count: int = -1, -): - """ - Create LEANN index from multiple WeChat export data sources. 
- - Args: - export_dirs: List of Path objects pointing to WeChat export directories - index_path: Path to save the LEANN index - max_count: Maximum number of chat entries to process per export - """ - print("Creating LEANN index from multiple WeChat export data sources...") - - # Load documents using WeChatHistoryReader from history_data - from history_data.wechat_history import WeChatHistoryReader - - reader = WeChatHistoryReader() - - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - all_documents = [] - total_processed = 0 - - # Process each WeChat export directory - for i, export_dir in enumerate(export_dirs): - print(f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}") - - try: - documents = reader.load_data( - wechat_export_dir=str(export_dir), - max_count=max_count, - concatenate_messages=True, # Disable concatenation - one message per document - ) - if documents: - print(f"Loaded {len(documents)} chat documents from {export_dir}") - all_documents.extend(documents) - total_processed += len(documents) - - # Check if we've reached the max count - if max_count > 0 and total_processed >= max_count: - print(f"Reached max count of {max_count} documents") - break - else: - print(f"No documents loaded from {export_dir}") - except Exception as e: - print(f"Error processing {export_dir}: {e}") - continue - - if not all_documents: - print("No documents loaded from any source. Exiting.") - return None - - print( - f"\nTotal loaded {len(all_documents)} chat documents from {len(export_dirs)} exports and starting to split them into chunks" - ) - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=192, chunk_overlap=64) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in all_documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - text = ( - "[Contact] means the message is from: " - + doc.metadata["contact_name"] - + "\n" - + node.get_content() - ) - all_texts.append(text) - - print( - f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks" - ) - - # Create LEANN index directory - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - builder = LeannBuilder( - backend_name="hnsw", - embedding_model="Qwen/Qwen3-Embedding-0.6B", - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} chat chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -def create_leann_index( - export_dir: str | None = None, - index_path: str = "wechat_history_index.leann", - max_count: int = 1000, -): - """ - Create LEANN index from WeChat chat history data. 
- - Args: - export_dir: Path to the WeChat export directory (optional, uses default if None) - index_path: Path to save the LEANN index - max_count: Maximum number of chat entries to process - """ - print("Creating LEANN index from WeChat chat history data...") - INDEX_DIR = Path(index_path).parent - - if not INDEX_DIR.exists(): - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Load documents using WeChatHistoryReader from history_data - from history_data.wechat_history import WeChatHistoryReader - - reader = WeChatHistoryReader() - - documents = reader.load_data( - wechat_export_dir=export_dir, - max_count=max_count, - concatenate_messages=False, # Disable concatenation - one message per document - ) - - if not documents: - print("No documents loaded. Exiting.") - return None - - print(f"Loaded {len(documents)} chat documents") - - # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) - - # Convert Documents to text strings and chunk them - all_texts = [] - for doc in documents: - # Split the document into chunks - nodes = text_splitter.get_nodes_from_documents([doc]) - for node in nodes: - all_texts.append(node.get_content()) - - print(f"Created {len(all_texts)} text chunks from {len(documents)} documents") - - # Create LEANN index directory - print("--- Index directory not found, building new index ---") - INDEX_DIR.mkdir(exist_ok=True) - - print("--- Building new LEANN index ---") - - print("\n[PHASE 1] Building Leann index...") - - # Use HNSW backend for better macOS compatibility - builder = LeannBuilder( - backend_name="hnsw", - embedding_model="mlx-community/Qwen3-Embedding-0.6B-4bit-DWQ", # MLX-optimized model - graph_degree=32, - complexity=64, - is_compact=True, - is_recompute=True, - num_threads=1, # Force single-threaded mode - ) - - print(f"Adding {len(all_texts)} chat chunks to index...") - for chunk_text in all_texts: - builder.add_text(chunk_text) - - builder.build_index(index_path) - print(f"\nLEANN index built at {index_path}!") - else: - print(f"--- Using existing index at {INDEX_DIR} ---") - - return index_path - - -async def query_leann_index(index_path: str, query: str): - """ - Query the LEANN index. 
- - Args: - index_path: Path to the LEANN index - query: The query string - """ - print("\n[PHASE 2] Starting Leann chat session...") - chat = LeannChat(index_path=index_path) - - print(f"You: {query}") - chat_response = chat.ask( - query, - top_k=20, - recompute_beighbor_embeddings=True, - complexity=16, - beam_width=1, - llm_config={ - "type": "openai", - "model": "gpt-4o", - "api_key": os.getenv("OPENAI_API_KEY"), - }, - llm_kwargs={"temperature": 0.0, "max_tokens": 1000}, - ) - print(f"Leann chat response: \033[36m{chat_response}\033[0m") - - -async def main(): - """Main function with integrated WeChat export functionality.""" - - # Parse command line arguments - parser = argparse.ArgumentParser( - description="LEANN WeChat History Reader - Create and query WeChat chat history index" - ) - parser.add_argument( - "--export-dir", - type=str, - default=DEFAULT_WECHAT_EXPORT_DIR, - help=f"Directory to store WeChat exports (default: {DEFAULT_WECHAT_EXPORT_DIR})", - ) - parser.add_argument( - "--index-dir", - type=str, - default="./wechat_history_magic_test_11Debug_new", - help="Directory to store the LEANN index (default: ./wechat_history_index_leann_test)", - ) - parser.add_argument( - "--max-entries", - type=int, - default=50, - help="Maximum number of chat entries to process (default: 5000)", - ) - parser.add_argument( - "--query", - type=str, - default=None, - help="Single query to run (default: runs example queries)", - ) - parser.add_argument( - "--force-export", - action="store_true", - default=False, - help="Force re-export of WeChat data even if exports exist", - ) - - args = parser.parse_args() - - INDEX_DIR = Path(args.index_dir) - INDEX_PATH = str(INDEX_DIR / "wechat_history.leann") - - print(f"Using WeChat export directory: {args.export_dir}") - print(f"Index directory: {INDEX_DIR}") - print(f"Max entries: {args.max_entries}") - - # Initialize WeChat reader with export capabilities - from history_data.wechat_history import WeChatHistoryReader - - reader = WeChatHistoryReader() - - # Find existing exports or create new ones using the centralized method - export_dirs = reader.find_or_export_wechat_data(args.export_dir) - if not export_dirs: - print("Failed to find or export WeChat data. 
Exiting.") - return - - # Create or load the LEANN index from all sources - index_path = create_leann_index_from_multiple_wechat_exports( - export_dirs, INDEX_PATH, max_count=args.max_entries - ) - - if index_path: - if args.query: - # Run single query - await query_leann_index(index_path, args.query) - else: - # Example queries - queries = [ - "ๆๆณไนฐ้ญๆฏๅธ็บฆ็ฟฐ้็็่กฃ,็ปๆไธไบๅฏนๅบ่ๅคฉ่ฎฐๅฝ?", - ] - - for query in queries: - print("\n" + "=" * 60) - await query_leann_index(index_path, query) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 9efefde..39710df 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -459,7 +459,14 @@ class LeannSearcher: self.meta_path_str = f"{index_path}.meta.json" if not Path(self.meta_path_str).exists(): - raise FileNotFoundError(f"Leann metadata file not found at {self.meta_path_str}") + parent_dir = Path(index_path).parent + print( + f"Leann metadata file not found at {self.meta_path_str}, and you may need to rm -rf {parent_dir}" + ) + # highlight in red the filenotfound error + raise FileNotFoundError( + f"Leann metadata file not found at {self.meta_path_str}, \033[91m you may need to rm -rf {parent_dir}\033[0m" + ) with open(self.meta_path_str, encoding="utf-8") as f: self.meta_data = json.load(f) backend_name = self.meta_data["backend_name"] @@ -493,6 +500,16 @@ class LeannSearcher: logger.info(f" Top_k: {top_k}") logger.info(f" Additional kwargs: {kwargs}") + # Smart top_k detection and adjustment + total_docs = len(self.passage_manager.global_offset_map) + original_top_k = top_k + if top_k > total_docs: + top_k = total_docs + logger.warning( + f" โ ๏ธ Requested top_k ({original_top_k}) exceeds total documents ({total_docs})" + ) + logger.warning(f" โ Auto-adjusted top_k to {top_k} to match available documents") + zmq_port = None start_time = time.time() diff --git a/pyproject.toml b/pyproject.toml index 906593e..d3b42e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,6 +134,14 @@ dev = [ "ruff>=0.12.4", ] +[tool.lychee] +accept = ["200", "403", "429", "503"] +timeout = 20 +max_retries = 2 +exclude = ["localhost", "127.0.0.1", "example.com"] +exclude_path = [".git/", ".venv/", "__pycache__/", "third_party/"] +scheme = ["https", "http"] + [tool.pytest.ini_options] testpaths = ["tests"] python_files = ["test_*.py"] diff --git a/test/mail_reader_llamaindex.py b/test/mail_reader_llamaindex.py deleted file mode 100644 index 33fa1e9..0000000 --- a/test/mail_reader_llamaindex.py +++ /dev/null @@ -1,161 +0,0 @@ -import email -import os -from typing import Any - -from llama_index.core import Document, VectorStoreIndex -from llama_index.core.readers.base import BaseReader - - -class EmlxReader(BaseReader): - """ - Apple Mail .emlx file reader. - - Reads individual .emlx files from Apple Mail's storage format. - """ - - def __init__(self) -> None: - """Initialize.""" - pass - - def load_data(self, input_dir: str, **load_kwargs: Any) -> list[Document]: - """ - Load data from the input directory containing .emlx files. - - Args: - input_dir: Directory containing .emlx files - **load_kwargs: - max_count (int): Maximum amount of messages to read. 
- """ - docs: list[Document] = [] - max_count = load_kwargs.get("max_count", 1000) - count = 0 - - # Walk through the directory recursively - for dirpath, dirnames, filenames in os.walk(input_dir): - # Skip hidden directories - dirnames[:] = [d for d in dirnames if not d.startswith(".")] - - for filename in filenames: - if count >= max_count: - break - - if filename.endswith(".emlx"): - filepath = os.path.join(dirpath, filename) - try: - # Read the .emlx file - with open(filepath, encoding="utf-8", errors="ignore") as f: - content = f.read() - - # .emlx files have a length prefix followed by the email content - # The first line contains the length, followed by the email - lines = content.split("\n", 1) - if len(lines) >= 2: - email_content = lines[1] - - # Parse the email using Python's email module - try: - msg = email.message_from_string(email_content) - - # Extract email metadata - subject = msg.get("Subject", "No Subject") - from_addr = msg.get("From", "Unknown") - to_addr = msg.get("To", "Unknown") - date = msg.get("Date", "Unknown") - - # Extract email body - body = "" - if msg.is_multipart(): - for part in msg.walk(): - if ( - part.get_content_type() == "text/plain" - or part.get_content_type() == "text/html" - ): - body += part.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - # break - else: - body = msg.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - - # Create document content - doc_content = f""" -From: {from_addr} -To: {to_addr} -Subject: {subject} -Date: {date} - -{body} -""" - - # Create metadata - metadata = { - "file_path": filepath, - "subject": subject, - "from": from_addr, - "to": to_addr, - "date": date, - "filename": filename, - } - if count == 0: - print("--------------------------------") - print("dir path", dirpath) - print(metadata) - print(doc_content) - print("--------------------------------") - body = [] - if msg.is_multipart(): - for part in msg.walk(): - print( - "-------------------------------- get content type -------------------------------" - ) - print(part.get_content_type()) - print(part) - # body.append(part.get_payload(decode=True).decode('utf-8', errors='ignore')) - print( - "-------------------------------- get content type -------------------------------" - ) - else: - body = msg.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - print(body) - - print(body) - print("--------------------------------") - doc = Document(text=doc_content, metadata=metadata) - docs.append(doc) - count += 1 - - except Exception as e: - print(f"!!!!!!! Error parsing email from {filepath}: {e} !!!!!!!!") - continue - - except Exception as e: - print(f"!!!!!!! Error reading file !!!!!!!! 
{filepath}: {e}") - continue - - print(f"Loaded {len(docs)} email documents") - return docs - - -# Use the custom EmlxReader instead of MboxReader -documents = EmlxReader().load_data( - "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages", - max_count=1000, -) # Returns list of documents - -# Configure the index with larger chunk size to handle long metadata -from llama_index.core.node_parser import SentenceSplitter - -# Create a custom text splitter with larger chunk size -text_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=200) - -index = VectorStoreIndex.from_documents( - documents, transformations=[text_splitter] -) # Initialize index with documents - -query_engine = index.as_query_engine() -res = query_engine.query("Hows Berkeley Graduate Student Instructor") -print(res) diff --git a/test/mail_reader_save_load.py b/test/mail_reader_save_load.py deleted file mode 100644 index e7fb39f..0000000 --- a/test/mail_reader_save_load.py +++ /dev/null @@ -1,219 +0,0 @@ -import email -import os -from typing import Any - -from llama_index.core import Document, StorageContext, VectorStoreIndex -from llama_index.core.node_parser import SentenceSplitter -from llama_index.core.readers.base import BaseReader - - -class EmlxReader(BaseReader): - """ - Apple Mail .emlx file reader. - - Reads individual .emlx files from Apple Mail's storage format. - """ - - def __init__(self) -> None: - """Initialize.""" - pass - - def load_data(self, input_dir: str, **load_kwargs: Any) -> list[Document]: - """ - Load data from the input directory containing .emlx files. - - Args: - input_dir: Directory containing .emlx files - **load_kwargs: - max_count (int): Maximum amount of messages to read. 
- """ - docs: list[Document] = [] - max_count = load_kwargs.get("max_count", 1000) - count = 0 - - # Walk through the directory recursively - for dirpath, dirnames, filenames in os.walk(input_dir): - # Skip hidden directories - dirnames[:] = [d for d in dirnames if not d.startswith(".")] - - for filename in filenames: - if count >= max_count: - break - - if filename.endswith(".emlx"): - filepath = os.path.join(dirpath, filename) - try: - # Read the .emlx file - with open(filepath, encoding="utf-8", errors="ignore") as f: - content = f.read() - - # .emlx files have a length prefix followed by the email content - # The first line contains the length, followed by the email - lines = content.split("\n", 1) - if len(lines) >= 2: - email_content = lines[1] - - # Parse the email using Python's email module - try: - msg = email.message_from_string(email_content) - - # Extract email metadata - subject = msg.get("Subject", "No Subject") - from_addr = msg.get("From", "Unknown") - to_addr = msg.get("To", "Unknown") - date = msg.get("Date", "Unknown") - - # Extract email body - body = "" - if msg.is_multipart(): - for part in msg.walk(): - if part.get_content_type() == "text/plain": - body = part.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - break - else: - body = msg.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - - # Create document content - doc_content = f""" -From: {from_addr} -To: {to_addr} -Subject: {subject} -Date: {date} - -{body} -""" - - # Create metadata - metadata = { - "file_path": filepath, - "subject": subject, - "from": from_addr, - "to": to_addr, - "date": date, - "filename": filename, - } - - doc = Document(text=doc_content, metadata=metadata) - docs.append(doc) - count += 1 - - except Exception as e: - print(f"Error parsing email from {filepath}: {e}") - continue - - except Exception as e: - print(f"Error reading file {filepath}: {e}") - continue - - print(f"Loaded {len(docs)} email documents") - return docs - - -def create_and_save_index(mail_path: str, save_dir: str = "mail_index", max_count: int = 1000): - """ - Create the index from mail data and save it to disk. - - Args: - mail_path: Path to the mail directory - save_dir: Directory to save the index - max_count: Maximum number of emails to process - """ - print("Creating index from mail data...") - - # Load documents - documents = EmlxReader().load_data(mail_path, max_count=max_count) - - if not documents: - print("No documents loaded. Exiting.") - return None - - # Create text splitter - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=0) - - # Create index - index = VectorStoreIndex.from_documents(documents, transformations=[text_splitter]) - - # Save the index - os.makedirs(save_dir, exist_ok=True) - index.storage_context.persist(persist_dir=save_dir) - print(f"Index saved to {save_dir}") - - return index - - -def load_index(save_dir: str = "mail_index"): - """ - Load the saved index from disk. - - Args: - save_dir: Directory where the index is saved - - Returns: - Loaded index or None if loading fails - """ - try: - # Load storage context - storage_context = StorageContext.from_defaults(persist_dir=save_dir) - - # Load index - index = VectorStoreIndex.from_vector_store( - storage_context.vector_store, storage_context=storage_context - ) - - print(f"Index loaded from {save_dir}") - return index - - except Exception as e: - print(f"Error loading index: {e}") - return None - - -def query_index(index, query: str): - """ - Query the loaded index. 
- - Args: - index: The loaded index - query: The query string - """ - if index is None: - print("No index available for querying.") - return - - query_engine = index.as_query_engine() - response = query_engine.query(query) - print(f"Query: {query}") - print(f"Response: {response}") - - -def main(): - mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages" - save_dir = "mail_index" - - # Check if index already exists - if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")): - print("Loading existing index...") - index = load_index(save_dir) - else: - print("Creating new index...") - index = create_and_save_index(mail_path, save_dir, max_count=1000) - - if index: - # Example queries - queries = [ - "Hows Berkeley Graduate Student Instructor", - "What emails mention GSR appointments?", - "Find emails about deadlines", - ] - - for query in queries: - print("\n" + "=" * 50) - query_index(index, query) - - -if __name__ == "__main__": - main() diff --git a/test/mail_reader_small_chunks.py b/test/mail_reader_small_chunks.py deleted file mode 100644 index 50bd452..0000000 --- a/test/mail_reader_small_chunks.py +++ /dev/null @@ -1,219 +0,0 @@ -import email -import os -from typing import Any - -from llama_index.core import Document, StorageContext, VectorStoreIndex -from llama_index.core.node_parser import SentenceSplitter -from llama_index.core.readers.base import BaseReader - - -class EmlxReader(BaseReader): - """ - Apple Mail .emlx file reader with reduced metadata. - - Reads individual .emlx files from Apple Mail's storage format. - """ - - def __init__(self) -> None: - """Initialize.""" - pass - - def load_data(self, input_dir: str, **load_kwargs: Any) -> list[Document]: - """ - Load data from the input directory containing .emlx files. - - Args: - input_dir: Directory containing .emlx files - **load_kwargs: - max_count (int): Maximum amount of messages to read. 
- """ - docs: list[Document] = [] - max_count = load_kwargs.get("max_count", 1000) - count = 0 - - # Walk through the directory recursively - for dirpath, dirnames, filenames in os.walk(input_dir): - # Skip hidden directories - dirnames[:] = [d for d in dirnames if not d.startswith(".")] - - for filename in filenames: - if count >= max_count: - break - - if filename.endswith(".emlx"): - filepath = os.path.join(dirpath, filename) - try: - # Read the .emlx file - with open(filepath, encoding="utf-8", errors="ignore") as f: - content = f.read() - - # .emlx files have a length prefix followed by the email content - # The first line contains the length, followed by the email - lines = content.split("\n", 1) - if len(lines) >= 2: - email_content = lines[1] - - # Parse the email using Python's email module - try: - msg = email.message_from_string(email_content) - - # Extract email metadata - subject = msg.get("Subject", "No Subject") - from_addr = msg.get("From", "Unknown") - to_addr = msg.get("To", "Unknown") - date = msg.get("Date", "Unknown") - - # Extract email body - body = "" - if msg.is_multipart(): - for part in msg.walk(): - if part.get_content_type() == "text/plain": - body = part.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - break - else: - body = msg.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - - # Create document content with metadata embedded in text - doc_content = f""" -From: {from_addr} -To: {to_addr} -Subject: {subject} -Date: {date} - -{body} -""" - - # Create minimal metadata (only essential info) - metadata = { - "subject": subject[:50], # Truncate subject - "from": from_addr[:30], # Truncate from - "date": date[:20], # Truncate date - "filename": filename, # Keep filename - } - - doc = Document(text=doc_content, metadata=metadata) - docs.append(doc) - count += 1 - - except Exception as e: - print(f"Error parsing email from {filepath}: {e}") - continue - - except Exception as e: - print(f"Error reading file {filepath}: {e}") - continue - - print(f"Loaded {len(docs)} email documents") - return docs - - -def create_and_save_index( - mail_path: str, save_dir: str = "mail_index_small", max_count: int = 1000 -): - """ - Create the index from mail data and save it to disk. - - Args: - mail_path: Path to the mail directory - save_dir: Directory to save the index - max_count: Maximum number of emails to process - """ - print("Creating index from mail data with small chunks...") - - # Load documents - documents = EmlxReader().load_data(mail_path, max_count=max_count) - - if not documents: - print("No documents loaded. Exiting.") - return None - - # Create text splitter with small chunk size - text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50) - - # Create index - index = VectorStoreIndex.from_documents(documents, transformations=[text_splitter]) - - # Save the index - os.makedirs(save_dir, exist_ok=True) - index.storage_context.persist(persist_dir=save_dir) - print(f"Index saved to {save_dir}") - - return index - - -def load_index(save_dir: str = "mail_index_small"): - """ - Load the saved index from disk. 
- - Args: - save_dir: Directory where the index is saved - - Returns: - Loaded index or None if loading fails - """ - try: - # Load storage context - storage_context = StorageContext.from_defaults(persist_dir=save_dir) - - # Load index - index = VectorStoreIndex.from_vector_store( - storage_context.vector_store, storage_context=storage_context - ) - - print(f"Index loaded from {save_dir}") - return index - - except Exception as e: - print(f"Error loading index: {e}") - return None - - -def query_index(index, query: str): - """ - Query the loaded index. - - Args: - index: The loaded index - query: The query string - """ - if index is None: - print("No index available for querying.") - return - - query_engine = index.as_query_engine() - response = query_engine.query(query) - print(f"Query: {query}") - print(f"Response: {response}") - - -def main(): - mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages" - save_dir = "mail_index_small" - - # Check if index already exists - if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")): - print("Loading existing index...") - index = load_index(save_dir) - else: - print("Creating new index...") - index = create_and_save_index(mail_path, save_dir, max_count=1000) - - if index: - # Example queries - queries = [ - "Hows Berkeley Graduate Student Instructor", - "What emails mention GSR appointments?", - "Find emails about deadlines", - ] - - for query in queries: - print("\n" + "=" * 50) - query_index(index, query) - - -if __name__ == "__main__": - main() diff --git a/test/mail_reader_test.py b/test/mail_reader_test.py deleted file mode 100644 index f94070a..0000000 --- a/test/mail_reader_test.py +++ /dev/null @@ -1,154 +0,0 @@ -import email -import os -from typing import Any - -from llama_index.core import Document, VectorStoreIndex -from llama_index.core.readers.base import BaseReader - - -class EmlxReader(BaseReader): - """ - Apple Mail .emlx file reader. - - Reads individual .emlx files from Apple Mail's storage format. - """ - - def __init__(self) -> None: - """Initialize.""" - pass - - def load_data(self, input_dir: str, **load_kwargs: Any) -> list[Document]: - """ - Load data from the input directory containing .emlx files. - - Args: - input_dir: Directory containing .emlx files - **load_kwargs: - max_count (int): Maximum amount of messages to read. 
- """ - docs: list[Document] = [] - max_count = load_kwargs.get("max_count", 1000) - count = 0 - - # Check if directory exists and is accessible - if not os.path.exists(input_dir): - print(f"Error: Directory '{input_dir}' does not exist") - return docs - - if not os.access(input_dir, os.R_OK): - print(f"Error: Directory '{input_dir}' is not accessible (permission denied)") - print("This is likely due to macOS security restrictions on Mail app data") - return docs - - print(f"Scanning directory: {input_dir}") - - # Walk through the directory recursively - for dirpath, dirnames, filenames in os.walk(input_dir): - # Skip hidden directories - dirnames[:] = [d for d in dirnames if not d.startswith(".")] - - for filename in filenames: - if count >= max_count: - break - - if filename.endswith(".emlx"): - filepath = os.path.join(dirpath, filename) - print(f"Found .emlx file: {filepath}") - try: - # Read the .emlx file - with open(filepath, encoding="utf-8", errors="ignore") as f: - content = f.read() - - # .emlx files have a length prefix followed by the email content - # The first line contains the length, followed by the email - lines = content.split("\n", 1) - if len(lines) >= 2: - email_content = lines[1] - - # Parse the email using Python's email module - try: - msg = email.message_from_string(email_content) - - # Extract email metadata - subject = msg.get("Subject", "No Subject") - from_addr = msg.get("From", "Unknown") - to_addr = msg.get("To", "Unknown") - date = msg.get("Date", "Unknown") - - # Extract email body - body = "" - if msg.is_multipart(): - for part in msg.walk(): - if part.get_content_type() == "text/plain": - body = part.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - break - else: - body = msg.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - - # Create document content - doc_content = f""" -From: {from_addr} -To: {to_addr} -Subject: {subject} -Date: {date} - -{body} -""" - - # Create metadata - metadata = { - "file_path": filepath, - "subject": subject, - "from": from_addr, - "to": to_addr, - "date": date, - "filename": filename, - } - - doc = Document(text=doc_content, metadata=metadata) - docs.append(doc) - count += 1 - - except Exception as e: - print(f"Error parsing email from {filepath}: {e}") - continue - - except Exception as e: - print(f"Error reading file {filepath}: {e}") - continue - - print(f"Loaded {len(docs)} email documents") - return docs - - -def main(): - # Use the current directory where the sample.emlx file is located - current_dir = os.path.dirname(os.path.abspath(__file__)) - - print("Testing EmlxReader with sample .emlx file...") - print(f"Scanning directory: {current_dir}") - - # Use the custom EmlxReader - documents = EmlxReader().load_data(current_dir, max_count=1000) - - if not documents: - print("No documents loaded. 
Make sure sample.emlx exists in the examples directory.") - return - - print(f"\nSuccessfully loaded {len(documents)} document(s)") - - # Initialize index with documents - index = VectorStoreIndex.from_documents(documents) - query_engine = index.as_query_engine() - - print("\nTesting query: 'Hows Berkeley Graduate Student Instructor'") - res = query_engine.query("Hows Berkeley Graduate Student Instructor") - print(f"Response: {res}") - - -if __name__ == "__main__": - main() diff --git a/test/query_saved_index.py b/test/query_saved_index.py deleted file mode 100644 index dfd3295..0000000 --- a/test/query_saved_index.py +++ /dev/null @@ -1,105 +0,0 @@ -import os - -from llama_index.core import StorageContext, VectorStoreIndex - - -def load_index(save_dir: str = "mail_index"): - """ - Load the saved index from disk. - - Args: - save_dir: Directory where the index is saved - - Returns: - Loaded index or None if loading fails - """ - try: - # Load storage context - storage_context = StorageContext.from_defaults(persist_dir=save_dir) - - # Load index - index = VectorStoreIndex.from_vector_store( - storage_context.vector_store, storage_context=storage_context - ) - - print(f"Index loaded from {save_dir}") - return index - - except Exception as e: - print(f"Error loading index: {e}") - return None - - -def query_index(index, query: str): - """ - Query the loaded index. - - Args: - index: The loaded index - query: The query string - """ - if index is None: - print("No index available for querying.") - return - - query_engine = index.as_query_engine() - response = query_engine.query(query) - print(f"\nQuery: {query}") - print(f"Response: {response}") - - -def main(): - save_dir = "mail_index" - - # Check if index exists - if not os.path.exists(save_dir) or not os.path.exists( - os.path.join(save_dir, "vector_store.json") - ): - print(f"Index not found in {save_dir}") - print("Please run mail_reader_save_load.py first to create the index.") - return - - # Load the index - index = load_index(save_dir) - - if not index: - print("Failed to load index.") - return - - print("\n" + "=" * 60) - print("Email Query Interface") - print("=" * 60) - print("Type 'quit' to exit") - print("Type 'help' for example queries") - print("=" * 60) - - # Interactive query loop - while True: - try: - query = input("\nEnter your query: ").strip() - - if query.lower() == "quit": - print("Goodbye!") - break - elif query.lower() == "help": - print("\nExample queries:") - print("- Hows Berkeley Graduate Student Instructor") - print("- What emails mention GSR appointments?") - print("- Find emails about deadlines") - print("- Search for emails from specific sender") - print("- Find emails about meetings") - continue - elif not query: - continue - - query_index(index, query) - - except KeyboardInterrupt: - print("\nGoodbye!") - break - except Exception as e: - print(f"Error processing query: {e}") - - -if __name__ == "__main__": - main() diff --git a/test/sanity_checks/debug_zmq_issue.py b/test/sanity_checks/debug_zmq_issue.py deleted file mode 100644 index d1bd156..0000000 --- a/test/sanity_checks/debug_zmq_issue.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 -""" -Debug script to test ZMQ communication with the exact same setup as main_cli_example.py -""" - -import sys -import time - -import zmq - -sys.path.append("packages/leann-backend-diskann") -from leann_backend_diskann import embedding_pb2 - - -def test_zmq_with_same_model(): - print("=== Testing ZMQ with same model as main_cli_example.py ===") - - # Test the 
exact same model that main_cli_example.py uses - model_name = "sentence-transformers/all-mpnet-base-v2" - - # Start server with the same model - import subprocess - - server_cmd = [ - sys.executable, - "-m", - "packages.leann-backend-diskann.leann_backend_diskann.embedding_server", - "--zmq-port", - "5556", # Use different port to avoid conflicts - "--model-name", - model_name, - ] - - print(f"Starting server with command: {' '.join(server_cmd)}") - server_process = subprocess.Popen( - server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True - ) - - # Wait for server to start - print("Waiting for server to start...") - time.sleep(10) - - # Check if server is running - if server_process.poll() is not None: - stdout, stderr = server_process.communicate() - print(f"Server failed to start. stdout: {stdout}") - print(f"Server failed to start. stderr: {stderr}") - return False - - print(f"Server started with PID: {server_process.pid}") - - try: - # Test client - context = zmq.Context() - socket = context.socket(zmq.REQ) - socket.connect("tcp://127.0.0.1:5556") - socket.setsockopt(zmq.RCVTIMEO, 30000) # 30 second timeout like C++ - socket.setsockopt(zmq.SNDTIMEO, 30000) - - # Create request with same format as C++ - request = embedding_pb2.NodeEmbeddingRequest() - request.node_ids.extend([0, 1, 2, 3, 4]) # Test with some node IDs - - print(f"Sending request with {len(request.node_ids)} node IDs...") - start_time = time.time() - - # Send request - socket.send(request.SerializeToString()) - - # Receive response - response_data = socket.recv() - end_time = time.time() - - print(f"Received response in {end_time - start_time:.3f} seconds") - print(f"Response size: {len(response_data)} bytes") - - # Parse response - response = embedding_pb2.NodeEmbeddingResponse() - response.ParseFromString(response_data) - - print(f"Response dimensions: {list(response.dimensions)}") - print(f"Embeddings data size: {len(response.embeddings_data)} bytes") - print(f"Missing IDs: {list(response.missing_ids)}") - - # Calculate expected size - if len(response.dimensions) == 2: - batch_size = response.dimensions[0] - embedding_dim = response.dimensions[1] - expected_bytes = batch_size * embedding_dim * 4 # 4 bytes per float - print(f"Expected bytes: {expected_bytes}, Actual: {len(response.embeddings_data)}") - - if len(response.embeddings_data) == expected_bytes: - print("โ Response format is correct!") - return True - else: - print("โ Response format mismatch!") - return False - else: - print("โ Invalid response dimensions!") - return False - - except Exception as e: - print(f"โ Error during ZMQ test: {e}") - return False - finally: - # Clean up - server_process.terminate() - server_process.wait() - print("Server terminated") - - -if __name__ == "__main__": - success = test_zmq_with_same_model() - if success: - print("\nโ ZMQ communication test passed!") - else: - print("\nโ ZMQ communication test failed!") diff --git a/tests/README.md b/tests/README.md index 06274fd..22822bd 100644 --- a/tests/README.md +++ b/tests/README.md @@ -18,8 +18,8 @@ Basic functionality tests that verify: - Basic index building and searching works for both HNSW and DiskANN backends - Uses parametrized tests to test both backends -### `test_main_cli.py` -Tests the main CLI example functionality: +### `test_document_rag.py` +Tests the document RAG example functionality: - Tests with facebook/contriever embeddings - Tests with OpenAI embeddings (if API key is available) - Tests error handling with invalid parameters diff --git 
a/tests/test_ci_minimal.py b/tests/test_ci_minimal.py index 4207802..b884cbe 100644 --- a/tests/test_ci_minimal.py +++ b/tests/test_ci_minimal.py @@ -20,7 +20,7 @@ def test_package_imports(): def test_cli_help(): """Test that CLI example shows help.""" result = subprocess.run( - [sys.executable, "examples/main_cli_example.py", "--help"], capture_output=True, text=True + [sys.executable, "apps/document_rag.py", "--help"], capture_output=True, text=True ) assert result.returncode == 0 diff --git a/tests/test_main_cli.py b/tests/test_document_rag.py similarity index 82% rename from tests/test_main_cli.py rename to tests/test_document_rag.py index 4eb0e9f..97c5700 100644 --- a/tests/test_main_cli.py +++ b/tests/test_document_rag.py @@ -1,5 +1,5 @@ """ -Test main_cli_example functionality using pytest. +Test document_rag functionality using pytest. """ import os @@ -14,20 +14,20 @@ import pytest @pytest.fixture def test_data_dir(): """Return the path to test data directory.""" - return Path("examples/data") + return Path("data") @pytest.mark.skipif( os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues" ) -def test_main_cli_simulated(test_data_dir): - """Test main_cli with simulated LLM.""" +def test_document_rag_simulated(test_data_dir): + """Test document_rag with simulated LLM.""" with tempfile.TemporaryDirectory() as temp_dir: # Use a subdirectory that doesn't exist yet to force index creation index_dir = Path(temp_dir) / "test_index" cmd = [ sys.executable, - "examples/main_cli_example.py", + "apps/document_rag.py", "--llm", "simulated", "--embedding-model", @@ -53,19 +53,19 @@ def test_main_cli_simulated(test_data_dir): # Verify output output = result.stdout + result.stderr - assert "Leann index built at" in output or "Using existing index" in output + assert "Index saved to" in output or "Using existing index" in output assert "This is a simulated answer" in output @pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OpenAI API key not available") -def test_main_cli_openai(test_data_dir): - """Test main_cli with OpenAI embeddings.""" +def test_document_rag_openai(test_data_dir): + """Test document_rag with OpenAI embeddings.""" with tempfile.TemporaryDirectory() as temp_dir: # Use a subdirectory that doesn't exist yet to force index creation index_dir = Path(temp_dir) / "test_index_openai" cmd = [ sys.executable, - "examples/main_cli_example.py", + "apps/document_rag.py", "--llm", "simulated", # Use simulated LLM to avoid GPT-4 costs "--embedding-model", @@ -99,12 +99,12 @@ def test_main_cli_openai(test_data_dir): ) -def test_main_cli_error_handling(test_data_dir): - """Test main_cli with invalid parameters.""" +def test_document_rag_error_handling(test_data_dir): + """Test document_rag with invalid parameters.""" with tempfile.TemporaryDirectory() as temp_dir: cmd = [ sys.executable, - "examples/main_cli_example.py", + "apps/document_rag.py", "--llm", "invalid_llm_type", "--index-dir", @@ -117,4 +117,4 @@ def test_main_cli_error_handling(test_data_dir): # Should fail with invalid LLM type assert result.returncode != 0 - assert "Unknown LLM type" in result.stderr or "invalid_llm_type" in result.stderr + assert "invalid choice" in result.stderr or "invalid_llm_type" in result.stderr
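The final assertion change reflects standard argparse behavior: once `--llm` is validated with `choices`, an unrecognized value makes the program exit with a nonzero status and print an `invalid choice: ...` message to stderr, which is exactly what the rewritten test checks for. A minimal sketch of that behavior, assuming `apps/document_rag.py` declares `--llm` this way (the choice list below is illustrative, not the app's actual set):

```python
# Minimal sketch: why test_document_rag_error_handling now expects "invalid choice".
# Assumption: apps/document_rag.py validates --llm via argparse choices;
# the exact choice list here is illustrative only.
import argparse

parser = argparse.ArgumentParser(prog="document_rag")
parser.add_argument("--llm", choices=["openai", "ollama", "hf", "simulated"])

try:
    parser.parse_args(["--llm", "invalid_llm_type"])
except SystemExit as exc:
    # argparse prints to stderr:
    #   document_rag: error: argument --llm: invalid choice: 'invalid_llm_type' ...
    # and exits with status 2, so subprocess.run(...).returncode != 0 holds.
    print(f"exit status: {exc.code}")
```

This is why the test no longer greps for a hand-rolled "Unknown LLM type" string: the validation moved into the argument parser, so the error text comes from argparse itself.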