From 87c930d70501113b582b4cc96ee33ae57aa81945 Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Sun, 3 Aug 2025 22:27:04 -0700 Subject: [PATCH] fix email wrong -1 to process all file --- README.md | 2 +- examples/base_rag_example.py | 2 +- examples/email_data/LEANN_email_reader.py | 66 +++++++++++++++++------ examples/email_rag.py | 11 ++-- examples/history_data/history.py | 5 ++ examples/wechat_rag.py | 4 ++ packages/leann-core/src/leann/api.py | 8 ++- 7 files changed, 73 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 114912c..2150613 100755 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl ```bash # Core Parameters (General preprocessing for all examples) --index-dir DIR # Directory to store the index (default: current directory) ---query "YOUR QUESTION" # Single query mode. Omit for interactive chat (type 'quit' to exit) +--query "YOUR QUESTION" # Single query mode. Omit for interactive chat (type 'quit' to exit), and now you can play with your index interactively --max-items N # Limit data preprocessing (default: -1, process all data) --force-rebuild # Force rebuild index even if it exists diff --git a/examples/base_rag_example.py b/examples/base_rag_example.py index a164b3c..a135625 100644 --- a/examples/base_rag_example.py +++ b/examples/base_rag_example.py @@ -109,7 +109,7 @@ class BaseRAGExample(ABC): search_group.add_argument( "--search-complexity", type=int, - default=64, + default=32, help="Search complexity for graph traversal (default: 64)", ) diff --git a/examples/email_data/LEANN_email_reader.py b/examples/email_data/LEANN_email_reader.py index 393daf6..407e2ae 100644 --- a/examples/email_data/LEANN_email_reader.py +++ b/examples/email_data/LEANN_email_reader.py @@ -52,6 +52,11 @@ class EmlxReader(BaseReader): docs: list[Document] = [] max_count = load_kwargs.get("max_count", 1000) count = 0 + total_files = 0 + successful_files = 0 + failed_files = 0 + + print(f"Starting to process directory: {input_dir}") # Walk through the directory recursively for dirpath, dirnames, filenames in os.walk(input_dir): @@ -59,10 +64,12 @@ class EmlxReader(BaseReader): dirnames[:] = [d for d in dirnames if not d.startswith(".")] for filename in filenames: - if count >= max_count: + # Check if we've reached the max count (skip if max_count == -1) + if max_count > 0 and count >= max_count: break if filename.endswith(".emlx"): + total_files += 1 filepath = os.path.join(dirpath, filename) try: # Read the .emlx file @@ -98,17 +105,26 @@ class EmlxReader(BaseReader): and not self.include_html ): continue - body += part.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) - # break + try: + payload = part.get_payload(decode=True) + if payload: + body += payload.decode("utf-8", errors="ignore") + except Exception as e: + print(f"Error decoding payload: {e}") + continue else: - body = msg.get_payload(decode=True).decode( - "utf-8", errors="ignore" - ) + try: + payload = msg.get_payload(decode=True) + if payload: + body = payload.decode("utf-8", errors="ignore") + except Exception as e: + print(f"Error decoding single part payload: {e}") + body = "" - # Create document content with metadata embedded in text - doc_content = f""" + # Only create document if we have some content + if body.strip() or subject != "No Subject": + # Create document content with metadata embedded in text + doc_content = f""" [File]: {filename} [From]: {from_addr} [To]: {to_addr} @@ -118,18 +134,34 @@ class EmlxReader(BaseReader): {body} """ - # No separate metadata - everything is in the text - doc = Document(text=doc_content, metadata={}) - docs.append(doc) - count += 1 + # No separate metadata - everything is in the text + doc = Document(text=doc_content, metadata={}) + docs.append(doc) + count += 1 + successful_files += 1 + + # Print first few successful files for debugging + if successful_files <= 3: + print( + f"Successfully loaded: {filename} - Subject: {subject[:50]}..." + ) except Exception as e: - print(f"Error parsing email from {filepath}: {e}") + failed_files += 1 + if failed_files <= 5: # Only print first few errors + print(f"Error parsing email from {filepath}: {e}") continue except Exception as e: - print(f"Error reading file {filepath}: {e}") + failed_files += 1 + if failed_files <= 5: # Only print first few errors + print(f"Error reading file {filepath}: {e}") continue - print(f"Loaded {len(docs)} email documents") + print("Processing summary:") + print(f" Total .emlx files found: {total_files}") + print(f" Successfully loaded: {successful_files}") + print(f" Failed to load: {failed_files}") + print(f" Final documents: {len(docs)}") + return docs diff --git a/examples/email_rag.py b/examples/email_rag.py index b3c5483..5d040b0 100644 --- a/examples/email_rag.py +++ b/examples/email_rag.py @@ -78,7 +78,7 @@ class EmailRAG(BaseRAGExample): print(f"Found {len(messages_dirs)} mail directories") # Create reader - reader = EmlxReader() + reader = EmlxReader(include_html=args.include_html) # Process each directory all_documents = [] @@ -93,18 +93,18 @@ class EmailRAG(BaseRAGExample): print(f"Found {len(emlx_files)} email files") # Apply max_items limit per directory - max_per_dir = -1 + max_per_dir = -1 # Default to process all if args.max_items > 0: remaining = args.max_items - total_processed if remaining <= 0: break max_per_dir = remaining + # If args.max_items == -1, max_per_dir stays -1 (process all) - # Load emails + # Load emails - fix the parameter passing documents = reader.load_data( - file_path=str(messages_dir), + input_dir=str(messages_dir), max_count=max_per_dir, - include_html=args.include_html, ) if documents: @@ -121,6 +121,7 @@ class EmailRAG(BaseRAGExample): return [] print(f"\nTotal emails processed: {len(all_documents)}") + print("now starting to split into text chunks ... take some time") # Convert to text chunks # Email reader uses chunk_overlap=25 as in original diff --git a/examples/history_data/history.py b/examples/history_data/history.py index 4125244..bb2eac1 100644 --- a/examples/history_data/history.py +++ b/examples/history_data/history.py @@ -97,6 +97,11 @@ class ChromeHistoryReader(BaseReader): except Exception as e: print(f"Error reading Chrome history: {e}") + # add you may need to close your browser to make the database file available + # also highlight in red + print( + "\033[91mYou may need to close your browser to make the database file available\033[0m" + ) return docs return docs diff --git a/examples/wechat_rag.py b/examples/wechat_rag.py index a071f89..555d451 100644 --- a/examples/wechat_rag.py +++ b/examples/wechat_rag.py @@ -107,6 +107,10 @@ class WeChatRAG(BaseRAGExample): success = self._export_wechat_data(export_path) if not success: print("Failed to export WeChat data") + # add red: you may nned to restart your wechat to make the database file available + print( + "\033[91mYou may need to restart/quit your WeChat to make the database file available\033[0m" + ) return [] else: print(f"Using existing WeChat export: {export_path}") diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 9efefde..1e51774 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -459,7 +459,13 @@ class LeannSearcher: self.meta_path_str = f"{index_path}.meta.json" if not Path(self.meta_path_str).exists(): - raise FileNotFoundError(f"Leann metadata file not found at {self.meta_path_str}") + parent_dir = Path(index_path).parent + print( + f"Leann metadata file not found at {self.meta_path_str}, and you may need to rm -rf {parent_dir}" + ) + raise FileNotFoundError( + f"Leann metadata file not found at {self.meta_path_str}, you may need to rm -rf {parent_dir}" + ) with open(self.meta_path_str, encoding="utf-8") as f: self.meta_data = json.load(f) backend_name = self.meta_data["backend_name"]