From 87c930d70501113b582b4cc96ee33ae57aa81945 Mon Sep 17 00:00:00 2001
From: yichuan520030910320 <yichuan_wang@berkeley.edu>
Date: Sun, 3 Aug 2025 22:27:04 -0700
Subject: [PATCH] fix email reader: treat max_count of -1 as "process all files"

---
 README.md                                 |  2 +-
 examples/base_rag_example.py              |  2 +-
 examples/email_data/LEANN_email_reader.py | 66 +++++++++++++++++------
 examples/email_rag.py                     | 11 ++--
 examples/history_data/history.py          |  5 ++
 examples/wechat_rag.py                    |  4 ++
 packages/leann-core/src/leann/api.py      |  8 ++-
 7 files changed, 73 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 114912c..2150613 100755
--- a/README.md
+++ b/README.md
@@ -178,7 +178,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
 ```bash
 # Core Parameters (General preprocessing for all examples)
 --index-dir DIR          # Directory to store the index (default: current directory)
---query "YOUR QUESTION"  # Single query mode. Omit for interactive chat (type 'quit' to exit)
+--query "YOUR QUESTION"  # Single query mode. Omit to start an interactive chat with your index (type 'quit' to exit)
 --max-items N           # Limit data preprocessing (default: -1, process all data)
 --force-rebuild         # Force rebuild index even if it exists
 
diff --git a/examples/base_rag_example.py b/examples/base_rag_example.py
index a164b3c..a135625 100644
--- a/examples/base_rag_example.py
+++ b/examples/base_rag_example.py
@@ -109,7 +109,7 @@ class BaseRAGExample(ABC):
         search_group.add_argument(
             "--search-complexity",
             type=int,
-            default=64,
+            default=32,
             help="Search complexity for graph traversal (default: 64)",
         )
 
diff --git a/examples/email_data/LEANN_email_reader.py b/examples/email_data/LEANN_email_reader.py
index 393daf6..407e2ae 100644
--- a/examples/email_data/LEANN_email_reader.py
+++ b/examples/email_data/LEANN_email_reader.py
@@ -52,6 +52,11 @@ class EmlxReader(BaseReader):
         docs: list[Document] = []
         max_count = load_kwargs.get("max_count", 1000)
         count = 0
+        total_files = 0
+        successful_files = 0
+        failed_files = 0
+
+        print(f"Starting to process directory: {input_dir}")
 
         # Walk through the directory recursively
         for dirpath, dirnames, filenames in os.walk(input_dir):
@@ -59,10 +64,12 @@ class EmlxReader(BaseReader):
             dirnames[:] = [d for d in dirnames if not d.startswith(".")]
 
             for filename in filenames:
-                if count >= max_count:
+                # Stop once the limit is reached; a non-positive max_count (e.g. -1) means no limit
+                if max_count > 0 and count >= max_count:
                     break
 
                 if filename.endswith(".emlx"):
+                    total_files += 1
                     filepath = os.path.join(dirpath, filename)
                     try:
                         # Read the .emlx file
@@ -98,17 +105,26 @@ class EmlxReader(BaseReader):
                                                 and not self.include_html
                                             ):
                                                 continue
-                                            body += part.get_payload(decode=True).decode(
-                                                "utf-8", errors="ignore"
-                                            )
-                                            # break
+                                            try:
+                                                payload = part.get_payload(decode=True)
+                                                if payload:
+                                                    body += payload.decode("utf-8", errors="ignore")
+                                            except Exception as e:
+                                                print(f"Error decoding payload: {e}")
+                                                continue
                                 else:
-                                    body = msg.get_payload(decode=True).decode(
-                                        "utf-8", errors="ignore"
-                                    )
+                                    try:
+                                        payload = msg.get_payload(decode=True)
+                                        if payload:
+                                            body = payload.decode("utf-8", errors="ignore")
+                                    except Exception as e:
+                                        print(f"Error decoding single part payload: {e}")
+                                        body = ""
 
-                                # Create document content with metadata embedded in text
-                                doc_content = f"""
+                                # Only create document if we have some content
+                                if body.strip() or subject != "No Subject":
+                                    # Create document content with metadata embedded in text
+                                    doc_content = f"""
 [File]: {filename}
 [From]: {from_addr}
 [To]: {to_addr}
@@ -118,18 +134,34 @@ class EmlxReader(BaseReader):
 {body}
 """
 
-                                # No separate metadata - everything is in the text
-                                doc = Document(text=doc_content, metadata={})
-                                docs.append(doc)
-                                count += 1
+                                    # No separate metadata - everything is in the text
+                                    doc = Document(text=doc_content, metadata={})
+                                    docs.append(doc)
+                                    count += 1
+                                    successful_files += 1
+
+                                    # Print first few successful files for debugging
+                                    if successful_files <= 3:
+                                        print(
+                                            f"Successfully loaded: {filename} - Subject: {subject[:50]}..."
+                                        )
 
                             except Exception as e:
-                                print(f"Error parsing email from {filepath}: {e}")
+                                failed_files += 1
+                                if failed_files <= 5:  # Only print first few errors
+                                    print(f"Error parsing email from {filepath}: {e}")
                                 continue
 
                     except Exception as e:
-                        print(f"Error reading file {filepath}: {e}")
+                        failed_files += 1
+                        if failed_files <= 5:  # Only print first few errors
+                            print(f"Error reading file {filepath}: {e}")
                         continue
 
-        print(f"Loaded {len(docs)} email documents")
+        print("Processing summary:")
+        print(f"  Total .emlx files found: {total_files}")
+        print(f"  Successfully loaded: {successful_files}")
+        print(f"  Failed to load: {failed_files}")
+        print(f"  Final documents: {len(docs)}")
+
         return docs
diff --git a/examples/email_rag.py b/examples/email_rag.py
index b3c5483..5d040b0 100644
--- a/examples/email_rag.py
+++ b/examples/email_rag.py
@@ -78,7 +78,7 @@ class EmailRAG(BaseRAGExample):
         print(f"Found {len(messages_dirs)} mail directories")
 
         # Create reader
-        reader = EmlxReader()
+        reader = EmlxReader(include_html=args.include_html)
 
         # Process each directory
         all_documents = []
@@ -93,18 +93,18 @@ class EmailRAG(BaseRAGExample):
                 print(f"Found {len(emlx_files)} email files")
 
                 # Apply max_items limit per directory
-                max_per_dir = -1
+                max_per_dir = -1  # Default to process all
                 if args.max_items > 0:
                     remaining = args.max_items - total_processed
                     if remaining <= 0:
                         break
                     max_per_dir = remaining
+                # If args.max_items == -1, max_per_dir stays -1 (process all)
 
-                # Load emails
+                # Load emails (HTML handling is configured on the reader at construction)
                 documents = reader.load_data(
-                    file_path=str(messages_dir),
+                    input_dir=str(messages_dir),
                     max_count=max_per_dir,
-                    include_html=args.include_html,
                 )
 
                 if documents:
@@ -121,6 +121,7 @@ class EmailRAG(BaseRAGExample):
             return []
 
         print(f"\nTotal emails processed: {len(all_documents)}")
+        print("now starting to split into text chunks ... take some time")
 
         # Convert to text chunks
         # Email reader uses chunk_overlap=25 as in original
diff --git a/examples/history_data/history.py b/examples/history_data/history.py
index 4125244..bb2eac1 100644
--- a/examples/history_data/history.py
+++ b/examples/history_data/history.py
@@ -97,6 +97,11 @@ class ChromeHistoryReader(BaseReader):
 
         except Exception as e:
             print(f"Error reading Chrome history: {e}")
+            # The browser may still be holding the database file, so also print a hint
+            # in red (ANSI escape code 91) suggesting the user close it
+            print(
+                "\033[91mYou may need to close your browser to make the database file available\033[0m"
+            )
             return docs
 
         return docs
diff --git a/examples/wechat_rag.py b/examples/wechat_rag.py
index a071f89..555d451 100644
--- a/examples/wechat_rag.py
+++ b/examples/wechat_rag.py
@@ -107,6 +107,10 @@ class WeChatRAG(BaseRAGExample):
             success = self._export_wechat_data(export_path)
             if not success:
                 print("Failed to export WeChat data")
+                # Print a red hint: the user may need to restart/quit WeChat to make the database file available
+                print(
+                    "\033[91mYou may need to restart/quit your WeChat to make the database file available\033[0m"
+                )
                 return []
         else:
             print(f"Using existing WeChat export: {export_path}")
diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py
index 9efefde..1e51774 100644
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -459,7 +459,13 @@ class LeannSearcher:
 
         self.meta_path_str = f"{index_path}.meta.json"
         if not Path(self.meta_path_str).exists():
-            raise FileNotFoundError(f"Leann metadata file not found at {self.meta_path_str}")
+            parent_dir = Path(index_path).parent
+            print(
+                f"Leann metadata file not found at {self.meta_path_str}, and you may need to rm -rf {parent_dir}"
+            )
+            raise FileNotFoundError(
+                f"Leann metadata file not found at {self.meta_path_str}, you may need to rm -rf {parent_dir}"
+            )
         with open(self.meta_path_str, encoding="utf-8") as f:
             self.meta_data = json.load(f)
         backend_name = self.meta_data["backend_name"]