Fix email reader: a max_count of -1 now processes all files

2025-08-03 22:27:04 -07:00
parent 568cf597f4
commit 87c930d705
7 changed files with 73 additions and 25 deletions
--- a/README.md
+++ b/README.md
@@ -178,7 +178,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
 ```bash
 # Core Parameters (General preprocessing for all examples)
 --index-dir DIR          # Directory to store the index (default: current directory)
--query "YOUR QUESTION"  # Single query mode. Omit for interactive chat (type 'quit' to exit)
+--query "YOUR QUESTION"  # Single query mode. Omit to start an interactive chat with your index (type 'quit' to exit)
 --max-items N           # Limit data preprocessing (default: -1, process all data)
 --force-rebuild         # Force rebuild index even if it exists

--- a/examples/base_rag_example.py
+++ b/examples/base_rag_example.py
@@ -109,7 +109,7 @@ class BaseRAGExample(ABC):
        search_group.add_argument(
            "--search-complexity",
            type=int,
-            default=64,
+            default=32,
            help="Search complexity for graph traversal (default: 64)",
        )

--- a/examples/email_data/LEANN_email_reader.py
+++ b/examples/email_data/LEANN_email_reader.py
@@ -52,6 +52,11 @@ class EmlxReader(BaseReader):
        docs: list[Document] = []
        max_count = load_kwargs.get("max_count", 1000)
        count = 0
+        total_files = 0
+        successful_files = 0
+        failed_files = 0
+
+        print(f"Starting to process directory: {input_dir}")

        # Walk through the directory recursively
        for dirpath, dirnames, filenames in os.walk(input_dir):
@@ -59,10 +64,12 @@ class EmlxReader(BaseReader):
            dirnames[:] = [d for d in dirnames if not d.startswith(".")]

            for filename in filenames:
-                if count >= max_count:
+                # Stop once the max count is reached (no limit when max_count <= 0, e.g. -1)
+                if max_count > 0 and count >= max_count:
                    break

                if filename.endswith(".emlx"):
+                    total_files += 1
                    filepath = os.path.join(dirpath, filename)
                    try:
                        # Read the .emlx file
@@ -98,17 +105,26 @@ class EmlxReader(BaseReader):
                                                and not self.include_html
                                            ):
                                                continue
-                                            body += part.get_payload(decode=True).decode(
-                                                "utf-8", errors="ignore"
-                                            )
-                                            # break
+                                            try:
+                                                payload = part.get_payload(decode=True)
+                                                if payload:
+                                                    body += payload.decode("utf-8", errors="ignore")
+                                            except Exception as e:
+                                                print(f"Error decoding payload: {e}")
+                                                continue
                                else:
-                                    body = msg.get_payload(decode=True).decode(
-                                        "utf-8", errors="ignore"
-                                    )
+                                    try:
+                                        payload = msg.get_payload(decode=True)
+                                        if payload:
+                                            body = payload.decode("utf-8", errors="ignore")
+                                    except Exception as e:
+                                        print(f"Error decoding single part payload: {e}")
+                                        body = ""

-                                # Create document content with metadata embedded in text
-                                doc_content = f"""
+                                # Only create document if we have some content
+                                if body.strip() or subject != "No Subject":
+                                    # Create document content with metadata embedded in text
+                                    doc_content = f"""
 [File]: {filename}
 [From]: {from_addr}
 [To]: {to_addr}
@@ -118,18 +134,34 @@ class EmlxReader(BaseReader):
 {body}
 """

-                                # No separate metadata - everything is in the text
-                                doc = Document(text=doc_content, metadata={})
-                                docs.append(doc)
-                                count += 1
+                                    # No separate metadata - everything is in the text
+                                    doc = Document(text=doc_content, metadata={})
+                                    docs.append(doc)
+                                    count += 1
+                                    successful_files += 1
+
+                                    # Print first few successful files for debugging
+                                    if successful_files <= 3:
+                                        print(
+                                            f"Successfully loaded: {filename} - Subject: {subject[:50]}..."
+                                        )

                            except Exception as e:
-                                print(f"Error parsing email from {filepath}: {e}")
+                                failed_files += 1
+                                if failed_files <= 5:  # Only print first few errors
+                                    print(f"Error parsing email from {filepath}: {e}")
                                continue

                    except Exception as e:
-                        print(f"Error reading file {filepath}: {e}")
+                        failed_files += 1
+                        if failed_files <= 5:  # Only print first few errors
+                            print(f"Error reading file {filepath}: {e}")
                        continue

-        print(f"Loaded {len(docs)} email documents")
+        print("Processing summary:")
+        print(f"  Total .emlx files found: {total_files}")
+        print(f"  Successfully loaded: {successful_files}")
+        print(f"  Failed to load: {failed_files}")
+        print(f"  Final documents: {len(docs)}")
+
        return docs
--- a/examples/email_rag.py
+++ b/examples/email_rag.py
@@ -78,7 +78,7 @@ class EmailRAG(BaseRAGExample):
        print(f"Found {len(messages_dirs)} mail directories")

        # Create reader
-        reader = EmlxReader()
+        reader = EmlxReader(include_html=args.include_html)

        # Process each directory
        all_documents = []
@@ -93,18 +93,18 @@ class EmailRAG(BaseRAGExample):
                print(f"Found {len(emlx_files)} email files")

                # Apply max_items limit per directory
-                max_per_dir = -1
+                max_per_dir = -1  # Default to process all
                if args.max_items > 0:
                    remaining = args.max_items - total_processed
                    if remaining <= 0:
                        break
                    max_per_dir = remaining
+                # If args.max_items == -1, max_per_dir stays -1 (process all)

-                # Load emails
+                # Load emails (HTML inclusion is configured on the reader, not per call)
                documents = reader.load_data(
-                    file_path=str(messages_dir),
+                    input_dir=str(messages_dir),
                    max_count=max_per_dir,
-                    include_html=args.include_html,
                )

                if documents:
@@ -121,6 +121,7 @@ class EmailRAG(BaseRAGExample):
            return []

        print(f"\nTotal emails processed: {len(all_documents)}")
+        print("now starting to split into text chunks ... take some time")

        # Convert to text chunks
        # Email reader uses chunk_overlap=25 as in original
--- a/examples/history_data/history.py
+++ b/examples/history_data/history.py
@@ -97,6 +97,11 @@ class ChromeHistoryReader(BaseReader):

        except Exception as e:
            print(f"Error reading Chrome history: {e}")
+            # Hint (in red, via ANSI escape codes) that the browser may need to be
+            # closed before the history database file becomes available.
+            print(
+                "\033[91mYou may need to close your browser to make the database file available\033[0m"
+            )
            return docs

        return docs
--- a/examples/wechat_rag.py
+++ b/examples/wechat_rag.py
@@ -107,6 +107,10 @@ class WeChatRAG(BaseRAGExample):
            success = self._export_wechat_data(export_path)
            if not success:
                print("Failed to export WeChat data")
+                # Hint (in red) that WeChat may need to be restarted/quit to make the database file available
+                print(
+                    "\033[91mYou may need to restart/quit your WeChat to make the database file available\033[0m"
+                )
                return []
        else:
            print(f"Using existing WeChat export: {export_path}")
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -459,7 +459,13 @@ class LeannSearcher:

        self.meta_path_str = f"{index_path}.meta.json"
        if not Path(self.meta_path_str).exists():
-            raise FileNotFoundError(f"Leann metadata file not found at {self.meta_path_str}")
+            parent_dir = Path(index_path).parent
+            print(
+                f"Leann metadata file not found at {self.meta_path_str}, and you may need to rm -rf {parent_dir}"
+            )
+            raise FileNotFoundError(
+                f"Leann metadata file not found at {self.meta_path_str}, you may need to rm -rf {parent_dir}"
+            )
        with open(self.meta_path_str, encoding="utf-8") as f:
            self.meta_data = json.load(f)
        backend_name = self.meta_data["backend_name"]