Fix email reader: treat a max count of -1 as "process all files"

This commit is contained in:
yichuan520030910320
2025-08-03 22:27:04 -07:00
parent 568cf597f4
commit 87c930d705
7 changed files with 73 additions and 25 deletions

View File

@@ -178,7 +178,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
```bash
# Core Parameters (General preprocessing for all examples)
--index-dir DIR # Directory to store the index (default: current directory)
--query "YOUR QUESTION" # Single query mode. Omit for interactive chat (type 'quit' to exit)
--query "YOUR QUESTION" # Single query mode. Omit it to start an interactive chat with your index (type 'quit' to exit)
--max-items N # Limit data preprocessing (default: -1, process all data)
--force-rebuild # Force rebuild index even if it exists

View File

@@ -109,7 +109,7 @@ class BaseRAGExample(ABC):
search_group.add_argument(
"--search-complexity",
type=int,
default=64,
default=32,
help="Search complexity for graph traversal (default: 64)",
)

View File

@@ -52,6 +52,11 @@ class EmlxReader(BaseReader):
docs: list[Document] = []
max_count = load_kwargs.get("max_count", 1000)
count = 0
total_files = 0
successful_files = 0
failed_files = 0
print(f"Starting to process directory: {input_dir}")
# Walk through the directory recursively
for dirpath, dirnames, filenames in os.walk(input_dir):
@@ -59,10 +64,12 @@ class EmlxReader(BaseReader):
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
for filename in filenames:
if count >= max_count:
# Check if we've reached the max count (skip if max_count == -1)
if max_count > 0 and count >= max_count:
break
if filename.endswith(".emlx"):
total_files += 1
filepath = os.path.join(dirpath, filename)
try:
# Read the .emlx file
@@ -98,17 +105,26 @@ class EmlxReader(BaseReader):
and not self.include_html
):
continue
body += part.get_payload(decode=True).decode(
"utf-8", errors="ignore"
)
# break
try:
payload = part.get_payload(decode=True)
if payload:
body += payload.decode("utf-8", errors="ignore")
except Exception as e:
print(f"Error decoding payload: {e}")
continue
else:
body = msg.get_payload(decode=True).decode(
"utf-8", errors="ignore"
)
try:
payload = msg.get_payload(decode=True)
if payload:
body = payload.decode("utf-8", errors="ignore")
except Exception as e:
print(f"Error decoding single part payload: {e}")
body = ""
# Create document content with metadata embedded in text
doc_content = f"""
# Only create document if we have some content
if body.strip() or subject != "No Subject":
# Create document content with metadata embedded in text
doc_content = f"""
[File]: {filename}
[From]: {from_addr}
[To]: {to_addr}
@@ -118,18 +134,34 @@ class EmlxReader(BaseReader):
{body}
"""
# No separate metadata - everything is in the text
doc = Document(text=doc_content, metadata={})
docs.append(doc)
count += 1
# No separate metadata - everything is in the text
doc = Document(text=doc_content, metadata={})
docs.append(doc)
count += 1
successful_files += 1
# Print first few successful files for debugging
if successful_files <= 3:
print(
f"Successfully loaded: {filename} - Subject: {subject[:50]}..."
)
except Exception as e:
print(f"Error parsing email from {filepath}: {e}")
failed_files += 1
if failed_files <= 5: # Only print first few errors
print(f"Error parsing email from {filepath}: {e}")
continue
except Exception as e:
print(f"Error reading file {filepath}: {e}")
failed_files += 1
if failed_files <= 5: # Only print first few errors
print(f"Error reading file {filepath}: {e}")
continue
print(f"Loaded {len(docs)} email documents")
print("Processing summary:")
print(f" Total .emlx files found: {total_files}")
print(f" Successfully loaded: {successful_files}")
print(f" Failed to load: {failed_files}")
print(f" Final documents: {len(docs)}")
return docs

View File

@@ -78,7 +78,7 @@ class EmailRAG(BaseRAGExample):
print(f"Found {len(messages_dirs)} mail directories")
# Create reader
reader = EmlxReader()
reader = EmlxReader(include_html=args.include_html)
# Process each directory
all_documents = []
@@ -93,18 +93,18 @@ class EmailRAG(BaseRAGExample):
print(f"Found {len(emlx_files)} email files")
# Apply max_items limit per directory
max_per_dir = -1
max_per_dir = -1 # Default to process all
if args.max_items > 0:
remaining = args.max_items - total_processed
if remaining <= 0:
break
max_per_dir = remaining
# If args.max_items == -1, max_per_dir stays -1 (process all)
# Load emails
# Load emails - fix the parameter passing
documents = reader.load_data(
file_path=str(messages_dir),
input_dir=str(messages_dir),
max_count=max_per_dir,
include_html=args.include_html,
)
if documents:
@@ -121,6 +121,7 @@ class EmailRAG(BaseRAGExample):
return []
print(f"\nTotal emails processed: {len(all_documents)}")
print("now starting to split into text chunks ... take some time")
# Convert to text chunks
# Email reader uses chunk_overlap=25 as in original

View File

@@ -97,6 +97,11 @@ class ChromeHistoryReader(BaseReader):
except Exception as e:
print(f"Error reading Chrome history: {e}")
# Hint that the browser may need to be closed to make the database file available;
# the hint is highlighted in red.
print(
"\033[91mYou may need to close your browser to make the database file available\033[0m"
)
return docs
return docs

View File

@@ -107,6 +107,10 @@ class WeChatRAG(BaseRAGExample):
success = self._export_wechat_data(export_path)
if not success:
print("Failed to export WeChat data")
# Print a hint in red: you may need to restart/quit WeChat to make the database file available
print(
"\033[91mYou may need to restart/quit your WeChat to make the database file available\033[0m"
)
return []
else:
print(f"Using existing WeChat export: {export_path}")

View File

@@ -459,7 +459,13 @@ class LeannSearcher:
self.meta_path_str = f"{index_path}.meta.json"
if not Path(self.meta_path_str).exists():
raise FileNotFoundError(f"Leann metadata file not found at {self.meta_path_str}")
parent_dir = Path(index_path).parent
print(
f"Leann metadata file not found at {self.meta_path_str}, and you may need to rm -rf {parent_dir}"
)
raise FileNotFoundError(
f"Leann metadata file not found at {self.meta_path_str}, you may need to rm -rf {parent_dir}"
)
with open(self.meta_path_str, encoding="utf-8") as f:
self.meta_data = json.load(f)
backend_name = self.meta_data["backend_name"]