Fix email reader: treat max_count of -1 as "process all files"
This commit is contained in:
@@ -178,7 +178,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl
|
|||||||
```bash
|
```bash
|
||||||
# Core Parameters (General preprocessing for all examples)
|
# Core Parameters (General preprocessing for all examples)
|
||||||
--index-dir DIR # Directory to store the index (default: current directory)
|
--index-dir DIR # Directory to store the index (default: current directory)
|
||||||
--query "YOUR QUESTION" # Single query mode. Omit for interactive chat (type 'quit' to exit)
|
--query "YOUR QUESTION" # Single query mode. Omit to explore your index in an interactive chat (type 'quit' to exit)
|
||||||
--max-items N # Limit data preprocessing (default: -1, process all data)
|
--max-items N # Limit data preprocessing (default: -1, process all data)
|
||||||
--force-rebuild # Force rebuild index even if it exists
|
--force-rebuild # Force rebuild index even if it exists
|
||||||
|
|
||||||
|
|||||||
@@ -109,7 +109,7 @@ class BaseRAGExample(ABC):
|
|||||||
search_group.add_argument(
|
search_group.add_argument(
|
||||||
"--search-complexity",
|
"--search-complexity",
|
||||||
type=int,
|
type=int,
|
||||||
default=64,
|
default=32,
|
||||||
help="Search complexity for graph traversal (default: 64)",
|
help="Search complexity for graph traversal (default: 32)",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -52,6 +52,11 @@ class EmlxReader(BaseReader):
|
|||||||
docs: list[Document] = []
|
docs: list[Document] = []
|
||||||
max_count = load_kwargs.get("max_count", 1000)
|
max_count = load_kwargs.get("max_count", 1000)
|
||||||
count = 0
|
count = 0
|
||||||
|
total_files = 0
|
||||||
|
successful_files = 0
|
||||||
|
failed_files = 0
|
||||||
|
|
||||||
|
print(f"Starting to process directory: {input_dir}")
|
||||||
|
|
||||||
# Walk through the directory recursively
|
# Walk through the directory recursively
|
||||||
for dirpath, dirnames, filenames in os.walk(input_dir):
|
for dirpath, dirnames, filenames in os.walk(input_dir):
|
||||||
@@ -59,10 +64,12 @@ class EmlxReader(BaseReader):
|
|||||||
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
|
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
|
||||||
|
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
if count >= max_count:
|
# Check if we've reached the max count (skip if max_count == -1)
|
||||||
|
if max_count > 0 and count >= max_count:
|
||||||
break
|
break
|
||||||
|
|
||||||
if filename.endswith(".emlx"):
|
if filename.endswith(".emlx"):
|
||||||
|
total_files += 1
|
||||||
filepath = os.path.join(dirpath, filename)
|
filepath = os.path.join(dirpath, filename)
|
||||||
try:
|
try:
|
||||||
# Read the .emlx file
|
# Read the .emlx file
|
||||||
@@ -98,17 +105,26 @@ class EmlxReader(BaseReader):
|
|||||||
and not self.include_html
|
and not self.include_html
|
||||||
):
|
):
|
||||||
continue
|
continue
|
||||||
body += part.get_payload(decode=True).decode(
|
try:
|
||||||
"utf-8", errors="ignore"
|
payload = part.get_payload(decode=True)
|
||||||
)
|
if payload:
|
||||||
# break
|
body += payload.decode("utf-8", errors="ignore")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error decoding payload: {e}")
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
body = msg.get_payload(decode=True).decode(
|
try:
|
||||||
"utf-8", errors="ignore"
|
payload = msg.get_payload(decode=True)
|
||||||
)
|
if payload:
|
||||||
|
body = payload.decode("utf-8", errors="ignore")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error decoding single part payload: {e}")
|
||||||
|
body = ""
|
||||||
|
|
||||||
# Create document content with metadata embedded in text
|
# Only create document if we have some content
|
||||||
doc_content = f"""
|
if body.strip() or subject != "No Subject":
|
||||||
|
# Create document content with metadata embedded in text
|
||||||
|
doc_content = f"""
|
||||||
[File]: {filename}
|
[File]: {filename}
|
||||||
[From]: {from_addr}
|
[From]: {from_addr}
|
||||||
[To]: {to_addr}
|
[To]: {to_addr}
|
||||||
@@ -118,18 +134,34 @@ class EmlxReader(BaseReader):
|
|||||||
{body}
|
{body}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# No separate metadata - everything is in the text
|
# No separate metadata - everything is in the text
|
||||||
doc = Document(text=doc_content, metadata={})
|
doc = Document(text=doc_content, metadata={})
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
count += 1
|
count += 1
|
||||||
|
successful_files += 1
|
||||||
|
|
||||||
|
# Print first few successful files for debugging
|
||||||
|
if successful_files <= 3:
|
||||||
|
print(
|
||||||
|
f"Successfully loaded: {filename} - Subject: {subject[:50]}..."
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error parsing email from {filepath}: {e}")
|
failed_files += 1
|
||||||
|
if failed_files <= 5: # Only print first few errors
|
||||||
|
print(f"Error parsing email from {filepath}: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error reading file {filepath}: {e}")
|
failed_files += 1
|
||||||
|
if failed_files <= 5: # Only print first few errors
|
||||||
|
print(f"Error reading file {filepath}: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print(f"Loaded {len(docs)} email documents")
|
print("Processing summary:")
|
||||||
|
print(f" Total .emlx files found: {total_files}")
|
||||||
|
print(f" Successfully loaded: {successful_files}")
|
||||||
|
print(f" Failed to load: {failed_files}")
|
||||||
|
print(f" Final documents: {len(docs)}")
|
||||||
|
|
||||||
return docs
|
return docs
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ class EmailRAG(BaseRAGExample):
|
|||||||
print(f"Found {len(messages_dirs)} mail directories")
|
print(f"Found {len(messages_dirs)} mail directories")
|
||||||
|
|
||||||
# Create reader
|
# Create reader
|
||||||
reader = EmlxReader()
|
reader = EmlxReader(include_html=args.include_html)
|
||||||
|
|
||||||
# Process each directory
|
# Process each directory
|
||||||
all_documents = []
|
all_documents = []
|
||||||
@@ -93,18 +93,18 @@ class EmailRAG(BaseRAGExample):
|
|||||||
print(f"Found {len(emlx_files)} email files")
|
print(f"Found {len(emlx_files)} email files")
|
||||||
|
|
||||||
# Apply max_items limit per directory
|
# Apply max_items limit per directory
|
||||||
max_per_dir = -1
|
max_per_dir = -1 # Default to process all
|
||||||
if args.max_items > 0:
|
if args.max_items > 0:
|
||||||
remaining = args.max_items - total_processed
|
remaining = args.max_items - total_processed
|
||||||
if remaining <= 0:
|
if remaining <= 0:
|
||||||
break
|
break
|
||||||
max_per_dir = remaining
|
max_per_dir = remaining
|
||||||
|
# If args.max_items == -1, max_per_dir stays -1 (process all)
|
||||||
|
|
||||||
# Load emails
|
# Load emails from this directory (the directory is passed as input_dir)
|
||||||
documents = reader.load_data(
|
documents = reader.load_data(
|
||||||
file_path=str(messages_dir),
|
input_dir=str(messages_dir),
|
||||||
max_count=max_per_dir,
|
max_count=max_per_dir,
|
||||||
include_html=args.include_html,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if documents:
|
if documents:
|
||||||
@@ -121,6 +121,7 @@ class EmailRAG(BaseRAGExample):
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
print(f"\nTotal emails processed: {len(all_documents)}")
|
print(f"\nTotal emails processed: {len(all_documents)}")
|
||||||
|
print("now starting to split into text chunks ... take some time")
|
||||||
|
|
||||||
# Convert to text chunks
|
# Convert to text chunks
|
||||||
# Email reader uses chunk_overlap=25 as in original
|
# Email reader uses chunk_overlap=25 as in original
|
||||||
|
|||||||
@@ -97,6 +97,11 @@ class ChromeHistoryReader(BaseReader):
|
|||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error reading Chrome history: {e}")
|
print(f"Error reading Chrome history: {e}")
|
||||||
|
# Hint: the database file may be unavailable until the browser is closed
|
||||||
|
# Printed in red (ANSI escape) so the hint stands out
|
||||||
|
print(
|
||||||
|
"\033[91mYou may need to close your browser to make the database file available\033[0m"
|
||||||
|
)
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
return docs
|
return docs
|
||||||
|
|||||||
@@ -107,6 +107,10 @@ class WeChatRAG(BaseRAGExample):
|
|||||||
success = self._export_wechat_data(export_path)
|
success = self._export_wechat_data(export_path)
|
||||||
if not success:
|
if not success:
|
||||||
print("Failed to export WeChat data")
|
print("Failed to export WeChat data")
|
||||||
|
# Print a red hint: you may need to restart/quit WeChat to make the database file available
|
||||||
|
print(
|
||||||
|
"\033[91mYou may need to restart/quit your WeChat to make the database file available\033[0m"
|
||||||
|
)
|
||||||
return []
|
return []
|
||||||
else:
|
else:
|
||||||
print(f"Using existing WeChat export: {export_path}")
|
print(f"Using existing WeChat export: {export_path}")
|
||||||
|
|||||||
@@ -459,7 +459,13 @@ class LeannSearcher:
|
|||||||
|
|
||||||
self.meta_path_str = f"{index_path}.meta.json"
|
self.meta_path_str = f"{index_path}.meta.json"
|
||||||
if not Path(self.meta_path_str).exists():
|
if not Path(self.meta_path_str).exists():
|
||||||
raise FileNotFoundError(f"Leann metadata file not found at {self.meta_path_str}")
|
parent_dir = Path(index_path).parent
|
||||||
|
print(
|
||||||
|
f"Leann metadata file not found at {self.meta_path_str}, and you may need to rm -rf {parent_dir}"
|
||||||
|
)
|
||||||
|
raise FileNotFoundError(
|
||||||
|
f"Leann metadata file not found at {self.meta_path_str}, you may need to rm -rf {parent_dir}"
|
||||||
|
)
|
||||||
with open(self.meta_path_str, encoding="utf-8") as f:
|
with open(self.meta_path_str, encoding="utf-8") as f:
|
||||||
self.meta_data = json.load(f)
|
self.meta_data = json.load(f)
|
||||||
backend_name = self.meta_data["backend_name"]
|
backend_name = self.meta_data["backend_name"]
|
||||||
|
|||||||
Reference in New Issue
Block a user