- Create BaseRAGExample base class for all RAG examples
- Refactor 4 examples to use unified interface:
  - document_rag.py (replaces main_cli_example.py)
  - email_rag.py (replaces mail_reader_leann.py)
  - browser_rag.py (replaces google_history_reader_leann.py)
  - wechat_rag.py (replaces wechat_history_reader_leann.py)
- Maintain 100% parameter compatibility with original files
- Add interactive mode support for all examples
- Unify parameter names (--max-items replaces --max-emails/--max-entries)
- Update README.md with new examples usage
- Add PARAMETER_CONSISTENCY.md documenting all parameter mappings
- Keep main_cli_example.py for backward compatibility with migration notice

All default values, LeannBuilder parameters, and chunking settings remain identical to ensure full compatibility with existing indexes.
158 lines
5.3 KiB
Python
158 lines
5.3 KiB
Python
"""
|
|
Browser History RAG example using the unified interface.
|
|
Supports Chrome browser history.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from base_rag_example import BaseRAGExample, create_text_chunks
|
|
from history_data.history import ChromeHistoryReader
|
|
|
|
|
|
class BrowserRAG(BaseRAGExample):
    """RAG example for Chrome browser history.

    Resolves one or more Chrome profile directories, reads their history
    through ``ChromeHistoryReader`` and turns the resulting documents into
    text chunks with ``create_text_chunks``.
    """

    def __init__(self):
        """Register this example's name, description and default index name."""
        super().__init__(
            name="Browser History",
            description="Process and query Chrome browser history with LEANN",
            # Same index name the original script used ("./google_history_index").
            default_index_name="google_history_index",
        )

    def _add_specific_arguments(self, parser):
        """Add browser-specific arguments.

        Args:
            parser: Argument parser to extend; must provide
                ``add_argument_group``.
        """
        browser_group = parser.add_argument_group("Browser Parameters")
        browser_group.add_argument(
            "--chrome-profile",
            type=str,
            default=None,
            help="Path to Chrome profile directory (auto-detected if not specified)",
        )
        # NOTE: ``store_true`` together with ``default=True`` means the parsed
        # value is always True, whether or not the flag is passed. The option
        # is kept so existing command lines keep parsing; ``load_data`` does
        # not depend on its value.
        browser_group.add_argument(
            "--auto-find-profiles",
            action="store_true",
            default=True,
            help="Automatically find all Chrome profiles (default: True)",
        )

    def _get_chrome_base_path(self) -> Path:
        """Get the base Chrome profile path based on OS.

        Returns:
            The directory expected to contain the per-profile folders on
            macOS, Linux or Windows.

        Raises:
            ValueError: If ``sys.platform`` is none of the handled platforms.
            KeyError: On Windows, if ``LOCALAPPDATA`` is not set.
        """
        if sys.platform == "darwin":
            return Path.home() / "Library" / "Application Support" / "Google" / "Chrome"
        elif sys.platform.startswith("linux"):
            return Path.home() / ".config" / "google-chrome"
        elif sys.platform == "win32":
            return Path(os.environ["LOCALAPPDATA"]) / "Google" / "Chrome" / "User Data"
        else:
            raise ValueError(f"Unsupported platform: {sys.platform}")

    def _find_chrome_profiles(self) -> List[Path]:
        """Auto-detect all Chrome profiles.

        A directory counts as a profile when it is named ``Default`` or
        starts with ``"Profile "`` and contains a ``History`` entry.

        Returns:
            The ``Default`` profile first (when present), followed by the
            numbered profiles in sorted path order; an empty list when the
            base directory does not exist.
        """
        base_path = self._get_chrome_base_path()
        if not base_path.exists():
            return []

        profiles = []

        # Check Default profile
        default_profile = base_path / "Default"
        if default_profile.exists() and (default_profile / "History").exists():
            profiles.append(default_profile)

        # Check numbered profiles. iterdir() yields entries in arbitrary
        # order; sorting keeps the processing order (and therefore which
        # profiles are consumed under --max-items) deterministic.
        for item in sorted(base_path.iterdir()):
            if (
                item.is_dir()
                and item.name.startswith("Profile ")
                and (item / "History").exists()
            ):
                profiles.append(item)

        return profiles

    async def load_data(self, args) -> List[str]:
        """Load browser history and convert to text chunks.

        Args:
            args: Parsed command-line arguments. Reads ``chrome_profile`` and
                ``max_items`` (``max_items`` is not defined in this class;
                presumably supplied by the base class parser).

        Returns:
            The value produced by ``create_text_chunks`` for all loaded
            documents, or an empty list when no profile or no history entry
            was found.
        """
        # Determine Chrome profiles. An explicitly given profile is always
        # honoured as-is: previously it was only used to filter the
        # auto-detected list, so a profile outside the standard location was
        # silently dropped and the user was told to "specify --chrome-profile
        # manually" even though they had.
        if args.chrome_profile:
            profile_dirs = [Path(args.chrome_profile)]
        else:
            print("Auto-detecting Chrome profiles...")
            profile_dirs = self._find_chrome_profiles()

        if not profile_dirs:
            print("No Chrome profiles found!")
            print("Please specify --chrome-profile manually")
            return []

        print(f"Found {len(profile_dirs)} Chrome profiles")

        # Create reader
        reader = ChromeHistoryReader()

        # Process each profile
        all_documents = []
        total_processed = 0

        for i, profile_dir in enumerate(profile_dirs):
            print(f"\nProcessing profile {i + 1}/{len(profile_dirs)}: {profile_dir.name}")

            # max_items is a budget shared across profiles: each profile may
            # load only what is left. -1 is passed when no positive limit is
            # set (presumably "no limit" for the reader -- confirm).
            max_per_profile = -1
            if args.max_items > 0:
                remaining = args.max_items - total_processed
                if remaining <= 0:
                    break
                max_per_profile = remaining

            # Best effort per profile: a failure in one profile is reported
            # and must not prevent the remaining profiles from loading.
            try:
                documents = reader.load_data(
                    chrome_profile_path=str(profile_dir),
                    max_count=max_per_profile,
                )
            except Exception as e:
                print(f"Error processing {profile_dir}: {e}")
                continue

            if documents:
                all_documents.extend(documents)
                total_processed += len(documents)
                print(f"Processed {len(documents)} history entries from this profile")

        if not all_documents:
            print("No browser history found to process!")
            return []

        print(f"\nTotal history entries processed: {len(all_documents)}")

        # Convert to text chunks
        all_texts = create_text_chunks(all_documents)

        return all_texts
|
|
|
|
|
|
if __name__ == "__main__":
    import asyncio

    # Intro banner: sample queries plus a reminder to close Chrome first.
    # Each entry is emitted with its own print() call, one per line.
    intro_lines = (
        "\n🌐 Browser History RAG Example",
        "=" * 50,
        "\nExample queries you can try:",
        "- 'What websites did I visit about machine learning?'",
        "- 'Find my search history about programming'",
        "- 'What YouTube videos did I watch recently?'",
        "- 'Show me websites about travel planning'",
        "\nNote: Make sure Chrome is closed before running\n",
    )
    for intro_line in intro_lines:
        print(intro_line)

    # Build the example and drive its async entry point to completion.
    asyncio.run(BrowserRAG().run())
|