fix: resolve all ruff linting errors and add lint CI check
- Fix ambiguous fullwidth characters (commas, parentheses) in strings and comments
- Replace Chinese comments with English equivalents
- Fix unused imports with proper noqa annotations for intentional imports
- Fix bare except clauses with specific exception types
- Fix redefined variables and undefined names
- Add ruff noqa annotations for generated protobuf files
- Add lint and format check to GitHub Actions CI pipeline
This commit is contained in:
@@ -3,14 +3,15 @@
|
||||
Memory comparison between Faiss HNSW and LEANN HNSW backend
|
||||
"""
|
||||
|
||||
import gc
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import psutil
|
||||
import gc
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import psutil
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
# Setup logging
|
||||
@@ -83,9 +84,7 @@ def test_faiss_hnsw():
|
||||
|
||||
for line in lines:
|
||||
if "Peak Memory:" in line:
|
||||
peak_memory = float(
|
||||
line.split("Peak Memory:")[1].split("MB")[0].strip()
|
||||
)
|
||||
peak_memory = float(line.split("Peak Memory:")[1].split("MB")[0].strip())
|
||||
|
||||
return {"peak_memory": peak_memory}
|
||||
|
||||
@@ -111,9 +110,8 @@ def test_leann_hnsw():
|
||||
|
||||
tracker.checkpoint("After imports")
|
||||
|
||||
from leann.api import LeannBuilder
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
|
||||
# Load and parse documents
|
||||
documents = SimpleDirectoryReader(
|
||||
@@ -197,16 +195,14 @@ def test_leann_hnsw():
|
||||
runtime_start_mem = get_memory_usage()
|
||||
print(f"Before load memory: {runtime_start_mem:.1f} MB")
|
||||
tracker.checkpoint("Before load memory")
|
||||
|
||||
|
||||
# Load searcher
|
||||
searcher = LeannSearcher(index_path)
|
||||
tracker.checkpoint("After searcher loading")
|
||||
|
||||
|
||||
|
||||
print("Running search queries...")
|
||||
queries = [
|
||||
"什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
|
||||
"什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
|
||||
"What is LEANN and how does it work?",
|
||||
"华为诺亚方舟实验室的主要研究内容",
|
||||
]
|
||||
@@ -304,21 +300,15 @@ def main():
|
||||
|
||||
print("\nLEANN vs Faiss Performance:")
|
||||
memory_saving = faiss_results["peak_memory"] - leann_results["peak_memory"]
|
||||
print(
|
||||
f" Search Memory: {memory_ratio:.1f}x less ({memory_saving:.1f} MB saved)"
|
||||
)
|
||||
print(f" Search Memory: {memory_ratio:.1f}x less ({memory_saving:.1f} MB saved)")
|
||||
|
||||
# Storage comparison
|
||||
if leann_storage_size > faiss_storage_size:
|
||||
storage_ratio = leann_storage_size / faiss_storage_size
|
||||
print(
|
||||
f" Storage Size: {storage_ratio:.1f}x larger (LEANN uses more storage)"
|
||||
)
|
||||
print(f" Storage Size: {storage_ratio:.1f}x larger (LEANN uses more storage)")
|
||||
elif faiss_storage_size > leann_storage_size:
|
||||
storage_ratio = faiss_storage_size / leann_storage_size
|
||||
print(
|
||||
f" Storage Size: {storage_ratio:.1f}x smaller (LEANN uses less storage)"
|
||||
)
|
||||
print(f" Storage Size: {storage_ratio:.1f}x smaller (LEANN uses less storage)")
|
||||
else:
|
||||
print(" Storage Size: similar")
|
||||
else:
|
||||
|
||||
@@ -3,37 +3,44 @@
|
||||
Document search demo with recompute mode
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Import backend packages to trigger plugin registration
|
||||
try:
|
||||
import leann_backend_diskann
|
||||
import leann_backend_hnsw
|
||||
import leann_backend_diskann # noqa: F401
|
||||
import leann_backend_hnsw # noqa: F401
|
||||
|
||||
print("INFO: Backend packages imported successfully.")
|
||||
except ImportError as e:
|
||||
print(f"WARNING: Could not import backend packages. Error: {e}")
|
||||
|
||||
# Import upper-level API from leann-core
|
||||
from leann.api import LeannBuilder, LeannSearcher, LeannChat
|
||||
from leann.api import LeannBuilder, LeannChat, LeannSearcher
|
||||
|
||||
|
||||
def load_sample_documents():
|
||||
"""Create sample documents for demonstration"""
|
||||
docs = [
|
||||
{"title": "Intro to Python", "content": "Python is a high-level, interpreted language known for simplicity."},
|
||||
{
|
||||
"title": "Intro to Python",
|
||||
"content": "Python is a high-level, interpreted language known for simplicity.",
|
||||
},
|
||||
{"title": "ML Basics", "content": "Machine learning builds systems that learn from data."},
|
||||
{"title": "Data Structures", "content": "Data structures like arrays, lists, and graphs organize data."},
|
||||
{
|
||||
"title": "Data Structures",
|
||||
"content": "Data structures like arrays, lists, and graphs organize data.",
|
||||
},
|
||||
]
|
||||
return docs
|
||||
|
||||
|
||||
def main():
|
||||
print("==========================================================")
|
||||
print("=== Leann Document Search Demo (DiskANN + Recompute) ===")
|
||||
print("==========================================================")
|
||||
|
||||
|
||||
INDEX_DIR = Path("./test_indices")
|
||||
INDEX_PATH = str(INDEX_DIR / "documents.diskann")
|
||||
BACKEND_TO_TEST = "diskann"
|
||||
@@ -44,94 +51,96 @@ def main():
|
||||
|
||||
# --- 1. Build index ---
|
||||
print(f"\n[PHASE 1] Building index using '{BACKEND_TO_TEST}' backend...")
|
||||
|
||||
builder = LeannBuilder(
|
||||
backend_name=BACKEND_TO_TEST,
|
||||
graph_degree=32,
|
||||
complexity=64
|
||||
)
|
||||
|
||||
|
||||
builder = LeannBuilder(backend_name=BACKEND_TO_TEST, graph_degree=32, complexity=64)
|
||||
|
||||
documents = load_sample_documents()
|
||||
print(f"Loaded {len(documents)} sample documents.")
|
||||
for doc in documents:
|
||||
builder.add_text(doc["content"], metadata={"title": doc["title"]})
|
||||
|
||||
|
||||
builder.build_index(INDEX_PATH)
|
||||
print(f"\nIndex built!")
|
||||
print("\nIndex built!")
|
||||
|
||||
# --- 2. Basic search demo ---
|
||||
print(f"\n[PHASE 2] Basic search using '{BACKEND_TO_TEST}' backend...")
|
||||
searcher = LeannSearcher(index_path=INDEX_PATH)
|
||||
|
||||
|
||||
query = "What is machine learning?"
|
||||
print(f"\nQuery: '{query}'")
|
||||
|
||||
|
||||
print("\n--- Basic search mode (PQ computation) ---")
|
||||
start_time = time.time()
|
||||
results = searcher.search(query, top_k=2)
|
||||
basic_time = time.time() - start_time
|
||||
|
||||
|
||||
print(f"⏱️ Basic search time: {basic_time:.3f} seconds")
|
||||
print(">>> Basic search results <<<")
|
||||
for i, res in enumerate(results, 1):
|
||||
print(f" {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}")
|
||||
print(
|
||||
f" {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}"
|
||||
)
|
||||
|
||||
# --- 3. Recompute search demo ---
|
||||
print(f"\n[PHASE 3] Recompute search using embedding server...")
|
||||
|
||||
print("\n[PHASE 3] Recompute search using embedding server...")
|
||||
|
||||
print("\n--- Recompute search mode (get real embeddings via network) ---")
|
||||
|
||||
|
||||
# Configure recompute parameters
|
||||
recompute_params = {
|
||||
"recompute_beighbor_embeddings": True, # Enable network recomputation
|
||||
"USE_DEFERRED_FETCH": False, # Don't use deferred fetch
|
||||
"skip_search_reorder": True, # Skip search reordering
|
||||
"dedup_node_dis": True, # Enable node distance deduplication
|
||||
"prune_ratio": 0.1, # Pruning ratio 10%
|
||||
"batch_recompute": False, # Don't use batch recomputation
|
||||
"global_pruning": False, # Don't use global pruning
|
||||
"zmq_port": 5555, # ZMQ port
|
||||
"embedding_model": "sentence-transformers/all-mpnet-base-v2"
|
||||
"USE_DEFERRED_FETCH": False, # Don't use deferred fetch
|
||||
"skip_search_reorder": True, # Skip search reordering
|
||||
"dedup_node_dis": True, # Enable node distance deduplication
|
||||
"prune_ratio": 0.1, # Pruning ratio 10%
|
||||
"batch_recompute": False, # Don't use batch recomputation
|
||||
"global_pruning": False, # Don't use global pruning
|
||||
"zmq_port": 5555, # ZMQ port
|
||||
"embedding_model": "sentence-transformers/all-mpnet-base-v2",
|
||||
}
|
||||
|
||||
|
||||
print("Recompute parameter configuration:")
|
||||
for key, value in recompute_params.items():
|
||||
print(f" {key}: {value}")
|
||||
|
||||
print(f"\n🔄 Executing Recompute search...")
|
||||
|
||||
print("\n🔄 Executing Recompute search...")
|
||||
try:
|
||||
start_time = time.time()
|
||||
recompute_results = searcher.search(query, top_k=2, **recompute_params)
|
||||
recompute_time = time.time() - start_time
|
||||
|
||||
|
||||
print(f"⏱️ Recompute search time: {recompute_time:.3f} seconds")
|
||||
print(">>> Recompute search results <<<")
|
||||
for i, res in enumerate(recompute_results, 1):
|
||||
print(f" {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}")
|
||||
|
||||
print(
|
||||
f" {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}"
|
||||
)
|
||||
|
||||
# Compare results
|
||||
print(f"\n--- Result comparison ---")
|
||||
print("\n--- Result comparison ---")
|
||||
print(f"Basic search time: {basic_time:.3f} seconds")
|
||||
print(f"Recompute time: {recompute_time:.3f} seconds")
|
||||
|
||||
|
||||
print("\nBasic search vs Recompute results:")
|
||||
for i in range(min(len(results), len(recompute_results))):
|
||||
basic_score = results[i].score
|
||||
recompute_score = recompute_results[i].score
|
||||
score_diff = abs(basic_score - recompute_score)
|
||||
print(f" Position {i+1}: PQ={basic_score:.4f}, Recompute={recompute_score:.4f}, Difference={score_diff:.4f}")
|
||||
|
||||
print(
|
||||
f" Position {i + 1}: PQ={basic_score:.4f}, Recompute={recompute_score:.4f}, Difference={score_diff:.4f}"
|
||||
)
|
||||
|
||||
if recompute_time > basic_time:
|
||||
print(f"✅ Recompute mode working correctly (more accurate but slower)")
|
||||
print("✅ Recompute mode working correctly (more accurate but slower)")
|
||||
else:
|
||||
print(f"ℹ️ Recompute time is unusually fast, network recomputation may not be enabled")
|
||||
|
||||
print("i️ Recompute time is unusually fast, network recomputation may not be enabled")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Recompute search failed: {e}")
|
||||
print("This usually indicates an embedding server connection issue")
|
||||
|
||||
# --- 4. Chat demo ---
|
||||
print(f"\n[PHASE 4] Starting chat session...")
|
||||
print("\n[PHASE 4] Starting chat session...")
|
||||
chat = LeannChat(index_path=INDEX_PATH)
|
||||
chat_response = chat.ask(query)
|
||||
print(f"You: {query}")
|
||||
@@ -143,4 +152,4 @@ def main():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
import os
|
||||
import email
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Any
|
||||
from typing import Any
|
||||
|
||||
from llama_index.core import Document
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
|
||||
def find_all_messages_directories(root: str = None) -> List[Path]:
|
||||
|
||||
def find_all_messages_directories(root: str | None = None) -> list[Path]:
|
||||
"""
|
||||
Recursively find all 'Messages' directories under the given root.
|
||||
Returns a list of Path objects.
|
||||
@@ -14,86 +16,97 @@ def find_all_messages_directories(root: str = None) -> List[Path]:
|
||||
# Auto-detect user's mail path
|
||||
home_dir = os.path.expanduser("~")
|
||||
root = os.path.join(home_dir, "Library", "Mail")
|
||||
|
||||
|
||||
messages_dirs = []
|
||||
for dirpath, dirnames, filenames in os.walk(root):
|
||||
for dirpath, _dirnames, _filenames in os.walk(root):
|
||||
if os.path.basename(dirpath) == "Messages":
|
||||
messages_dirs.append(Path(dirpath))
|
||||
return messages_dirs
|
||||
|
||||
|
||||
class EmlxReader(BaseReader):
|
||||
"""
|
||||
Apple Mail .emlx file reader with embedded metadata.
|
||||
|
||||
|
||||
Reads individual .emlx files from Apple Mail's storage format.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self, include_html: bool = False) -> None:
|
||||
"""
|
||||
Initialize.
|
||||
|
||||
|
||||
Args:
|
||||
include_html: Whether to include HTML content in the email body (default: False)
|
||||
"""
|
||||
self.include_html = include_html
|
||||
|
||||
def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]:
|
||||
|
||||
def load_data(self, input_dir: str, **load_kwargs: Any) -> list[Document]:
|
||||
"""
|
||||
Load data from the input directory containing .emlx files.
|
||||
|
||||
|
||||
Args:
|
||||
input_dir: Directory containing .emlx files
|
||||
**load_kwargs:
|
||||
max_count (int): Maximum amount of messages to read.
|
||||
"""
|
||||
docs: List[Document] = []
|
||||
max_count = load_kwargs.get('max_count', 1000)
|
||||
docs: list[Document] = []
|
||||
max_count = load_kwargs.get("max_count", 1000)
|
||||
count = 0
|
||||
|
||||
|
||||
# Walk through the directory recursively
|
||||
for dirpath, dirnames, filenames in os.walk(input_dir):
|
||||
# Skip hidden directories
|
||||
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
|
||||
|
||||
|
||||
for filename in filenames:
|
||||
if count >= max_count:
|
||||
break
|
||||
|
||||
|
||||
if filename.endswith(".emlx"):
|
||||
filepath = os.path.join(dirpath, filename)
|
||||
try:
|
||||
# Read the .emlx file
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
with open(filepath, encoding="utf-8", errors="ignore") as f:
|
||||
content = f.read()
|
||||
|
||||
|
||||
# .emlx files have a length prefix followed by the email content
|
||||
# The first line contains the length, followed by the email
|
||||
lines = content.split('\n', 1)
|
||||
lines = content.split("\n", 1)
|
||||
if len(lines) >= 2:
|
||||
email_content = lines[1]
|
||||
|
||||
|
||||
# Parse the email using Python's email module
|
||||
try:
|
||||
msg = email.message_from_string(email_content)
|
||||
|
||||
|
||||
# Extract email metadata
|
||||
subject = msg.get('Subject', 'No Subject')
|
||||
from_addr = msg.get('From', 'Unknown')
|
||||
to_addr = msg.get('To', 'Unknown')
|
||||
date = msg.get('Date', 'Unknown')
|
||||
|
||||
subject = msg.get("Subject", "No Subject")
|
||||
from_addr = msg.get("From", "Unknown")
|
||||
to_addr = msg.get("To", "Unknown")
|
||||
date = msg.get("Date", "Unknown")
|
||||
|
||||
# Extract email body
|
||||
body = ""
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
if part.get_content_type() == "text/plain" or part.get_content_type() == "text/html":
|
||||
if part.get_content_type() == "text/html" and not self.include_html:
|
||||
if (
|
||||
part.get_content_type() == "text/plain"
|
||||
or part.get_content_type() == "text/html"
|
||||
):
|
||||
if (
|
||||
part.get_content_type() == "text/html"
|
||||
and not self.include_html
|
||||
):
|
||||
continue
|
||||
body += part.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
body += part.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
# break
|
||||
else:
|
||||
body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
|
||||
body = msg.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
|
||||
# Create document content with metadata embedded in text
|
||||
doc_content = f"""
|
||||
[File]: {filename}
|
||||
@@ -104,19 +117,19 @@ class EmlxReader(BaseReader):
|
||||
[EMAIL BODY Start]:
|
||||
{body}
|
||||
"""
|
||||
|
||||
|
||||
# No separate metadata - everything is in the text
|
||||
doc = Document(text=doc_content, metadata={})
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error parsing email from {filepath}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading file {filepath}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
print(f"Loaded {len(docs)} email documents")
|
||||
return docs
|
||||
return docs
|
||||
|
||||
@@ -7,9 +7,9 @@ Contains simple parser for mbox files.
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from fsspec import AbstractFileSystem
|
||||
from typing import Any
|
||||
|
||||
from fsspec import AbstractFileSystem
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
from llama_index.core.schema import Document
|
||||
|
||||
@@ -27,11 +27,7 @@ class MboxReader(BaseReader):
|
||||
"""
|
||||
|
||||
DEFAULT_MESSAGE_FORMAT: str = (
|
||||
"Date: {_date}\n"
|
||||
"From: {_from}\n"
|
||||
"To: {_to}\n"
|
||||
"Subject: {_subject}\n"
|
||||
"Content: {_content}"
|
||||
"Date: {_date}\nFrom: {_from}\nTo: {_to}\nSubject: {_subject}\nContent: {_content}"
|
||||
)
|
||||
|
||||
def __init__(
|
||||
@@ -45,9 +41,7 @@ class MboxReader(BaseReader):
|
||||
try:
|
||||
from bs4 import BeautifulSoup # noqa
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"`beautifulsoup4` package not found: `pip install beautifulsoup4`"
|
||||
)
|
||||
raise ImportError("`beautifulsoup4` package not found: `pip install beautifulsoup4`")
|
||||
|
||||
super().__init__(*args, **kwargs)
|
||||
self.max_count = max_count
|
||||
@@ -56,9 +50,9 @@ class MboxReader(BaseReader):
|
||||
def load_data(
|
||||
self,
|
||||
file: Path,
|
||||
extra_info: Optional[Dict] = None,
|
||||
fs: Optional[AbstractFileSystem] = None,
|
||||
) -> List[Document]:
|
||||
extra_info: dict | None = None,
|
||||
fs: AbstractFileSystem | None = None,
|
||||
) -> list[Document]:
|
||||
"""Parse file into string."""
|
||||
# Import required libraries
|
||||
import mailbox
|
||||
@@ -74,7 +68,7 @@ class MboxReader(BaseReader):
|
||||
)
|
||||
|
||||
i = 0
|
||||
results: List[str] = []
|
||||
results: list[str] = []
|
||||
# Load file using mailbox
|
||||
bytes_parser = BytesParser(policy=default).parse
|
||||
mbox = mailbox.mbox(file, factory=bytes_parser) # type: ignore
|
||||
@@ -124,7 +118,7 @@ class MboxReader(BaseReader):
|
||||
class EmlxMboxReader(MboxReader):
|
||||
"""
|
||||
EmlxMboxReader - Modified MboxReader that handles directories of .emlx files.
|
||||
|
||||
|
||||
Extends MboxReader to work with Apple Mail's .emlx format by:
|
||||
1. Reading .emlx files from a directory
|
||||
2. Converting them to mbox format in memory
|
||||
@@ -134,13 +128,13 @@ class EmlxMboxReader(MboxReader):
|
||||
def load_data(
|
||||
self,
|
||||
directory: Path,
|
||||
extra_info: Optional[Dict] = None,
|
||||
fs: Optional[AbstractFileSystem] = None,
|
||||
) -> List[Document]:
|
||||
extra_info: dict | None = None,
|
||||
fs: AbstractFileSystem | None = None,
|
||||
) -> list[Document]:
|
||||
"""Parse .emlx files from directory into strings using MboxReader logic."""
|
||||
import tempfile
|
||||
import os
|
||||
|
||||
import tempfile
|
||||
|
||||
if fs:
|
||||
logger.warning(
|
||||
"fs was specified but EmlxMboxReader doesn't support loading "
|
||||
@@ -150,37 +144,37 @@ class EmlxMboxReader(MboxReader):
|
||||
# Find all .emlx files in the directory
|
||||
emlx_files = list(directory.glob("*.emlx"))
|
||||
logger.info(f"Found {len(emlx_files)} .emlx files in {directory}")
|
||||
|
||||
|
||||
if not emlx_files:
|
||||
logger.warning(f"No .emlx files found in {directory}")
|
||||
return []
|
||||
|
||||
# Create a temporary mbox file
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.mbox', delete=False) as temp_mbox:
|
||||
with tempfile.NamedTemporaryFile(mode="w", suffix=".mbox", delete=False) as temp_mbox:
|
||||
temp_mbox_path = temp_mbox.name
|
||||
|
||||
|
||||
# Convert .emlx files to mbox format
|
||||
for emlx_file in emlx_files:
|
||||
try:
|
||||
# Read the .emlx file
|
||||
with open(emlx_file, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
with open(emlx_file, encoding="utf-8", errors="ignore") as f:
|
||||
content = f.read()
|
||||
|
||||
|
||||
# .emlx format: first line is length, rest is email content
|
||||
lines = content.split('\n', 1)
|
||||
lines = content.split("\n", 1)
|
||||
if len(lines) >= 2:
|
||||
email_content = lines[1] # Skip the length line
|
||||
|
||||
|
||||
# Write to mbox format (each message starts with "From " and ends with blank line)
|
||||
temp_mbox.write(f"From {emlx_file.name} {email_content}\n\n")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to process {emlx_file}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
# Close the temporary file so MboxReader can read it
|
||||
temp_mbox.close()
|
||||
|
||||
|
||||
try:
|
||||
# Use the parent MboxReader's logic to parse the mbox file
|
||||
return super().load_data(Path(temp_mbox_path), extra_info, fs)
|
||||
@@ -188,5 +182,5 @@ class EmlxMboxReader(MboxReader):
|
||||
# Clean up temporary file
|
||||
try:
|
||||
os.unlink(temp_mbox_path)
|
||||
except:
|
||||
pass
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test only Faiss HNSW"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
import psutil
|
||||
import gc
|
||||
import os
|
||||
|
||||
|
||||
def get_memory_usage():
|
||||
@@ -37,20 +37,20 @@ def main():
|
||||
import faiss
|
||||
except ImportError:
|
||||
print("Faiss is not installed.")
|
||||
print("Please install it with `uv pip install faiss-cpu` and you can then run this script again")
|
||||
print(
|
||||
"Please install it with `uv pip install faiss-cpu` and you can then run this script again"
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
from llama_index.core import (
|
||||
SimpleDirectoryReader,
|
||||
VectorStoreIndex,
|
||||
StorageContext,
|
||||
Settings,
|
||||
node_parser,
|
||||
Document,
|
||||
SimpleDirectoryReader,
|
||||
StorageContext,
|
||||
VectorStoreIndex,
|
||||
)
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from llama_index.vector_stores.faiss import FaissVectorStore
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
from llama_index.vector_stores.faiss import FaissVectorStore
|
||||
|
||||
tracker = MemoryTracker("Faiss HNSW")
|
||||
tracker.checkpoint("Initial")
|
||||
@@ -90,8 +90,9 @@ def main():
|
||||
vector_store=vector_store, persist_dir="./storage_faiss"
|
||||
)
|
||||
from llama_index.core import load_index_from_storage
|
||||
|
||||
index = load_index_from_storage(storage_context=storage_context)
|
||||
print(f"Index loaded from ./storage_faiss")
|
||||
print("Index loaded from ./storage_faiss")
|
||||
tracker.checkpoint("After loading existing index")
|
||||
index_loaded = True
|
||||
except Exception as e:
|
||||
@@ -99,19 +100,18 @@ def main():
|
||||
print("Cleaning up corrupted index and building new one...")
|
||||
# Clean up corrupted index
|
||||
import shutil
|
||||
|
||||
if os.path.exists("./storage_faiss"):
|
||||
shutil.rmtree("./storage_faiss")
|
||||
|
||||
|
||||
if not index_loaded:
|
||||
print("Building new Faiss HNSW index...")
|
||||
|
||||
|
||||
# Use the correct Faiss building pattern from the example
|
||||
vector_store = FaissVectorStore(faiss_index=faiss_index)
|
||||
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
||||
index = VectorStoreIndex.from_documents(
|
||||
documents,
|
||||
storage_context=storage_context,
|
||||
transformations=[node_parser]
|
||||
documents, storage_context=storage_context, transformations=[node_parser]
|
||||
)
|
||||
tracker.checkpoint("After index building")
|
||||
|
||||
@@ -124,10 +124,10 @@ def main():
|
||||
runtime_start_mem = get_memory_usage()
|
||||
print(f"Before load memory: {runtime_start_mem:.1f} MB")
|
||||
tracker.checkpoint("Before load memory")
|
||||
|
||||
|
||||
query_engine = index.as_query_engine(similarity_top_k=20)
|
||||
queries = [
|
||||
"什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
|
||||
"什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
|
||||
"What is LEANN and how does it work?",
|
||||
"华为诺亚方舟实验室的主要研究内容",
|
||||
]
|
||||
@@ -141,7 +141,7 @@ def main():
|
||||
|
||||
runtime_end_mem = get_memory_usage()
|
||||
runtime_overhead = runtime_end_mem - runtime_start_mem
|
||||
|
||||
|
||||
peak_memory = tracker.summary()
|
||||
print(f"Peak Memory: {peak_memory:.1f} MB")
|
||||
print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
import os
|
||||
import asyncio
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
try:
|
||||
import dotenv
|
||||
|
||||
dotenv.load_dotenv()
|
||||
except ModuleNotFoundError:
|
||||
# python-dotenv is not installed; skip loading environment variables
|
||||
dotenv = None
|
||||
from pathlib import Path
|
||||
from typing import List, Any
|
||||
from leann.api import LeannBuilder, LeannSearcher, LeannChat
|
||||
|
||||
from leann.api import LeannBuilder, LeannChat
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
# dotenv.load_dotenv() # handled above if python-dotenv is available
|
||||
@@ -17,42 +19,45 @@ from llama_index.core.node_parser import SentenceSplitter
|
||||
# Default Chrome profile path
|
||||
DEFAULT_CHROME_PROFILE = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
|
||||
|
||||
def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], index_path: str = "chrome_history_index.leann", max_count: int = -1):
|
||||
|
||||
def create_leann_index_from_multiple_chrome_profiles(
|
||||
profile_dirs: list[Path], index_path: str = "chrome_history_index.leann", max_count: int = -1
|
||||
):
|
||||
"""
|
||||
Create LEANN index from multiple Chrome profile data sources.
|
||||
|
||||
|
||||
Args:
|
||||
profile_dirs: List of Path objects pointing to Chrome profile directories
|
||||
index_path: Path to save the LEANN index
|
||||
max_count: Maximum number of history entries to process per profile
|
||||
"""
|
||||
print("Creating LEANN index from multiple Chrome profile data sources...")
|
||||
|
||||
|
||||
# Load documents using ChromeHistoryReader from history_data
|
||||
from history_data.history import ChromeHistoryReader
|
||||
|
||||
reader = ChromeHistoryReader()
|
||||
|
||||
|
||||
INDEX_DIR = Path(index_path).parent
|
||||
|
||||
|
||||
if not INDEX_DIR.exists():
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
all_documents = []
|
||||
total_processed = 0
|
||||
|
||||
|
||||
# Process each Chrome profile directory
|
||||
for i, profile_dir in enumerate(profile_dirs):
|
||||
print(f"\nProcessing Chrome profile {i+1}/{len(profile_dirs)}: {profile_dir}")
|
||||
|
||||
print(f"\nProcessing Chrome profile {i + 1}/{len(profile_dirs)}: {profile_dir}")
|
||||
|
||||
try:
|
||||
documents = reader.load_data(
|
||||
chrome_profile_path=str(profile_dir),
|
||||
max_count=max_count
|
||||
chrome_profile_path=str(profile_dir), max_count=max_count
|
||||
)
|
||||
if documents:
|
||||
print(f"Loaded {len(documents)} history documents from {profile_dir}")
|
||||
all_documents.extend(documents)
|
||||
total_processed += len(documents)
|
||||
|
||||
|
||||
# Check if we've reached the max count
|
||||
if max_count > 0 and total_processed >= max_count:
|
||||
print(f"Reached max count of {max_count} documents")
|
||||
@@ -62,18 +67,22 @@ def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], i
|
||||
except Exception as e:
|
||||
print(f"Error processing {profile_dir}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
if not all_documents:
|
||||
print("No documents loaded from any source. Exiting.")
|
||||
# Print a highlighted (red) reminder that all Chrome browsers must be closed or quit before running this script.
|
||||
print("\033[91mYou need to close or quit all chrome browser before running this script\033[0m")
|
||||
print(
|
||||
"\033[91mYou need to close or quit all chrome browser before running this script\033[0m"
|
||||
)
|
||||
return None
|
||||
|
||||
print(f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles")
|
||||
|
||||
|
||||
print(
|
||||
f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles"
|
||||
)
|
||||
|
||||
# Create text splitter with 256 chunk size
|
||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
|
||||
|
||||
|
||||
# Convert Documents to text strings and chunk them
|
||||
all_texts = []
|
||||
for doc in all_documents:
|
||||
@@ -83,43 +92,48 @@ def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], i
|
||||
text = node.get_content()
|
||||
# text = '[Title] ' + doc.metadata["title"] + '\n' + text
|
||||
all_texts.append(text)
|
||||
|
||||
|
||||
print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
|
||||
|
||||
|
||||
# Create LEANN index directory
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Use HNSW backend for better macOS compatibility
|
||||
builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
embedding_model="facebook/contriever",
|
||||
graph_degree=32,
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
is_compact=True,
|
||||
is_recompute=True,
|
||||
num_threads=1 # Force single-threaded mode
|
||||
num_threads=1, # Force single-threaded mode
|
||||
)
|
||||
|
||||
print(f"Adding {len(all_texts)} history chunks to index...")
|
||||
for chunk_text in all_texts:
|
||||
builder.add_text(chunk_text)
|
||||
|
||||
|
||||
builder.build_index(index_path)
|
||||
print(f"\nLEANN index built at {index_path}!")
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
|
||||
return index_path
|
||||
|
||||
def create_leann_index(profile_path: str = None, index_path: str = "chrome_history_index.leann", max_count: int = 1000):
|
||||
|
||||
def create_leann_index(
|
||||
profile_path: str | None = None,
|
||||
index_path: str = "chrome_history_index.leann",
|
||||
max_count: int = 1000,
|
||||
):
|
||||
"""
|
||||
Create LEANN index from Chrome history data.
|
||||
|
||||
|
||||
Args:
|
||||
profile_path: Path to the Chrome profile directory (optional, uses default if None)
|
||||
index_path: Path to save the LEANN index
|
||||
@@ -127,33 +141,31 @@ def create_leann_index(profile_path: str = None, index_path: str = "chrome_histo
|
||||
"""
|
||||
print("Creating LEANN index from Chrome history data...")
|
||||
INDEX_DIR = Path(index_path).parent
|
||||
|
||||
|
||||
if not INDEX_DIR.exists():
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Load documents using ChromeHistoryReader from history_data
|
||||
from history_data.history import ChromeHistoryReader
|
||||
|
||||
reader = ChromeHistoryReader()
|
||||
|
||||
documents = reader.load_data(
|
||||
chrome_profile_path=profile_path,
|
||||
max_count=max_count
|
||||
)
|
||||
|
||||
|
||||
documents = reader.load_data(chrome_profile_path=profile_path, max_count=max_count)
|
||||
|
||||
if not documents:
|
||||
print("No documents loaded. Exiting.")
|
||||
return None
|
||||
|
||||
|
||||
print(f"Loaded {len(documents)} history documents")
|
||||
|
||||
|
||||
# Create text splitter with 256 chunk size
|
||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
|
||||
|
||||
|
||||
# Convert Documents to text strings and chunk them
|
||||
all_texts = []
|
||||
for doc in documents:
|
||||
@@ -161,54 +173,55 @@ def create_leann_index(profile_path: str = None, index_path: str = "chrome_histo
|
||||
nodes = text_splitter.get_nodes_from_documents([doc])
|
||||
for node in nodes:
|
||||
all_texts.append(node.get_content())
|
||||
|
||||
|
||||
print(f"Created {len(all_texts)} text chunks from {len(documents)} documents")
|
||||
|
||||
|
||||
# Create LEANN index directory
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Use HNSW backend for better macOS compatibility
|
||||
builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
embedding_model="facebook/contriever",
|
||||
graph_degree=32,
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
is_compact=True,
|
||||
is_recompute=True,
|
||||
num_threads=1 # Force single-threaded mode
|
||||
num_threads=1, # Force single-threaded mode
|
||||
)
|
||||
|
||||
print(f"Adding {len(all_texts)} history chunks to index...")
|
||||
for chunk_text in all_texts:
|
||||
builder.add_text(chunk_text)
|
||||
|
||||
|
||||
builder.build_index(index_path)
|
||||
print(f"\nLEANN index built at {index_path}!")
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
|
||||
return index_path
|
||||
|
||||
|
||||
async def query_leann_index(index_path: str, query: str):
|
||||
"""
|
||||
Query the LEANN index.
|
||||
|
||||
|
||||
Args:
|
||||
index_path: Path to the LEANN index
|
||||
query: The query string
|
||||
"""
|
||||
print(f"\n[PHASE 2] Starting Leann chat session...")
|
||||
print("\n[PHASE 2] Starting Leann chat session...")
|
||||
chat = LeannChat(index_path=index_path)
|
||||
|
||||
|
||||
print(f"You: {query}")
|
||||
chat_response = chat.ask(
|
||||
query,
|
||||
top_k=10,
|
||||
query,
|
||||
top_k=10,
|
||||
recompute_beighbor_embeddings=True,
|
||||
complexity=32,
|
||||
beam_width=1,
|
||||
@@ -217,40 +230,60 @@ async def query_leann_index(index_path: str, query: str):
|
||||
"model": "gpt-4o",
|
||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||
},
|
||||
llm_kwargs={
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 1000
|
||||
}
|
||||
llm_kwargs={"temperature": 0.0, "max_tokens": 1000},
|
||||
)
|
||||
|
||||
print(f"Leann chat response: \033[36m{chat_response}\033[0m")
|
||||
|
||||
|
||||
async def main():
|
||||
# Parse command line arguments
|
||||
parser = argparse.ArgumentParser(description='LEANN Chrome History Reader - Create and query browser history index')
|
||||
parser.add_argument('--chrome-profile', type=str, default=DEFAULT_CHROME_PROFILE,
|
||||
help=f'Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this')
|
||||
parser.add_argument('--index-dir', type=str, default="./google_history_index",
|
||||
help='Directory to store the LEANN index (default: ./chrome_history_index_leann_test)')
|
||||
parser.add_argument('--max-entries', type=int, default=1000,
|
||||
help='Maximum number of history entries to process (default: 1000)')
|
||||
parser.add_argument('--query', type=str, default=None,
|
||||
help='Single query to run (default: runs example queries)')
|
||||
parser.add_argument('--auto-find-profiles', action='store_true', default=True,
|
||||
help='Automatically find all Chrome profiles (default: True)')
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="LEANN Chrome History Reader - Create and query browser history index"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chrome-profile",
|
||||
type=str,
|
||||
default=DEFAULT_CHROME_PROFILE,
|
||||
help=f"Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index-dir",
|
||||
type=str,
|
||||
default="./google_history_index",
|
||||
help="Directory to store the LEANN index (default: ./chrome_history_index_leann_test)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-entries",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Maximum number of history entries to process (default: 1000)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--query",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Single query to run (default: runs example queries)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--auto-find-profiles",
|
||||
action="store_true",
|
||||
default=True,
|
||||
help="Automatically find all Chrome profiles (default: True)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
INDEX_DIR = Path(args.index_dir)
|
||||
INDEX_PATH = str(INDEX_DIR / "chrome_history.leann")
|
||||
|
||||
|
||||
print(f"Using Chrome profile: {args.chrome_profile}")
|
||||
print(f"Index directory: {INDEX_DIR}")
|
||||
print(f"Max entries: {args.max_entries}")
|
||||
|
||||
|
||||
# Find Chrome profile directories
|
||||
from history_data.history import ChromeHistoryReader
|
||||
|
||||
|
||||
if args.auto_find_profiles:
|
||||
profile_dirs = ChromeHistoryReader.find_chrome_profiles()
|
||||
if not profile_dirs:
|
||||
@@ -263,10 +296,12 @@ async def main():
|
||||
print(f"Chrome profile not found: {profile_path}")
|
||||
return
|
||||
profile_dirs = [profile_path]
|
||||
|
||||
|
||||
# Create or load the LEANN index from all sources
|
||||
index_path = create_leann_index_from_multiple_chrome_profiles(profile_dirs, INDEX_PATH, args.max_entries)
|
||||
|
||||
index_path = create_leann_index_from_multiple_chrome_profiles(
|
||||
profile_dirs, INDEX_PATH, args.max_entries
|
||||
)
|
||||
|
||||
if index_path:
|
||||
if args.query:
|
||||
# Run single query
|
||||
@@ -275,12 +310,13 @@ async def main():
|
||||
# Example queries
|
||||
queries = [
|
||||
"What websites did I visit about machine learning?",
|
||||
"Find my search history about programming"
|
||||
"Find my search history about programming",
|
||||
]
|
||||
|
||||
|
||||
for query in queries:
|
||||
print("\n" + "="*60)
|
||||
print("\n" + "=" * 60)
|
||||
await query_leann_index(index_path, query)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
from .history import ChromeHistoryReader
|
||||
|
||||
__all__ = ['ChromeHistoryReader']
|
||||
__all__ = ["ChromeHistoryReader"]
|
||||
|
||||
@@ -1,77 +1,81 @@
|
||||
import sqlite3
|
||||
import os
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import List, Any
|
||||
from typing import Any
|
||||
|
||||
from llama_index.core import Document
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
|
||||
|
||||
class ChromeHistoryReader(BaseReader):
|
||||
"""
|
||||
Chrome browser history reader that extracts browsing data from SQLite database.
|
||||
|
||||
|
||||
Reads Chrome history from the default Chrome profile location and creates documents
|
||||
with embedded metadata similar to the email reader structure.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize."""
|
||||
pass
|
||||
|
||||
def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
|
||||
|
||||
def load_data(self, input_dir: str | None = None, **load_kwargs: Any) -> list[Document]:
|
||||
"""
|
||||
Load Chrome history data from the default Chrome profile location.
|
||||
|
||||
|
||||
Args:
|
||||
input_dir: Not used for Chrome history (kept for compatibility)
|
||||
**load_kwargs:
|
||||
max_count (int): Maximum amount of history entries to read.
|
||||
chrome_profile_path (str): Custom path to Chrome profile directory.
|
||||
"""
|
||||
docs: List[Document] = []
|
||||
max_count = load_kwargs.get('max_count', 1000)
|
||||
chrome_profile_path = load_kwargs.get('chrome_profile_path', None)
|
||||
|
||||
docs: list[Document] = []
|
||||
max_count = load_kwargs.get("max_count", 1000)
|
||||
chrome_profile_path = load_kwargs.get("chrome_profile_path", None)
|
||||
|
||||
# Default Chrome profile path on macOS
|
||||
if chrome_profile_path is None:
|
||||
chrome_profile_path = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
|
||||
|
||||
chrome_profile_path = os.path.expanduser(
|
||||
"~/Library/Application Support/Google/Chrome/Default"
|
||||
)
|
||||
|
||||
history_db_path = os.path.join(chrome_profile_path, "History")
|
||||
|
||||
|
||||
if not os.path.exists(history_db_path):
|
||||
print(f"Chrome history database not found at: {history_db_path}")
|
||||
return docs
|
||||
|
||||
|
||||
try:
|
||||
# Connect to the Chrome history database
|
||||
print(f"Connecting to database: {history_db_path}")
|
||||
conn = sqlite3.connect(history_db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
|
||||
# Query to get browsing history with metadata (removed created_time column)
|
||||
query = """
|
||||
SELECT
|
||||
SELECT
|
||||
datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime') as last_visit,
|
||||
url,
|
||||
title,
|
||||
visit_count,
|
||||
typed_count,
|
||||
url,
|
||||
title,
|
||||
visit_count,
|
||||
typed_count,
|
||||
hidden
|
||||
FROM urls
|
||||
FROM urls
|
||||
ORDER BY last_visit_time DESC
|
||||
"""
|
||||
|
||||
|
||||
print(f"Executing query on database: {history_db_path}")
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
print(f"Query returned {len(rows)} rows")
|
||||
|
||||
|
||||
count = 0
|
||||
for row in rows:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
|
||||
last_visit, url, title, visit_count, typed_count, hidden = row
|
||||
|
||||
|
||||
# Create document content with metadata embedded in text
|
||||
doc_content = f"""
|
||||
[Title]: {title}
|
||||
@@ -80,38 +84,38 @@ class ChromeHistoryReader(BaseReader):
|
||||
[Visit times]: {visit_count}
|
||||
[Typed times]: {typed_count}
|
||||
"""
|
||||
|
||||
|
||||
# Create document with embedded metadata
|
||||
doc = Document(text=doc_content, metadata={ "title": title[0:150]})
|
||||
doc = Document(text=doc_content, metadata={"title": title[0:150]})
|
||||
# if len(title) > 150:
|
||||
# print(f"Title is too long: {title}")
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
|
||||
conn.close()
|
||||
print(f"Loaded {len(docs)} Chrome history documents")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading Chrome history: {e}")
|
||||
return docs
|
||||
|
||||
|
||||
return docs
|
||||
|
||||
@staticmethod
|
||||
def find_chrome_profiles() -> List[Path]:
|
||||
def find_chrome_profiles() -> list[Path]:
|
||||
"""
|
||||
Find all Chrome profile directories.
|
||||
|
||||
|
||||
Returns:
|
||||
List of Path objects pointing to Chrome profile directories
|
||||
"""
|
||||
chrome_base_path = Path(os.path.expanduser("~/Library/Application Support/Google/Chrome"))
|
||||
profile_dirs = []
|
||||
|
||||
|
||||
if not chrome_base_path.exists():
|
||||
print(f"Chrome directory not found at: {chrome_base_path}")
|
||||
return profile_dirs
|
||||
|
||||
|
||||
# Find all profile directories
|
||||
for profile_dir in chrome_base_path.iterdir():
|
||||
if profile_dir.is_dir() and profile_dir.name != "System Profile":
|
||||
@@ -119,53 +123,59 @@ class ChromeHistoryReader(BaseReader):
|
||||
if history_path.exists():
|
||||
profile_dirs.append(profile_dir)
|
||||
print(f"Found Chrome profile: {profile_dir}")
|
||||
|
||||
|
||||
print(f"Found {len(profile_dirs)} Chrome profiles")
|
||||
return profile_dirs
|
||||
|
||||
@staticmethod
|
||||
def export_history_to_file(output_file: str = "chrome_history_export.txt", max_count: int = 1000):
|
||||
def export_history_to_file(
|
||||
output_file: str = "chrome_history_export.txt", max_count: int = 1000
|
||||
):
|
||||
"""
|
||||
Export Chrome history to a text file using the same SQL query format.
|
||||
|
||||
|
||||
Args:
|
||||
output_file: Path to the output file
|
||||
max_count: Maximum number of entries to export
|
||||
"""
|
||||
chrome_profile_path = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
|
||||
chrome_profile_path = os.path.expanduser(
|
||||
"~/Library/Application Support/Google/Chrome/Default"
|
||||
)
|
||||
history_db_path = os.path.join(chrome_profile_path, "History")
|
||||
|
||||
|
||||
if not os.path.exists(history_db_path):
|
||||
print(f"Chrome history database not found at: {history_db_path}")
|
||||
return
|
||||
|
||||
|
||||
try:
|
||||
conn = sqlite3.connect(history_db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
|
||||
query = """
|
||||
SELECT
|
||||
SELECT
|
||||
datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime') as last_visit,
|
||||
url,
|
||||
title,
|
||||
visit_count,
|
||||
typed_count,
|
||||
url,
|
||||
title,
|
||||
visit_count,
|
||||
typed_count,
|
||||
hidden
|
||||
FROM urls
|
||||
FROM urls
|
||||
ORDER BY last_visit_time DESC
|
||||
LIMIT ?
|
||||
"""
|
||||
|
||||
|
||||
cursor.execute(query, (max_count,))
|
||||
rows = cursor.fetchall()
|
||||
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
for row in rows:
|
||||
last_visit, url, title, visit_count, typed_count, hidden = row
|
||||
f.write(f"{last_visit}\t{url}\t{title}\t{visit_count}\t{typed_count}\t{hidden}\n")
|
||||
|
||||
f.write(
|
||||
f"{last_visit}\t{url}\t{title}\t{visit_count}\t{typed_count}\t{hidden}\n"
|
||||
)
|
||||
|
||||
conn.close()
|
||||
print(f"Exported {len(rows)} history entries to {output_file}")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error exporting Chrome history: {e}")
|
||||
print(f"Error exporting Chrome history: {e}")
|
||||
|
||||
@@ -2,30 +2,31 @@ import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Any, Dict, Optional
|
||||
from typing import Any
|
||||
|
||||
from llama_index.core import Document
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class WeChatHistoryReader(BaseReader):
|
||||
"""
|
||||
WeChat chat history reader that extracts chat data from exported JSON files.
|
||||
|
||||
|
||||
Reads WeChat chat history from exported JSON files (from wechat-exporter tool)
|
||||
and creates documents with embedded metadata similar to the Chrome history reader structure.
|
||||
|
||||
|
||||
Also includes utilities for automatic WeChat chat history export.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize."""
|
||||
self.packages_dir = Path(__file__).parent.parent.parent / "packages"
|
||||
self.wechat_exporter_dir = self.packages_dir / "wechat-exporter"
|
||||
self.wechat_decipher_dir = self.packages_dir / "wechat-decipher-macos"
|
||||
|
||||
|
||||
def check_wechat_running(self) -> bool:
|
||||
"""Check if WeChat is currently running."""
|
||||
try:
|
||||
@@ -33,24 +34,30 @@ class WeChatHistoryReader(BaseReader):
|
||||
return result.returncode == 0
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def install_wechattweak(self) -> bool:
|
||||
"""Install WeChatTweak CLI tool."""
|
||||
try:
|
||||
# Create wechat-exporter directory if it doesn't exist
|
||||
self.wechat_exporter_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
wechattweak_path = self.wechat_exporter_dir / "wechattweak-cli"
|
||||
if not wechattweak_path.exists():
|
||||
print("Downloading WeChatTweak CLI...")
|
||||
subprocess.run([
|
||||
"curl", "-L", "-o", str(wechattweak_path),
|
||||
"https://github.com/JettChenT/WeChatTweak-CLI/releases/latest/download/wechattweak-cli"
|
||||
], check=True)
|
||||
|
||||
subprocess.run(
|
||||
[
|
||||
"curl",
|
||||
"-L",
|
||||
"-o",
|
||||
str(wechattweak_path),
|
||||
"https://github.com/JettChenT/WeChatTweak-CLI/releases/latest/download/wechattweak-cli",
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
|
||||
# Make executable
|
||||
wechattweak_path.chmod(0o755)
|
||||
|
||||
|
||||
# Install WeChatTweak
|
||||
print("Installing WeChatTweak...")
|
||||
subprocess.run(["sudo", str(wechattweak_path), "install"], check=True)
|
||||
@@ -58,7 +65,7 @@ class WeChatHistoryReader(BaseReader):
|
||||
except Exception as e:
|
||||
print(f"Error installing WeChatTweak: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def restart_wechat(self):
|
||||
"""Restart WeChat to apply WeChatTweak."""
|
||||
try:
|
||||
@@ -69,302 +76,325 @@ class WeChatHistoryReader(BaseReader):
|
||||
time.sleep(5) # Wait for WeChat to start
|
||||
except Exception as e:
|
||||
print(f"Error restarting WeChat: {e}")
|
||||
|
||||
|
||||
def check_api_available(self) -> bool:
    """Check if WeChatTweak API is available.

    Runs ``curl -s`` against ``http://localhost:48065/wechat/allcontacts``
    with a 5 second timeout.

    Returns:
        True only when curl exits with status 0 and prints non-whitespace
        output; False otherwise, including when curl cannot be started or
        times out.
    """
    try:
        result = subprocess.run(
            ["curl", "-s", "http://localhost:48065/wechat/allcontacts"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except Exception:
        # Any failure to run the probe counts as "not available".
        return False

    # bool(): the bare `and` expression would hand the response body (a str)
    # back to callers instead of the declared bool.
    return result.returncode == 0 and bool(result.stdout.strip())
|
||||
|
||||
|
||||
|
||||
|
||||
def _extract_readable_text(self, content: str) -> str:
|
||||
"""
|
||||
Extract readable text from message content, removing XML and system messages.
|
||||
|
||||
|
||||
Args:
|
||||
content: The raw message content (can be string or dict)
|
||||
|
||||
|
||||
Returns:
|
||||
Cleaned, readable text
|
||||
"""
|
||||
if not content:
|
||||
return ""
|
||||
|
||||
|
||||
# Handle dictionary content (like quoted messages)
|
||||
if isinstance(content, dict):
|
||||
# Extract text from dictionary structure
|
||||
text_parts = []
|
||||
if 'title' in content:
|
||||
text_parts.append(str(content['title']))
|
||||
if 'quoted' in content:
|
||||
text_parts.append(str(content['quoted']))
|
||||
if 'content' in content:
|
||||
text_parts.append(str(content['content']))
|
||||
if 'text' in content:
|
||||
text_parts.append(str(content['text']))
|
||||
|
||||
if "title" in content:
|
||||
text_parts.append(str(content["title"]))
|
||||
if "quoted" in content:
|
||||
text_parts.append(str(content["quoted"]))
|
||||
if "content" in content:
|
||||
text_parts.append(str(content["content"]))
|
||||
if "text" in content:
|
||||
text_parts.append(str(content["text"]))
|
||||
|
||||
if text_parts:
|
||||
return " | ".join(text_parts)
|
||||
else:
|
||||
# If we can't extract meaningful text from dict, return empty
|
||||
return ""
|
||||
|
||||
|
||||
# Handle string content
|
||||
if not isinstance(content, str):
|
||||
return ""
|
||||
|
||||
|
||||
# Remove common prefixes like "wxid_xxx:\n"
|
||||
clean_content = re.sub(r'^wxid_[^:]+:\s*', '', content)
|
||||
clean_content = re.sub(r'^[^:]+:\s*', '', clean_content)
|
||||
|
||||
clean_content = re.sub(r"^wxid_[^:]+:\s*", "", content)
|
||||
clean_content = re.sub(r"^[^:]+:\s*", "", clean_content)
|
||||
|
||||
# If it's just XML or system message, return empty
|
||||
if clean_content.strip().startswith('<') or 'recalled a message' in clean_content:
|
||||
if clean_content.strip().startswith("<") or "recalled a message" in clean_content:
|
||||
return ""
|
||||
|
||||
|
||||
return clean_content.strip()
|
||||
|
||||
|
||||
def _is_text_message(self, content: str) -> bool:
|
||||
"""
|
||||
Check if a message contains readable text content.
|
||||
|
||||
|
||||
Args:
|
||||
content: The message content (can be string or dict)
|
||||
|
||||
|
||||
Returns:
|
||||
True if the message contains readable text, False otherwise
|
||||
"""
|
||||
if not content:
|
||||
return False
|
||||
|
||||
|
||||
# Handle dictionary content
|
||||
if isinstance(content, dict):
|
||||
# Check if dict has any readable text fields
|
||||
text_fields = ['title', 'quoted', 'content', 'text']
|
||||
text_fields = ["title", "quoted", "content", "text"]
|
||||
for field in text_fields:
|
||||
if field in content and content[field]:
|
||||
if content.get(field):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
# Handle string content
|
||||
if not isinstance(content, str):
|
||||
return False
|
||||
|
||||
|
||||
# Skip image messages (contain XML with img tags)
|
||||
if '<img' in content and 'cdnurl' in content:
|
||||
if "<img" in content and "cdnurl" in content:
|
||||
return False
|
||||
|
||||
|
||||
# Skip emoji messages (contain emoji XML tags)
|
||||
if '<emoji' in content and 'productid' in content:
|
||||
if "<emoji" in content and "productid" in content:
|
||||
return False
|
||||
|
||||
|
||||
# Skip voice messages
|
||||
if '<voice' in content:
|
||||
if "<voice" in content:
|
||||
return False
|
||||
|
||||
|
||||
# Skip video messages
|
||||
if '<video' in content:
|
||||
if "<video" in content:
|
||||
return False
|
||||
|
||||
|
||||
# Skip file messages
|
||||
if '<appmsg' in content and 'appid' in content:
|
||||
if "<appmsg" in content and "appid" in content:
|
||||
return False
|
||||
|
||||
|
||||
# Skip system messages (like "recalled a message")
|
||||
if 'recalled a message' in content:
|
||||
if "recalled a message" in content:
|
||||
return False
|
||||
|
||||
|
||||
# Check if there's actual readable text (not just XML or system messages)
|
||||
# Remove common prefixes like "wxid_xxx:\n" and check for actual content
|
||||
clean_content = re.sub(r'^wxid_[^:]+:\s*', '', content)
|
||||
clean_content = re.sub(r'^[^:]+:\s*', '', clean_content)
|
||||
|
||||
clean_content = re.sub(r"^wxid_[^:]+:\s*", "", content)
|
||||
clean_content = re.sub(r"^[^:]+:\s*", "", clean_content)
|
||||
|
||||
# If after cleaning we have meaningful text, consider it readable
|
||||
if len(clean_content.strip()) > 0 and not clean_content.strip().startswith('<'):
|
||||
if len(clean_content.strip()) > 0 and not clean_content.strip().startswith("<"):
|
||||
return True
|
||||
|
||||
|
||||
return False
|
||||
|
||||
def _concatenate_messages(self, messages: List[Dict], max_length: int = 128,
|
||||
time_window_minutes: int = 30, overlap_messages: int = 0) -> List[Dict]:
|
||||
|
||||
def _concatenate_messages(
|
||||
self,
|
||||
messages: list[dict],
|
||||
max_length: int = 128,
|
||||
time_window_minutes: int = 30,
|
||||
overlap_messages: int = 0,
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Concatenate messages based on length and time rules.
|
||||
|
||||
|
||||
Args:
|
||||
messages: List of message dictionaries
|
||||
max_length: Maximum length for concatenated message groups. Use -1 to disable length constraint.
|
||||
time_window_minutes: Time window in minutes to group messages together. Use -1 to disable time constraint.
|
||||
overlap_messages: Number of messages to overlap between consecutive groups
|
||||
|
||||
|
||||
Returns:
|
||||
List of concatenated message groups
|
||||
"""
|
||||
if not messages:
|
||||
return []
|
||||
|
||||
|
||||
concatenated_groups = []
|
||||
current_group = []
|
||||
current_length = 0
|
||||
last_timestamp = None
|
||||
|
||||
|
||||
for message in messages:
|
||||
# Extract message info
|
||||
content = message.get('content', '')
|
||||
message_text = message.get('message', '')
|
||||
create_time = message.get('createTime', 0)
|
||||
from_user = message.get('fromUser', '')
|
||||
to_user = message.get('toUser', '')
|
||||
is_sent_from_self = message.get('isSentFromSelf', False)
|
||||
|
||||
content = message.get("content", "")
|
||||
message_text = message.get("message", "")
|
||||
create_time = message.get("createTime", 0)
|
||||
message.get("fromUser", "")
|
||||
message.get("toUser", "")
|
||||
message.get("isSentFromSelf", False)
|
||||
|
||||
# Extract readable text
|
||||
readable_text = self._extract_readable_text(content)
|
||||
if not readable_text:
|
||||
readable_text = message_text
|
||||
|
||||
|
||||
# Skip empty messages
|
||||
if not readable_text.strip():
|
||||
continue
|
||||
|
||||
|
||||
# Check time window constraint (only if time_window_minutes != -1)
|
||||
if time_window_minutes != -1 and last_timestamp is not None and create_time > 0:
|
||||
time_diff_minutes = (create_time - last_timestamp) / 60
|
||||
if time_diff_minutes > time_window_minutes:
|
||||
# Time gap too large, start new group
|
||||
if current_group:
|
||||
concatenated_groups.append({
|
||||
'messages': current_group,
|
||||
'total_length': current_length,
|
||||
'start_time': current_group[0].get('createTime', 0),
|
||||
'end_time': current_group[-1].get('createTime', 0)
|
||||
})
|
||||
concatenated_groups.append(
|
||||
{
|
||||
"messages": current_group,
|
||||
"total_length": current_length,
|
||||
"start_time": current_group[0].get("createTime", 0),
|
||||
"end_time": current_group[-1].get("createTime", 0),
|
||||
}
|
||||
)
|
||||
# Keep last few messages for overlap
|
||||
if overlap_messages > 0 and len(current_group) > overlap_messages:
|
||||
current_group = current_group[-overlap_messages:]
|
||||
current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
|
||||
current_length = sum(
|
||||
len(
|
||||
self._extract_readable_text(msg.get("content", ""))
|
||||
or msg.get("message", "")
|
||||
)
|
||||
for msg in current_group
|
||||
)
|
||||
else:
|
||||
current_group = []
|
||||
current_length = 0
|
||||
|
||||
|
||||
# Check length constraint (only if max_length != -1)
|
||||
message_length = len(readable_text)
|
||||
if max_length != -1 and current_length + message_length > max_length and current_group:
|
||||
# Current group would exceed max length, save it and start new
|
||||
concatenated_groups.append({
|
||||
'messages': current_group,
|
||||
'total_length': current_length,
|
||||
'start_time': current_group[0].get('createTime', 0),
|
||||
'end_time': current_group[-1].get('createTime', 0)
|
||||
})
|
||||
concatenated_groups.append(
|
||||
{
|
||||
"messages": current_group,
|
||||
"total_length": current_length,
|
||||
"start_time": current_group[0].get("createTime", 0),
|
||||
"end_time": current_group[-1].get("createTime", 0),
|
||||
}
|
||||
)
|
||||
# Keep last few messages for overlap
|
||||
if overlap_messages > 0 and len(current_group) > overlap_messages:
|
||||
current_group = current_group[-overlap_messages:]
|
||||
current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
|
||||
current_length = sum(
|
||||
len(
|
||||
self._extract_readable_text(msg.get("content", ""))
|
||||
or msg.get("message", "")
|
||||
)
|
||||
for msg in current_group
|
||||
)
|
||||
else:
|
||||
current_group = []
|
||||
current_length = 0
|
||||
|
||||
|
||||
# Add message to current group
|
||||
current_group.append(message)
|
||||
current_length += message_length
|
||||
last_timestamp = create_time
|
||||
|
||||
|
||||
# Add the last group if it exists
|
||||
if current_group:
|
||||
concatenated_groups.append({
|
||||
'messages': current_group,
|
||||
'total_length': current_length,
|
||||
'start_time': current_group[0].get('createTime', 0),
|
||||
'end_time': current_group[-1].get('createTime', 0)
|
||||
})
|
||||
|
||||
concatenated_groups.append(
|
||||
{
|
||||
"messages": current_group,
|
||||
"total_length": current_length,
|
||||
"start_time": current_group[0].get("createTime", 0),
|
||||
"end_time": current_group[-1].get("createTime", 0),
|
||||
}
|
||||
)
|
||||
|
||||
return concatenated_groups
|
||||
|
||||
def _create_concatenated_content(self, message_group: Dict, contact_name: str) -> str:
|
||||
|
||||
def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str:
|
||||
"""
|
||||
Create concatenated content from a group of messages.
|
||||
|
||||
|
||||
Args:
|
||||
message_group: Dictionary containing messages and metadata
|
||||
contact_name: Name of the contact
|
||||
|
||||
|
||||
Returns:
|
||||
Formatted concatenated content
|
||||
"""
|
||||
messages = message_group['messages']
|
||||
start_time = message_group['start_time']
|
||||
end_time = message_group['end_time']
|
||||
|
||||
messages = message_group["messages"]
|
||||
start_time = message_group["start_time"]
|
||||
end_time = message_group["end_time"]
|
||||
|
||||
# Format timestamps
|
||||
if start_time:
|
||||
try:
|
||||
start_timestamp = datetime.fromtimestamp(start_time)
|
||||
start_time_str = start_timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
start_time_str = start_timestamp.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, OSError):
|
||||
start_time_str = str(start_time)
|
||||
else:
|
||||
start_time_str = "Unknown"
|
||||
|
||||
|
||||
if end_time:
|
||||
try:
|
||||
end_timestamp = datetime.fromtimestamp(end_time)
|
||||
end_time_str = end_timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
end_time_str = end_timestamp.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, OSError):
|
||||
end_time_str = str(end_time)
|
||||
else:
|
||||
end_time_str = "Unknown"
|
||||
|
||||
|
||||
# Build concatenated message content
|
||||
message_parts = []
|
||||
for message in messages:
|
||||
content = message.get('content', '')
|
||||
message_text = message.get('message', '')
|
||||
create_time = message.get('createTime', 0)
|
||||
is_sent_from_self = message.get('isSentFromSelf', False)
|
||||
|
||||
content = message.get("content", "")
|
||||
message_text = message.get("message", "")
|
||||
create_time = message.get("createTime", 0)
|
||||
is_sent_from_self = message.get("isSentFromSelf", False)
|
||||
|
||||
# Extract readable text
|
||||
readable_text = self._extract_readable_text(content)
|
||||
if not readable_text:
|
||||
readable_text = message_text
|
||||
|
||||
|
||||
# Format individual message
|
||||
if create_time:
|
||||
try:
|
||||
timestamp = datetime.fromtimestamp(create_time)
|
||||
# change to YYYY-MM-DD HH:MM:SS
|
||||
time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
time_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, OSError):
|
||||
time_str = str(create_time)
|
||||
else:
|
||||
time_str = "Unknown"
|
||||
|
||||
|
||||
sender = "[Me]" if is_sent_from_self else "[Contact]"
|
||||
message_parts.append(f"({time_str}) {sender}: {readable_text}")
|
||||
|
||||
|
||||
concatenated_text = "\n".join(message_parts)
|
||||
|
||||
|
||||
# Create final document content
|
||||
doc_content = f"""
|
||||
Contact: {contact_name}
|
||||
Time Range: {start_time_str} - {end_time_str}
|
||||
Messages ({len(messages)} messages, {message_group['total_length']} chars):
|
||||
Messages ({len(messages)} messages, {message_group["total_length"]} chars):
|
||||
|
||||
{concatenated_text}
|
||||
"""
|
||||
# TODO @yichuan give better format and rich info here!
|
||||
# TODO @yichuan give better format and rich info here!
|
||||
doc_content = f"""
|
||||
{concatenated_text}
|
||||
"""
|
||||
return doc_content, contact_name
|
||||
|
||||
def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
|
||||
|
||||
def load_data(self, input_dir: str | None = None, **load_kwargs: Any) -> list[Document]:
|
||||
"""
|
||||
Load WeChat chat history data from exported JSON files.
|
||||
|
||||
|
||||
Args:
|
||||
input_dir: Directory containing exported WeChat JSON files
|
||||
**load_kwargs:
|
||||
@@ -376,97 +406,103 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
|
||||
time_window_minutes (int): Time window in minutes to group messages together (default: 30).
|
||||
overlap_messages (int): Number of messages to overlap between consecutive groups (default: 2).
|
||||
"""
|
||||
docs: List[Document] = []
|
||||
max_count = load_kwargs.get('max_count', 1000)
|
||||
wechat_export_dir = load_kwargs.get('wechat_export_dir', None)
|
||||
include_non_text = load_kwargs.get('include_non_text', False)
|
||||
concatenate_messages = load_kwargs.get('concatenate_messages', False)
|
||||
max_length = load_kwargs.get('max_length', 1000)
|
||||
time_window_minutes = load_kwargs.get('time_window_minutes', 30)
|
||||
|
||||
docs: list[Document] = []
|
||||
max_count = load_kwargs.get("max_count", 1000)
|
||||
wechat_export_dir = load_kwargs.get("wechat_export_dir", None)
|
||||
include_non_text = load_kwargs.get("include_non_text", False)
|
||||
concatenate_messages = load_kwargs.get("concatenate_messages", False)
|
||||
load_kwargs.get("max_length", 1000)
|
||||
load_kwargs.get("time_window_minutes", 30)
|
||||
|
||||
# Default WeChat export path
|
||||
if wechat_export_dir is None:
|
||||
wechat_export_dir = "./wechat_export_test"
|
||||
|
||||
|
||||
if not os.path.exists(wechat_export_dir):
|
||||
print(f"WeChat export directory not found at: {wechat_export_dir}")
|
||||
return docs
|
||||
|
||||
|
||||
try:
|
||||
# Find all JSON files in the export directory
|
||||
json_files = list(Path(wechat_export_dir).glob("*.json"))
|
||||
print(f"Found {len(json_files)} WeChat chat history files")
|
||||
|
||||
|
||||
count = 0
|
||||
for json_file in json_files:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
|
||||
try:
|
||||
with open(json_file, 'r', encoding='utf-8') as f:
|
||||
with open(json_file, encoding="utf-8") as f:
|
||||
chat_data = json.load(f)
|
||||
|
||||
|
||||
# Extract contact name from filename
|
||||
contact_name = json_file.stem
|
||||
|
||||
|
||||
if concatenate_messages:
|
||||
# Filter messages to only include readable text messages
|
||||
readable_messages = []
|
||||
for message in chat_data:
|
||||
try:
|
||||
content = message.get('content', '')
|
||||
content = message.get("content", "")
|
||||
if not include_non_text and not self._is_text_message(content):
|
||||
continue
|
||||
|
||||
|
||||
readable_text = self._extract_readable_text(content)
|
||||
if not readable_text and not include_non_text:
|
||||
continue
|
||||
|
||||
|
||||
readable_messages.append(message)
|
||||
except Exception as e:
|
||||
print(f"Error processing message in {json_file}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
# Concatenate messages based on rules
|
||||
message_groups = self._concatenate_messages(
|
||||
readable_messages,
|
||||
max_length=-1,
|
||||
readable_messages,
|
||||
max_length=-1,
|
||||
time_window_minutes=-1,
|
||||
overlap_messages=0 # Keep 2 messages overlap between groups
|
||||
overlap_messages=0, # Keep 2 messages overlap between groups
|
||||
)
|
||||
|
||||
|
||||
# Create documents from concatenated groups
|
||||
for message_group in message_groups:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
doc_content, contact_name = self._create_concatenated_content(message_group, contact_name)
|
||||
doc = Document(text=doc_content, metadata={"contact_name": contact_name})
|
||||
|
||||
doc_content, contact_name = self._create_concatenated_content(
|
||||
message_group, contact_name
|
||||
)
|
||||
doc = Document(
|
||||
text=doc_content, metadata={"contact_name": contact_name}
|
||||
)
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
print(f"Created {len(message_groups)} concatenated message groups for {contact_name}")
|
||||
|
||||
|
||||
print(
|
||||
f"Created {len(message_groups)} concatenated message groups for {contact_name}"
|
||||
)
|
||||
|
||||
else:
|
||||
# Original single-message processing
|
||||
for message in chat_data:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
|
||||
# Extract message information
|
||||
from_user = message.get('fromUser', '')
|
||||
to_user = message.get('toUser', '')
|
||||
content = message.get('content', '')
|
||||
message_text = message.get('message', '')
|
||||
create_time = message.get('createTime', 0)
|
||||
is_sent_from_self = message.get('isSentFromSelf', False)
|
||||
|
||||
message.get("fromUser", "")
|
||||
message.get("toUser", "")
|
||||
content = message.get("content", "")
|
||||
message_text = message.get("message", "")
|
||||
create_time = message.get("createTime", 0)
|
||||
is_sent_from_self = message.get("isSentFromSelf", False)
|
||||
|
||||
# Handle content that might be dict or string
|
||||
try:
|
||||
# Check if this is a readable text message
|
||||
if not include_non_text and not self._is_text_message(content):
|
||||
continue
|
||||
|
||||
|
||||
# Extract readable text
|
||||
readable_text = self._extract_readable_text(content)
|
||||
if not readable_text and not include_non_text:
|
||||
@@ -475,17 +511,17 @@ Messages ({len(messages)} messages, {message_group['total_length']} chars):
|
||||
# Skip messages that cause processing errors
|
||||
print(f"Error processing message in {json_file}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
# Convert timestamp to readable format
|
||||
if create_time:
|
||||
try:
|
||||
timestamp = datetime.fromtimestamp(create_time)
|
||||
time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
time_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, OSError):
|
||||
time_str = str(create_time)
|
||||
else:
|
||||
time_str = "Unknown"
|
||||
|
||||
|
||||
# Create document content with metadata header and contact info
|
||||
doc_content = f"""
|
||||
Contact: {contact_name}
|
||||
@@ -493,57 +529,64 @@ Is sent from self: {is_sent_from_self}
|
||||
Time: {time_str}
|
||||
Message: {readable_text if readable_text else message_text}
|
||||
"""
|
||||
|
||||
|
||||
# Create document with embedded metadata
|
||||
doc = Document(text=doc_content, metadata={})
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading {json_file}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
print(f"Loaded {len(docs)} WeChat chat documents")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading WeChat history: {e}")
|
||||
return docs
|
||||
|
||||
|
||||
return docs
|
||||
|
||||
@staticmethod
|
||||
def find_wechat_export_dirs() -> List[Path]:
|
||||
def find_wechat_export_dirs() -> list[Path]:
|
||||
"""
|
||||
Find all WeChat export directories.
|
||||
|
||||
|
||||
Returns:
|
||||
List of Path objects pointing to WeChat export directories
|
||||
"""
|
||||
export_dirs = []
|
||||
|
||||
|
||||
# Look for common export directory names
|
||||
possible_dirs = [
|
||||
Path("./wechat_export_test"),
|
||||
Path("./wechat_export"),
|
||||
Path("./wechat_chat_history"),
|
||||
Path("./chat_export")
|
||||
Path("./chat_export"),
|
||||
]
|
||||
|
||||
|
||||
for export_dir in possible_dirs:
|
||||
if export_dir.exists() and export_dir.is_dir():
|
||||
json_files = list(export_dir.glob("*.json"))
|
||||
if json_files:
|
||||
export_dirs.append(export_dir)
|
||||
print(f"Found WeChat export directory: {export_dir} with {len(json_files)} files")
|
||||
|
||||
print(
|
||||
f"Found WeChat export directory: {export_dir} with {len(json_files)} files"
|
||||
)
|
||||
|
||||
print(f"Found {len(export_dirs)} WeChat export directories")
|
||||
return export_dirs
|
||||
|
||||
@staticmethod
|
||||
def export_chat_to_file(output_file: str = "wechat_chat_export.txt", max_count: int = 1000, export_dir: str = None, include_non_text: bool = False):
|
||||
def export_chat_to_file(
|
||||
output_file: str = "wechat_chat_export.txt",
|
||||
max_count: int = 1000,
|
||||
export_dir: str | None = None,
|
||||
include_non_text: bool = False,
|
||||
):
|
||||
"""
|
||||
Export WeChat chat history to a text file.
|
||||
|
||||
|
||||
Args:
|
||||
output_file: Path to the output file
|
||||
max_count: Maximum number of entries to export
|
||||
@@ -552,36 +595,36 @@ Message: {readable_text if readable_text else message_text}
|
||||
"""
|
||||
if export_dir is None:
|
||||
export_dir = "./wechat_export_test"
|
||||
|
||||
|
||||
if not os.path.exists(export_dir):
|
||||
print(f"WeChat export directory not found at: {export_dir}")
|
||||
return
|
||||
|
||||
|
||||
try:
|
||||
json_files = list(Path(export_dir).glob("*.json"))
|
||||
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
count = 0
|
||||
for json_file in json_files:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
|
||||
try:
|
||||
with open(json_file, 'r', encoding='utf-8') as json_f:
|
||||
with open(json_file, encoding="utf-8") as json_f:
|
||||
chat_data = json.load(json_f)
|
||||
|
||||
|
||||
contact_name = json_file.stem
|
||||
f.write(f"\n=== Chat with {contact_name} ===\n")
|
||||
|
||||
|
||||
for message in chat_data:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
from_user = message.get('fromUser', '')
|
||||
content = message.get('content', '')
|
||||
message_text = message.get('message', '')
|
||||
create_time = message.get('createTime', 0)
|
||||
|
||||
|
||||
from_user = message.get("fromUser", "")
|
||||
content = message.get("content", "")
|
||||
message_text = message.get("message", "")
|
||||
create_time = message.get("createTime", 0)
|
||||
|
||||
# Skip non-text messages unless requested
|
||||
if not include_non_text:
|
||||
reader = WeChatHistoryReader()
|
||||
@@ -591,83 +634,90 @@ Message: {readable_text if readable_text else message_text}
|
||||
if not readable_text:
|
||||
continue
|
||||
message_text = readable_text
|
||||
|
||||
|
||||
if create_time:
|
||||
try:
|
||||
timestamp = datetime.fromtimestamp(create_time)
|
||||
time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
time_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, OSError):
|
||||
time_str = str(create_time)
|
||||
else:
|
||||
time_str = "Unknown"
|
||||
|
||||
|
||||
f.write(f"[{time_str}] {from_user}: {message_text}\n")
|
||||
count += 1
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing {json_file}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
print(f"Exported {count} chat entries to {output_file}")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error exporting WeChat chat history: {e}")
|
||||
|
||||
def export_wechat_chat_history(self, export_dir: str = "./wechat_export_direct") -> Optional[Path]:
|
||||
def export_wechat_chat_history(self, export_dir: str = "./wechat_export_direct") -> Path | None:
|
||||
"""
|
||||
Export WeChat chat history using wechat-exporter tool.
|
||||
|
||||
|
||||
Args:
|
||||
export_dir: Directory to save exported chat history
|
||||
|
||||
|
||||
Returns:
|
||||
Path to export directory if successful, None otherwise
|
||||
"""
|
||||
try:
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
# Create export directory
|
||||
export_path = Path(export_dir)
|
||||
export_path.mkdir(exist_ok=True)
|
||||
|
||||
|
||||
print(f"Exporting WeChat chat history to {export_path}...")
|
||||
|
||||
|
||||
# Check if wechat-exporter directory exists
|
||||
if not self.wechat_exporter_dir.exists():
|
||||
print(f"wechat-exporter directory not found at: {self.wechat_exporter_dir}")
|
||||
return None
|
||||
|
||||
|
||||
# Install requirements if needed
|
||||
requirements_file = self.wechat_exporter_dir / "requirements.txt"
|
||||
if requirements_file.exists():
|
||||
print("Installing wechat-exporter requirements...")
|
||||
subprocess.run([
|
||||
"uv", "pip", "install", "-r", str(requirements_file)
|
||||
], check=True)
|
||||
|
||||
subprocess.run(["uv", "pip", "install", "-r", str(requirements_file)], check=True)
|
||||
|
||||
# Run the export command
|
||||
print("Running wechat-exporter...")
|
||||
result = subprocess.run([
|
||||
sys.executable, str(self.wechat_exporter_dir / "main.py"),
|
||||
"export-all", str(export_path)
|
||||
], capture_output=True, text=True, check=True)
|
||||
|
||||
result = subprocess.run(
|
||||
[
|
||||
sys.executable,
|
||||
str(self.wechat_exporter_dir / "main.py"),
|
||||
"export-all",
|
||||
str(export_path),
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
print("Export command output:")
|
||||
print(result.stdout)
|
||||
if result.stderr:
|
||||
print("Export errors:")
|
||||
print(result.stderr)
|
||||
|
||||
|
||||
# Check if export was successful
|
||||
if export_path.exists() and any(export_path.glob("*.json")):
|
||||
json_files = list(export_path.glob("*.json"))
|
||||
print(f"Successfully exported {len(json_files)} chat history files to {export_path}")
|
||||
print(
|
||||
f"Successfully exported {len(json_files)} chat history files to {export_path}"
|
||||
)
|
||||
return export_path
|
||||
else:
|
||||
print("Export completed but no JSON files found")
|
||||
return None
|
||||
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Export command failed: {e}")
|
||||
print(f"Command output: {e.stdout}")
|
||||
@@ -678,18 +728,18 @@ Message: {readable_text if readable_text else message_text}
|
||||
print("Please ensure WeChat is running and WeChatTweak is installed.")
|
||||
return None
|
||||
|
||||
def find_or_export_wechat_data(self, export_dir: str = "./wechat_export_direct") -> List[Path]:
|
||||
def find_or_export_wechat_data(self, export_dir: str = "./wechat_export_direct") -> list[Path]:
|
||||
"""
|
||||
Find existing WeChat exports or create new ones.
|
||||
|
||||
|
||||
Args:
|
||||
export_dir: Directory to save exported chat history if needed
|
||||
|
||||
|
||||
Returns:
|
||||
List of Path objects pointing to WeChat export directories
|
||||
"""
|
||||
export_dirs = []
|
||||
|
||||
|
||||
# Look for existing exports in common locations
|
||||
possible_export_dirs = [
|
||||
Path("./wechat_database_export"),
|
||||
@@ -697,23 +747,25 @@ Message: {readable_text if readable_text else message_text}
|
||||
Path("./wechat_export"),
|
||||
Path("./wechat_export_direct"),
|
||||
Path("./wechat_chat_history"),
|
||||
Path("./chat_export")
|
||||
Path("./chat_export"),
|
||||
]
|
||||
|
||||
|
||||
for export_dir_path in possible_export_dirs:
|
||||
if export_dir_path.exists() and any(export_dir_path.glob("*.json")):
|
||||
export_dirs.append(export_dir_path)
|
||||
print(f"Found existing export: {export_dir_path}")
|
||||
|
||||
|
||||
# If no existing exports, try to export automatically
|
||||
if not export_dirs:
|
||||
print("No existing WeChat exports found. Starting direct export...")
|
||||
|
||||
|
||||
# Try to export using wechat-exporter
|
||||
exported_path = self.export_wechat_chat_history(export_dir)
|
||||
if exported_path:
|
||||
export_dirs = [exported_path]
|
||||
else:
|
||||
print("Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed.")
|
||||
|
||||
return export_dirs
|
||||
print(
|
||||
"Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed."
|
||||
)
|
||||
|
||||
return export_dirs
|
||||
|
||||
@@ -1,33 +1,42 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import dotenv
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import List, Any
|
||||
|
||||
import dotenv
|
||||
|
||||
# Add the project root to Python path so we can import from examples
|
||||
project_root = Path(__file__).parent.parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
|
||||
from leann.api import LeannBuilder, LeannSearcher, LeannChat
|
||||
from leann.api import LeannBuilder, LeannChat
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
|
||||
# Auto-detect user's mail path
|
||||
def get_mail_path():
|
||||
"""Get the mail path for the current user"""
|
||||
home_dir = os.path.expanduser("~")
|
||||
return os.path.join(home_dir, "Library", "Mail")
|
||||
|
||||
|
||||
# Default mail path for macOS
|
||||
DEFAULT_MAIL_PATH = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data"
|
||||
|
||||
def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_path: str = "mail_index.leann", max_count: int = -1, include_html: bool = False, embedding_model: str = "facebook/contriever"):
|
||||
|
||||
def create_leann_index_from_multiple_sources(
|
||||
messages_dirs: list[Path],
|
||||
index_path: str = "mail_index.leann",
|
||||
max_count: int = -1,
|
||||
include_html: bool = False,
|
||||
embedding_model: str = "facebook/contriever",
|
||||
):
|
||||
"""
|
||||
Create LEANN index from multiple mail data sources.
|
||||
|
||||
|
||||
Args:
|
||||
messages_dirs: List of Path objects pointing to Messages directories
|
||||
index_path: Path to save the LEANN index
|
||||
@@ -35,31 +44,32 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
|
||||
include_html: Whether to include HTML content in email processing
|
||||
"""
|
||||
print("Creating LEANN index from multiple mail data sources...")
|
||||
|
||||
|
||||
# Load documents using EmlxReader from LEANN_email_reader
|
||||
from examples.email_data.LEANN_email_reader import EmlxReader
|
||||
|
||||
reader = EmlxReader(include_html=include_html)
|
||||
# from email_data.email import EmlxMboxReader
|
||||
# from pathlib import Path
|
||||
# reader = EmlxMboxReader()
|
||||
INDEX_DIR = Path(index_path).parent
|
||||
|
||||
|
||||
if not INDEX_DIR.exists():
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
all_documents = []
|
||||
total_processed = 0
|
||||
|
||||
|
||||
# Process each Messages directory
|
||||
for i, messages_dir in enumerate(messages_dirs):
|
||||
print(f"\nProcessing Messages directory {i+1}/{len(messages_dirs)}: {messages_dir}")
|
||||
|
||||
print(f"\nProcessing Messages directory {i + 1}/{len(messages_dirs)}: {messages_dir}")
|
||||
|
||||
try:
|
||||
documents = reader.load_data(messages_dir)
|
||||
if documents:
|
||||
print(f"Loaded {len(documents)} email documents from {messages_dir}")
|
||||
all_documents.extend(documents)
|
||||
total_processed += len(documents)
|
||||
|
||||
|
||||
# Check if we've reached the max count
|
||||
if max_count > 0 and total_processed >= max_count:
|
||||
print(f"Reached max count of {max_count} documents")
|
||||
@@ -69,16 +79,18 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
|
||||
except Exception as e:
|
||||
print(f"Error processing {messages_dir}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
if not all_documents:
|
||||
print("No documents loaded from any source. Exiting.")
|
||||
return None
|
||||
|
||||
print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories and starting to split them into chunks")
|
||||
|
||||
|
||||
print(
|
||||
f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories and starting to split them into chunks"
|
||||
)
|
||||
|
||||
# Create text splitter with 256 chunk size
|
||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
|
||||
|
||||
|
||||
# Convert Documents to text strings and chunk them
|
||||
all_texts = []
|
||||
for doc in all_documents:
|
||||
@@ -88,44 +100,53 @@ def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_pa
|
||||
text = node.get_content()
|
||||
# text = '[subject] ' + doc.metadata["subject"] + '\n' + text
|
||||
all_texts.append(text)
|
||||
|
||||
print(f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks")
|
||||
|
||||
|
||||
print(
|
||||
f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks"
|
||||
)
|
||||
|
||||
# Create LEANN index directory
|
||||
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Use HNSW backend for better macOS compatibility
|
||||
builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
embedding_model=embedding_model,
|
||||
graph_degree=32,
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
is_compact=True,
|
||||
is_recompute=True,
|
||||
num_threads=1 # Force single-threaded mode
|
||||
num_threads=1, # Force single-threaded mode
|
||||
)
|
||||
|
||||
print(f"Adding {len(all_texts)} email chunks to index...")
|
||||
for chunk_text in all_texts:
|
||||
builder.add_text(chunk_text)
|
||||
|
||||
|
||||
builder.build_index(index_path)
|
||||
print(f"\nLEANN index built at {index_path}!")
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
|
||||
return index_path
|
||||
|
||||
def create_leann_index(mail_path: str, index_path: str = "mail_index.leann", max_count: int = 1000, include_html: bool = False, embedding_model: str = "facebook/contriever"):
|
||||
|
||||
def create_leann_index(
|
||||
mail_path: str,
|
||||
index_path: str = "mail_index.leann",
|
||||
max_count: int = 1000,
|
||||
include_html: bool = False,
|
||||
embedding_model: str = "facebook/contriever",
|
||||
):
|
||||
"""
|
||||
Create LEANN index from mail data.
|
||||
|
||||
|
||||
Args:
|
||||
mail_path: Path to the mail directory
|
||||
index_path: Path to save the LEANN index
|
||||
@@ -134,32 +155,33 @@ def create_leann_index(mail_path: str, index_path: str = "mail_index.leann", max
|
||||
"""
|
||||
print("Creating LEANN index from mail data...")
|
||||
INDEX_DIR = Path(index_path).parent
|
||||
|
||||
|
||||
if not INDEX_DIR.exists():
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Load documents using EmlxReader from LEANN_email_reader
|
||||
from examples.email_data.LEANN_email_reader import EmlxReader
|
||||
|
||||
reader = EmlxReader(include_html=include_html)
|
||||
# from email_data.email import EmlxMboxReader
|
||||
# from pathlib import Path
|
||||
# reader = EmlxMboxReader()
|
||||
documents = reader.load_data(Path(mail_path))
|
||||
|
||||
|
||||
if not documents:
|
||||
print("No documents loaded. Exiting.")
|
||||
return None
|
||||
|
||||
|
||||
print(f"Loaded {len(documents)} email documents")
|
||||
|
||||
|
||||
# Create text splitter with 256 chunk size
|
||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
|
||||
|
||||
|
||||
# Convert Documents to text strings and chunk them
|
||||
all_texts = []
|
||||
for doc in documents:
|
||||
@@ -167,111 +189,135 @@ def create_leann_index(mail_path: str, index_path: str = "mail_index.leann", max
|
||||
nodes = text_splitter.get_nodes_from_documents([doc])
|
||||
for node in nodes:
|
||||
all_texts.append(node.get_content())
|
||||
|
||||
|
||||
print(f"Created {len(all_texts)} text chunks from {len(documents)} documents")
|
||||
|
||||
|
||||
# Create LEANN index directory
|
||||
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Use HNSW backend for better macOS compatibility
|
||||
builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
embedding_model=embedding_model,
|
||||
graph_degree=32,
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
is_compact=True,
|
||||
is_recompute=True,
|
||||
num_threads=1 # Force single-threaded mode
|
||||
num_threads=1, # Force single-threaded mode
|
||||
)
|
||||
|
||||
print(f"Adding {len(all_texts)} email chunks to index...")
|
||||
for chunk_text in all_texts:
|
||||
builder.add_text(chunk_text)
|
||||
|
||||
|
||||
builder.build_index(index_path)
|
||||
print(f"\nLEANN index built at {index_path}!")
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
|
||||
return index_path
|
||||
|
||||
|
||||
async def query_leann_index(index_path: str, query: str):
|
||||
"""
|
||||
Query the LEANN index.
|
||||
|
||||
|
||||
Args:
|
||||
index_path: Path to the LEANN index
|
||||
query: The query string
|
||||
"""
|
||||
print(f"\n[PHASE 2] Starting Leann chat session...")
|
||||
chat = LeannChat(index_path=index_path,
|
||||
llm_config={"type": "openai", "model": "gpt-4o"})
|
||||
|
||||
print("\n[PHASE 2] Starting Leann chat session...")
|
||||
chat = LeannChat(index_path=index_path, llm_config={"type": "openai", "model": "gpt-4o"})
|
||||
|
||||
print(f"You: {query}")
|
||||
import time
|
||||
start_time = time.time()
|
||||
|
||||
time.time()
|
||||
chat_response = chat.ask(
|
||||
query,
|
||||
top_k=20,
|
||||
query,
|
||||
top_k=20,
|
||||
recompute_beighbor_embeddings=True,
|
||||
complexity=32,
|
||||
beam_width=1,
|
||||
)
|
||||
end_time = time.time()
|
||||
time.time()
|
||||
# print(f"Time taken: {end_time - start_time} seconds")
|
||||
# highlight the answer
|
||||
print(f"Leann chat response: \033[36m{chat_response}\033[0m")
|
||||
|
||||
|
||||
async def main():
|
||||
# Parse command line arguments
|
||||
parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
|
||||
parser = argparse.ArgumentParser(description="LEANN Mail Reader - Create and query email index")
|
||||
# Remove --mail-path argument and auto-detect all Messages directories
|
||||
# Remove DEFAULT_MAIL_PATH
|
||||
parser.add_argument('--index-dir', type=str, default="./mail_index",
|
||||
help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
|
||||
parser.add_argument('--max-emails', type=int, default=1000,
|
||||
help='Maximum number of emails to process (-1 means all)')
|
||||
parser.add_argument('--query', type=str, default="Give me some funny advertisement about apple or other companies",
|
||||
help='Single query to run (default: runs example queries)')
|
||||
parser.add_argument('--include-html', action='store_true', default=False,
|
||||
help='Include HTML content in email processing (default: False)')
|
||||
parser.add_argument('--embedding-model', type=str, default="facebook/contriever",
|
||||
help='Embedding model to use (default: facebook/contriever)')
|
||||
|
||||
parser.add_argument(
|
||||
"--index-dir",
|
||||
type=str,
|
||||
default="./mail_index",
|
||||
help="Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-emails",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Maximum number of emails to process (-1 means all)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--query",
|
||||
type=str,
|
||||
default="Give me some funny advertisement about apple or other companies",
|
||||
help="Single query to run (default: runs example queries)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--include-html",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Include HTML content in email processing (default: False)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--embedding-model",
|
||||
type=str,
|
||||
default="facebook/contriever",
|
||||
help="Embedding model to use (default: facebook/contriever)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"args: {args}")
|
||||
|
||||
|
||||
# Automatically find all Messages directories under the current user's Mail directory
|
||||
from examples.email_data.LEANN_email_reader import find_all_messages_directories
|
||||
|
||||
mail_path = get_mail_path()
|
||||
print(f"Searching for email data in: {mail_path}")
|
||||
messages_dirs = find_all_messages_directories(mail_path)
|
||||
# messages_dirs = find_all_messages_directories(DEFAULT_MAIL_PATH)
|
||||
# messages_dirs = [DEFAULT_MAIL_PATH]
|
||||
# messages_dirs = messages_dirs[:1]
|
||||
|
||||
print('len(messages_dirs): ', len(messages_dirs))
|
||||
|
||||
|
||||
|
||||
print("len(messages_dirs): ", len(messages_dirs))
|
||||
|
||||
if not messages_dirs:
|
||||
print("No Messages directories found. Exiting.")
|
||||
return
|
||||
|
||||
|
||||
INDEX_DIR = Path(args.index_dir)
|
||||
INDEX_PATH = str(INDEX_DIR / "mail_documents.leann")
|
||||
print(f"Index directory: {INDEX_DIR}")
|
||||
print(f"Found {len(messages_dirs)} Messages directories.")
|
||||
|
||||
|
||||
# Create or load the LEANN index from all sources
|
||||
index_path = create_leann_index_from_multiple_sources(messages_dirs, INDEX_PATH, args.max_emails, args.include_html, args.embedding_model)
|
||||
|
||||
index_path = create_leann_index_from_multiple_sources(
|
||||
messages_dirs, INDEX_PATH, args.max_emails, args.include_html, args.embedding_model
|
||||
)
|
||||
|
||||
if index_path:
|
||||
if args.query:
|
||||
# Run single query
|
||||
@@ -281,11 +327,12 @@ async def main():
|
||||
queries = [
|
||||
"Hows Berkeley Graduate Student Instructor",
|
||||
"how's the icloud related advertisement saying",
|
||||
"Whats the number of class recommend to take per semester for incoming EECS students"
|
||||
"Whats the number of class recommend to take per semester for incoming EECS students",
|
||||
]
|
||||
for query in queries:
|
||||
print("\n" + "="*60)
|
||||
print("\n" + "=" * 60)
|
||||
await query_leann_index(index_path, query)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -1,26 +1,30 @@
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import List, Any
|
||||
|
||||
# Add the project root to Python path so we can import from examples
|
||||
project_root = Path(__file__).parent.parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
|
||||
from llama_index.core import VectorStoreIndex, StorageContext
|
||||
import torch
|
||||
from llama_index.core import StorageContext, VectorStoreIndex
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
# --- EMBEDDING MODEL ---
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
import torch
|
||||
|
||||
# --- END EMBEDDING MODEL ---
|
||||
|
||||
# Import EmlxReader from the new module
|
||||
from examples.email_data.LEANN_email_reader import EmlxReader
|
||||
|
||||
def create_and_save_index(mail_path: str, save_dir: str = "mail_index_embedded", max_count: int = 1000, include_html: bool = False):
|
||||
|
||||
def create_and_save_index(
|
||||
mail_path: str,
|
||||
save_dir: str = "mail_index_embedded",
|
||||
max_count: int = 1000,
|
||||
include_html: bool = False,
|
||||
):
|
||||
print("Creating index from mail data with embedded metadata...")
|
||||
documents = EmlxReader(include_html=include_html).load_data(mail_path, max_count=max_count)
|
||||
if not documents:
|
||||
@@ -30,7 +34,7 @@ def create_and_save_index(mail_path: str, save_dir: str = "mail_index_embedded",
|
||||
# Use facebook/contriever as the embedder
|
||||
embed_model = HuggingFaceEmbedding(model_name="facebook/contriever")
|
||||
# set on device
|
||||
import torch
|
||||
|
||||
if torch.cuda.is_available():
|
||||
embed_model._model.to("cuda")
|
||||
# set mps
|
||||
@@ -39,21 +43,19 @@ def create_and_save_index(mail_path: str, save_dir: str = "mail_index_embedded",
|
||||
else:
|
||||
embed_model._model.to("cpu")
|
||||
index = VectorStoreIndex.from_documents(
|
||||
documents,
|
||||
transformations=[text_splitter],
|
||||
embed_model=embed_model
|
||||
documents, transformations=[text_splitter], embed_model=embed_model
|
||||
)
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
index.storage_context.persist(persist_dir=save_dir)
|
||||
print(f"Index saved to {save_dir}")
|
||||
return index
|
||||
|
||||
|
||||
def load_index(save_dir: str = "mail_index_embedded"):
|
||||
try:
|
||||
storage_context = StorageContext.from_defaults(persist_dir=save_dir)
|
||||
index = VectorStoreIndex.from_vector_store(
|
||||
storage_context.vector_store,
|
||||
storage_context=storage_context
|
||||
storage_context.vector_store, storage_context=storage_context
|
||||
)
|
||||
print(f"Index loaded from {save_dir}")
|
||||
return index
|
||||
@@ -61,6 +63,7 @@ def load_index(save_dir: str = "mail_index_embedded"):
|
||||
print(f"Error loading index: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def query_index(index, query: str):
|
||||
if index is None:
|
||||
print("No index available for querying.")
|
||||
@@ -70,39 +73,57 @@ def query_index(index, query: str):
|
||||
print(f"Query: {query}")
|
||||
print(f"Response: {response}")
|
||||
|
||||
|
||||
def main():
    """Build or load the mail index, then run a few sample queries against it.

    Command-line options:
        --mail-path     directory holding the mail data to index
        --save-dir      directory where the index is persisted / loaded from
        --max-emails    upper bound on the number of emails processed
        --include-html  include HTML content when reading emails

    An existing index is reused when ``vector_store.json`` is present in the
    save directory; otherwise a new index is created and saved there.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description="LlamaIndex Mail Reader - Create and query email index"
    )
    parser.add_argument(
        "--mail-path",
        type=str,
        # NOTE(review): this default is a machine-specific absolute path; other
        # users will presumably need to pass --mail-path explicitly.
        default="/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages",
        help="Path to mail data directory",
    )
    parser.add_argument(
        "--save-dir",
        type=str,
        default="mail_index_embedded",
        help="Directory to store the index (default: mail_index_embedded)",
    )
    parser.add_argument(
        "--max-emails", type=int, default=10000, help="Maximum number of emails to process"
    )
    parser.add_argument(
        "--include-html",
        action="store_true",
        default=False,
        help="Include HTML content in email processing (default: False)",
    )

    args = parser.parse_args()

    mail_path = args.mail_path
    save_dir = args.save_dir

    # The vector store file can only exist if the directory itself exists, so a
    # single check on the file is enough to decide between load and build.
    if os.path.exists(os.path.join(save_dir, "vector_store.json")):
        print("Loading existing index...")
        index = load_index(save_dir)
    else:
        print("Creating new index...")
        index = create_and_save_index(
            mail_path, save_dir, max_count=args.max_emails, include_html=args.include_html
        )

    # Either helper may hand back a falsy value; skip the queries in that case.
    if index:
        queries = [
            "Hows Berkeley Graduate Student Instructor",
            "how's the icloud related advertisement saying",
            "Whats the number of class recommend to take per semester for incoming EECS students",
        ]
        for query in queries:
            print("\n" + "=" * 50)
            query_index(index, query)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import argparse
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
import dotenv
|
||||
from leann.api import LeannBuilder, LeannChat
|
||||
from pathlib import Path
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
@@ -56,7 +57,7 @@ async def main(args):
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
print(f"\n[PHASE 2] Starting Leann chat session...")
|
||||
print("\n[PHASE 2] Starting Leann chat session...")
|
||||
|
||||
llm_config = {"type": "hf", "model": "Qwen/Qwen3-4B"}
|
||||
llm_config = {"type": "ollama", "model": "qwen3:8b"}
|
||||
@@ -64,7 +65,7 @@ async def main(args):
|
||||
|
||||
chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)
|
||||
# query = (
|
||||
# "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发"
|
||||
# "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发"
|
||||
# )
|
||||
query = args.query
|
||||
|
||||
@@ -74,9 +75,7 @@ async def main(args):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run Leann Chat with various LLM backends."
|
||||
)
|
||||
parser = argparse.ArgumentParser(description="Run Leann Chat with various LLM backends.")
|
||||
parser.add_argument(
|
||||
"--llm",
|
||||
type=str,
|
||||
|
||||
@@ -14,48 +14,55 @@ Key features:
|
||||
- Document-level result consolidation
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
from dataclasses import dataclass
|
||||
from collections import defaultdict
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@dataclass
class PatchResult:
    """Represents a single patch search result."""

    # Identifier of the patch, taken from the result metadata.
    patch_id: int
    # Name and path of the image the patch belongs to.
    image_name: str
    image_path: str
    # Bounding box of the patch as (x1, y1, x2, y2).
    coordinates: tuple[int, int, int, int]
    # Search score of this patch.
    score: float
    # Attention score attached to the patch metadata.
    attention_score: float
    # Scale value attached to the patch metadata.
    scale: float
    # The raw metadata dict the other fields were read from.
    metadata: dict[str, Any]
|
||||
|
||||
|
||||
@dataclass
class AggregatedResult:
    """Represents an aggregated document-level result."""

    # Name and path of the image (document) the patches were grouped under.
    image_name: str
    image_path: str
    # Document-level score produced by the selected aggregation method.
    doc_score: float
    # Number of patches that contributed to this result.
    patch_count: int
    # The patch picked as the best one for this document.
    best_patch: PatchResult
    # Every patch of the document that was part of the aggregation.
    all_patches: list[PatchResult]
    # Name of the aggregation method that produced ``doc_score``.
    aggregation_method: str
    # Groups of patches clustered by spatial proximity; None when no
    # clustering result is attached.
    spatial_clusters: list[list[PatchResult]] | None = None
|
||||
|
||||
|
||||
class MultiVectorAggregator:
|
||||
"""
|
||||
Aggregates multiple patch-level results into document-level results.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
aggregation_method: str = "maxsim",
|
||||
spatial_clustering: bool = True,
|
||||
cluster_distance_threshold: float = 100.0):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
aggregation_method: str = "maxsim",
|
||||
spatial_clustering: bool = True,
|
||||
cluster_distance_threshold: float = 100.0,
|
||||
):
|
||||
"""
|
||||
Initialize the aggregator.
|
||||
|
||||
|
||||
Args:
|
||||
aggregation_method: "maxsim", "voting", "weighted", or "mean"
|
||||
spatial_clustering: Whether to cluster spatially close patches
|
||||
@@ -64,23 +71,23 @@ class MultiVectorAggregator:
|
||||
self.aggregation_method = aggregation_method
|
||||
self.spatial_clustering = spatial_clustering
|
||||
self.cluster_distance_threshold = cluster_distance_threshold
|
||||
|
||||
def aggregate_results(self,
|
||||
search_results: List[Dict[str, Any]],
|
||||
top_k: int = 10) -> List[AggregatedResult]:
|
||||
|
||||
def aggregate_results(
|
||||
self, search_results: list[dict[str, Any]], top_k: int = 10
|
||||
) -> list[AggregatedResult]:
|
||||
"""
|
||||
Aggregate patch-level search results into document-level results.
|
||||
|
||||
|
||||
Args:
|
||||
search_results: List of search results from LeannSearcher
|
||||
top_k: Number of top documents to return
|
||||
|
||||
|
||||
Returns:
|
||||
List of aggregated document results
|
||||
"""
|
||||
# Group results by image
|
||||
image_groups = defaultdict(list)
|
||||
|
||||
|
||||
for result in search_results:
|
||||
metadata = result.metadata
|
||||
if "image_name" in metadata and "patch_id" in metadata:
|
||||
@@ -92,55 +99,57 @@ class MultiVectorAggregator:
|
||||
score=result.score,
|
||||
attention_score=metadata.get("attention_score", 0.0),
|
||||
scale=metadata.get("scale", 1.0),
|
||||
metadata=metadata
|
||||
metadata=metadata,
|
||||
)
|
||||
image_groups[metadata["image_name"]].append(patch_result)
|
||||
|
||||
|
||||
# Aggregate each image group
|
||||
aggregated_results = []
|
||||
for image_name, patches in image_groups.items():
|
||||
if len(patches) == 0:
|
||||
continue
|
||||
|
||||
|
||||
agg_result = self._aggregate_image_patches(image_name, patches)
|
||||
aggregated_results.append(agg_result)
|
||||
|
||||
|
||||
# Sort by aggregated score and return top-k
|
||||
aggregated_results.sort(key=lambda x: x.doc_score, reverse=True)
|
||||
return aggregated_results[:top_k]
|
||||
|
||||
def _aggregate_image_patches(self, image_name: str, patches: List[PatchResult]) -> AggregatedResult:
|
||||
|
||||
def _aggregate_image_patches(
|
||||
self, image_name: str, patches: list[PatchResult]
|
||||
) -> AggregatedResult:
|
||||
"""Aggregate patches for a single image."""
|
||||
|
||||
|
||||
if self.aggregation_method == "maxsim":
|
||||
doc_score = max(patch.score for patch in patches)
|
||||
best_patch = max(patches, key=lambda p: p.score)
|
||||
|
||||
|
||||
elif self.aggregation_method == "voting":
|
||||
# Count patches above threshold
|
||||
threshold = np.percentile([p.score for p in patches], 75)
|
||||
doc_score = sum(1 for patch in patches if patch.score >= threshold)
|
||||
best_patch = max(patches, key=lambda p: p.score)
|
||||
|
||||
|
||||
elif self.aggregation_method == "weighted":
|
||||
# Weight by attention scores
|
||||
total_weighted_score = sum(p.score * p.attention_score for p in patches)
|
||||
total_weights = sum(p.attention_score for p in patches)
|
||||
doc_score = total_weighted_score / max(total_weights, 1e-8)
|
||||
best_patch = max(patches, key=lambda p: p.score * p.attention_score)
|
||||
|
||||
|
||||
elif self.aggregation_method == "mean":
|
||||
doc_score = np.mean([patch.score for patch in patches])
|
||||
best_patch = max(patches, key=lambda p: p.score)
|
||||
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unknown aggregation method: {self.aggregation_method}")
|
||||
|
||||
|
||||
# Spatial clustering if enabled
|
||||
spatial_clusters = None
|
||||
if self.spatial_clustering:
|
||||
spatial_clusters = self._cluster_patches_spatially(patches)
|
||||
|
||||
|
||||
return AggregatedResult(
|
||||
image_name=image_name,
|
||||
image_path=patches[0].image_path,
|
||||
@@ -149,23 +158,23 @@ class MultiVectorAggregator:
|
||||
best_patch=best_patch,
|
||||
all_patches=sorted(patches, key=lambda p: p.score, reverse=True),
|
||||
aggregation_method=self.aggregation_method,
|
||||
spatial_clusters=spatial_clusters
|
||||
spatial_clusters=spatial_clusters,
|
||||
)
|
||||
|
||||
def _cluster_patches_spatially(self, patches: List[PatchResult]) -> List[List[PatchResult]]:
|
||||
|
||||
def _cluster_patches_spatially(self, patches: list[PatchResult]) -> list[list[PatchResult]]:
|
||||
"""Cluster patches that are spatially close to each other."""
|
||||
if len(patches) <= 1:
|
||||
return [patches]
|
||||
|
||||
|
||||
clusters = []
|
||||
remaining_patches = patches.copy()
|
||||
|
||||
|
||||
while remaining_patches:
|
||||
# Start new cluster with highest scoring remaining patch
|
||||
seed_patch = max(remaining_patches, key=lambda p: p.score)
|
||||
current_cluster = [seed_patch]
|
||||
remaining_patches.remove(seed_patch)
|
||||
|
||||
|
||||
# Add nearby patches to cluster
|
||||
added_to_cluster = True
|
||||
while added_to_cluster:
|
||||
@@ -175,145 +184,175 @@ class MultiVectorAggregator:
|
||||
current_cluster.append(patch)
|
||||
remaining_patches.remove(patch)
|
||||
added_to_cluster = True
|
||||
|
||||
|
||||
clusters.append(current_cluster)
|
||||
|
||||
|
||||
return sorted(clusters, key=lambda cluster: max(p.score for p in cluster), reverse=True)
|
||||
|
||||
def _is_patch_nearby(self, patch: PatchResult, cluster: List[PatchResult]) -> bool:
|
||||
|
||||
def _is_patch_nearby(self, patch: PatchResult, cluster: list[PatchResult]) -> bool:
|
||||
"""Check if a patch is spatially close to any patch in the cluster."""
|
||||
patch_center = self._get_patch_center(patch.coordinates)
|
||||
|
||||
|
||||
for cluster_patch in cluster:
|
||||
cluster_center = self._get_patch_center(cluster_patch.coordinates)
|
||||
distance = np.sqrt((patch_center[0] - cluster_center[0])**2 +
|
||||
(patch_center[1] - cluster_center[1])**2)
|
||||
|
||||
distance = np.sqrt(
|
||||
(patch_center[0] - cluster_center[0]) ** 2
|
||||
+ (patch_center[1] - cluster_center[1]) ** 2
|
||||
)
|
||||
|
||||
if distance <= self.cluster_distance_threshold:
|
||||
return True
|
||||
|
||||
|
||||
return False
|
||||
|
||||
def _get_patch_center(self, coordinates: Tuple[int, int, int, int]) -> Tuple[float, float]:
|
||||
|
||||
def _get_patch_center(self, coordinates: tuple[int, int, int, int]) -> tuple[float, float]:
|
||||
"""Get center point of a patch."""
|
||||
x1, y1, x2, y2 = coordinates
|
||||
return ((x1 + x2) / 2, (y1 + y2) / 2)
|
||||
|
||||
def print_aggregated_results(
    self, results: list[AggregatedResult], max_patches_per_doc: int = 3
):
    """Pretty print aggregated results.

    Args:
        results: Aggregated document-level results to display, in the order
            they should be numbered.
        max_patches_per_doc: Maximum number of entries from each result's
            ``all_patches`` to list.
    """
    print(f"\n🔍 Aggregated Results (method: {self.aggregation_method})")
    print("=" * 80)

    for rank, result in enumerate(results, start=1):
        print(f"\n{rank}. {result.image_name}")
        print(f" Doc Score: {result.doc_score:.4f} | Patches: {result.patch_count}")
        print(f" Path: {result.image_path}")

        # Show best patch
        best = result.best_patch
        print(
            f" 🌟 Best Patch: #{best.patch_id} at {best.coordinates} (score: {best.score:.4f})"
        )

        # Show top patches
        print(" 📍 Top Patches:")
        for pos, patch in enumerate(result.all_patches[:max_patches_per_doc], start=1):
            print(
                f" {pos}. Patch #{patch.patch_id}: {patch.score:.4f} at {patch.coordinates}"
            )

        # Show spatial clusters only when there is more than one of them
        if result.spatial_clusters and len(result.spatial_clusters) > 1:
            print(f" 🗂️ Spatial Clusters: {len(result.spatial_clusters)}")
            # Only the first two clusters are listed
            for pos, cluster in enumerate(result.spatial_clusters[:2], start=1):
                cluster_score = max(p.score for p in cluster)
                print(
                    f" Cluster {pos}: {len(cluster)} patches (best: {cluster_score:.4f})"
                )
|
||||
|
||||
|
||||
def demo_aggregation():
    """Demonstrate the multi-vector aggregation functionality.

    Builds a fixed set of mock patch-level results for two images and runs
    every aggregation method over them, printing the aggregated output.
    """
    print("=== Multi-Vector Aggregation Demo ===")

    # Simulate some patch-level search results
    # In real usage, these would come from LeannSearcher.search()

    class MockResult:
        def __init__(self, score, metadata):
            self.score = score
            self.metadata = metadata

    # One row per patch: (score, image_name, patch_id, coordinates, attention_score)
    patch_specs = [
        # Image 1: cats_and_kitchen.jpg - 4 patches
        (0.85, "cats_and_kitchen.jpg", 3, [100, 50, 224, 174], 0.92),  # Kitchen area
        (0.78, "cats_and_kitchen.jpg", 7, [200, 300, 324, 424], 0.88),  # Cat area
        (0.72, "cats_and_kitchen.jpg", 12, [150, 100, 274, 224], 0.75),  # Appliances
        (0.65, "cats_and_kitchen.jpg", 15, [50, 250, 174, 374], 0.70),  # Furniture
        # Image 2: city_street.jpg - 3 patches
        (0.68, "city_street.jpg", 2, [300, 100, 424, 224], 0.80),  # Buildings
        (0.62, "city_street.jpg", 8, [100, 350, 224, 474], 0.75),  # Street level
        (0.55, "city_street.jpg", 11, [400, 200, 524, 324], 0.60),  # Sky area
    ]

    mock_results = [
        MockResult(
            score,
            {
                "image_name": image_name,
                "image_path": f"/path/to/{image_name}",
                "patch_id": patch_id,
                "coordinates": coordinates,
                "attention_score": attention_score,
                "scale": 1.0,
            },
        )
        for score, image_name, patch_id, coordinates, attention_score in patch_specs
    ]

    # Test different aggregation methods
    methods = ["maxsim", "voting", "weighted", "mean"]

    for method in methods:
        print(f"\n{'=' * 20} {method.upper()} AGGREGATION {'=' * 20}")

        aggregator = MultiVectorAggregator(
            aggregation_method=method, spatial_clustering=True, cluster_distance_threshold=100.0
        )

        aggregated = aggregator.aggregate_results(mock_results, top_k=5)
        aggregator.print_aggregated_results(aggregated)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
demo_aggregation()
|
||||
demo_aggregation()
|
||||
|
||||
@@ -6,22 +6,24 @@ Complete example showing how to build and search with OpenAI embeddings using HN
|
||||
"""
|
||||
|
||||
import os
|
||||
import dotenv
|
||||
from pathlib import Path
|
||||
|
||||
import dotenv
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
# Load environment variables
|
||||
dotenv.load_dotenv()
|
||||
|
||||
|
||||
def main():
|
||||
# Check if OpenAI API key is available
|
||||
api_key = os.getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
print("ERROR: OPENAI_API_KEY environment variable not set")
|
||||
return False
|
||||
|
||||
|
||||
print(f"✅ OpenAI API key found: {api_key[:10]}...")
|
||||
|
||||
|
||||
# Sample texts
|
||||
sample_texts = [
|
||||
"Machine learning is a powerful technology that enables computers to learn from data.",
|
||||
@@ -33,15 +35,15 @@ def main():
|
||||
"Artificial intelligence aims to create machines that can perform human-like tasks.",
|
||||
"Python is a popular programming language used extensively in data science and AI.",
|
||||
"Neural networks are inspired by the structure and function of the human brain.",
|
||||
"Big data refers to extremely large datasets that require special tools to process."
|
||||
"Big data refers to extremely large datasets that require special tools to process.",
|
||||
]
|
||||
|
||||
|
||||
INDEX_DIR = Path("./simple_openai_test_index")
|
||||
INDEX_PATH = str(INDEX_DIR / "simple_test.leann")
|
||||
|
||||
print(f"\n=== Building Index with OpenAI Embeddings ===")
|
||||
|
||||
print("\n=== Building Index with OpenAI Embeddings ===")
|
||||
print(f"Index path: {INDEX_PATH}")
|
||||
|
||||
|
||||
try:
|
||||
# Use proper configuration for OpenAI embeddings
|
||||
builder = LeannBuilder(
|
||||
@@ -49,60 +51,63 @@ def main():
|
||||
embedding_model="text-embedding-3-small",
|
||||
embedding_mode="openai",
|
||||
# HNSW settings for OpenAI embeddings
|
||||
M=16, # Smaller graph degree
|
||||
efConstruction=64, # Smaller construction complexity
|
||||
is_compact=True, # Enable compact storage for recompute
|
||||
is_recompute=True, # MUST enable for OpenAI embeddings
|
||||
M=16, # Smaller graph degree
|
||||
efConstruction=64, # Smaller construction complexity
|
||||
is_compact=True, # Enable compact storage for recompute
|
||||
is_recompute=True, # MUST enable for OpenAI embeddings
|
||||
num_threads=1,
|
||||
)
|
||||
|
||||
|
||||
print(f"Adding {len(sample_texts)} texts to the index...")
|
||||
for i, text in enumerate(sample_texts):
|
||||
metadata = {"id": f"doc_{i}", "topic": "AI"}
|
||||
builder.add_text(text, metadata)
|
||||
|
||||
|
||||
print("Building index...")
|
||||
builder.build_index(INDEX_PATH)
|
||||
print(f"✅ Index built successfully!")
|
||||
|
||||
print("✅ Index built successfully!")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error building index: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
||||
print(f"\n=== Testing Search ===")
|
||||
|
||||
|
||||
print("\n=== Testing Search ===")
|
||||
|
||||
try:
|
||||
searcher = LeannSearcher(INDEX_PATH)
|
||||
|
||||
|
||||
test_queries = [
|
||||
"What is machine learning?",
|
||||
"How do neural networks work?",
|
||||
"Programming languages for data science"
|
||||
"Programming languages for data science",
|
||||
]
|
||||
|
||||
|
||||
for query in test_queries:
|
||||
print(f"\n🔍 Query: '{query}'")
|
||||
results = searcher.search(query, top_k=3)
|
||||
|
||||
|
||||
print(f" Found {len(results)} results:")
|
||||
for i, result in enumerate(results):
|
||||
print(f" {i+1}. Score: {result.score:.4f}")
|
||||
print(f" {i + 1}. Score: {result.score:.4f}")
|
||||
print(f" Text: {result.text[:80]}...")
|
||||
|
||||
print(f"\n✅ Search test completed successfully!")
|
||||
|
||||
print("\n✅ Search test completed successfully!")
|
||||
return True
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error during search: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
success = main()
|
||||
if success:
|
||||
print(f"\n🎉 Simple OpenAI index test completed successfully!")
|
||||
print("\n🎉 Simple OpenAI index test completed successfully!")
|
||||
else:
|
||||
print(f"\n💥 Simple OpenAI index test failed!")
|
||||
print("\n💥 Simple OpenAI index test failed!")
|
||||
|
||||
@@ -1,18 +1,23 @@
|
||||
import asyncio
|
||||
from leann.api import LeannChat
|
||||
from pathlib import Path
|
||||
|
||||
from leann.api import LeannChat
|
||||
|
||||
INDEX_DIR = Path("./test_pdf_index_huawei")
|
||||
INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")
|
||||
|
||||
|
||||
async def main():
    """Open a chat session on the prebuilt index and print the answer to one query."""
    print("\n[PHASE 2] Starting Leann chat session...")
    chat = LeannChat(index_path=INDEX_PATH)

    # Alternative query kept for reference (translated): "What is the Pangu
    # large model, what dark sides came up during its development, and in which
    # city are task orders usually issued?"
    query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?"

    # NOTE(review): the keyword is spelled `recompute_beighbor_embeddings` here;
    # presumably it has to match the spelling LeannChat.ask accepts — confirm
    # against the API before renaming it.
    response = chat.ask(
        query, top_k=20, recompute_beighbor_embeddings=True, complexity=32, beam_width=1
    )
    print(f"\n[PHASE 2] Response: {response}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -5,24 +5,21 @@ It correctly compares results by fetching the text content for both the new sear
|
||||
results and the golden standard results, making the comparison robust to ID changes.
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import numpy as np
|
||||
from typing import List
|
||||
|
||||
from leann.api import LeannSearcher, LeannBuilder
|
||||
import numpy as np
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
|
||||
def download_data_if_needed(data_root: Path, download_embeddings: bool = False):
|
||||
"""Checks if the data directory exists, and if not, downloads it from HF Hub."""
|
||||
if not data_root.exists():
|
||||
print(f"Data directory '{data_root}' not found.")
|
||||
print(
|
||||
"Downloading evaluation data from Hugging Face Hub... (this may take a moment)"
|
||||
)
|
||||
print("Downloading evaluation data from Hugging Face Hub... (this may take a moment)")
|
||||
try:
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
@@ -63,7 +60,7 @@ def download_data_if_needed(data_root: Path, download_embeddings: bool = False):
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def download_embeddings_if_needed(data_root: Path, dataset_type: str = None):
|
||||
def download_embeddings_if_needed(data_root: Path, dataset_type: str | None = None):
|
||||
"""Download embeddings files specifically."""
|
||||
embeddings_dir = data_root / "embeddings"
|
||||
|
||||
@@ -101,7 +98,7 @@ def download_embeddings_if_needed(data_root: Path, dataset_type: str = None):
|
||||
|
||||
|
||||
# --- Helper Function to get Golden Passages ---
|
||||
def get_golden_texts(searcher: LeannSearcher, golden_ids: List[int]) -> set:
|
||||
def get_golden_texts(searcher: LeannSearcher, golden_ids: list[int]) -> set:
|
||||
"""
|
||||
Retrieves the text for golden passage IDs directly from the LeannSearcher's
|
||||
passage manager.
|
||||
@@ -113,24 +110,20 @@ def get_golden_texts(searcher: LeannSearcher, golden_ids: List[int]) -> set:
|
||||
passage_data = searcher.passage_manager.get_passage(str(gid))
|
||||
golden_texts.add(passage_data["text"])
|
||||
except KeyError:
|
||||
print(
|
||||
f"Warning: Golden passage ID '{gid}' not found in the index's passage data."
|
||||
)
|
||||
print(f"Warning: Golden passage ID '{gid}' not found in the index's passage data.")
|
||||
return golden_texts
|
||||
|
||||
|
||||
def load_queries(file_path: Path) -> list[str]:
    """Read the ``query`` field of every record in a JSON Lines file.

    Args:
        file_path: Path to a UTF-8 file holding one JSON object per line.

    Returns:
        The ``"query"`` values in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"query"`` key.
    """
    with open(file_path, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at end of file) carry no record
        # and would otherwise make json.loads fail.
        return [json.loads(line)["query"] for line in f if line.strip()]
|
||||
|
||||
|
||||
def build_index_from_embeddings(
|
||||
embeddings_file: str, output_path: str, backend: str = "hnsw"
|
||||
):
|
||||
def build_index_from_embeddings(embeddings_file: str, output_path: str, backend: str = "hnsw"):
|
||||
"""
|
||||
Build a LEANN index from pre-computed embeddings.
|
||||
|
||||
@@ -173,9 +166,7 @@ def build_index_from_embeddings(
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run recall evaluation on a LEANN index."
|
||||
)
|
||||
parser = argparse.ArgumentParser(description="Run recall evaluation on a LEANN index.")
|
||||
parser.add_argument(
|
||||
"index_path",
|
||||
type=str,
|
||||
@@ -202,9 +193,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--num-queries", type=int, default=10, help="Number of queries to evaluate."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--top-k", type=int, default=3, help="The 'k' value for recall@k."
|
||||
)
|
||||
parser.add_argument("--top-k", type=int, default=3, help="The 'k' value for recall@k.")
|
||||
parser.add_argument(
|
||||
"--ef-search", type=int, default=120, help="The 'efSearch' parameter for HNSW."
|
||||
)
|
||||
@@ -219,9 +208,7 @@ def main():
|
||||
# Download data based on mode
|
||||
if args.mode == "build":
|
||||
# For building mode, we need embeddings
|
||||
download_data_if_needed(
|
||||
data_root, download_embeddings=False
|
||||
) # Basic data first
|
||||
download_data_if_needed(data_root, download_embeddings=False) # Basic data first
|
||||
|
||||
# Auto-detect dataset type and download embeddings
|
||||
if args.embeddings_file:
|
||||
@@ -262,9 +249,7 @@ def main():
|
||||
print(f"Index built successfully: {built_index_path}")
|
||||
|
||||
# Ask if user wants to run evaluation
|
||||
eval_response = (
|
||||
input("Run evaluation on the built index? (y/n): ").strip().lower()
|
||||
)
|
||||
eval_response = input("Run evaluation on the built index? (y/n): ").strip().lower()
|
||||
if eval_response != "y":
|
||||
print("Index building complete. Exiting.")
|
||||
return
|
||||
@@ -293,12 +278,8 @@ def main():
|
||||
break
|
||||
|
||||
if not args.index_path:
|
||||
print(
|
||||
"No indices found. The data download should have included pre-built indices."
|
||||
)
|
||||
print(
|
||||
"Please check the data/indices/ directory or provide --index-path manually."
|
||||
)
|
||||
print("No indices found. The data download should have included pre-built indices.")
|
||||
print("Please check the data/indices/ directory or provide --index-path manually.")
|
||||
sys.exit(1)
|
||||
|
||||
# Detect dataset type from index path to select the correct ground truth
|
||||
@@ -310,14 +291,10 @@ def main():
|
||||
else:
|
||||
# Fallback: try to infer from the index directory name
|
||||
dataset_type = Path(args.index_path).name
|
||||
print(
|
||||
f"WARNING: Could not detect dataset type from path, inferred '{dataset_type}'."
|
||||
)
|
||||
print(f"WARNING: Could not detect dataset type from path, inferred '{dataset_type}'.")
|
||||
|
||||
queries_file = data_root / "queries" / "nq_open.jsonl"
|
||||
golden_results_file = (
|
||||
data_root / "ground_truth" / dataset_type / "flat_results_nq_k3.json"
|
||||
)
|
||||
golden_results_file = data_root / "ground_truth" / dataset_type / "flat_results_nq_k3.json"
|
||||
|
||||
print(f"INFO: Detected dataset type: {dataset_type}")
|
||||
print(f"INFO: Using queries file: {queries_file}")
|
||||
@@ -327,7 +304,7 @@ def main():
|
||||
searcher = LeannSearcher(args.index_path)
|
||||
queries = load_queries(queries_file)
|
||||
|
||||
with open(golden_results_file, "r") as f:
|
||||
with open(golden_results_file) as f:
|
||||
golden_results_data = json.load(f)
|
||||
|
||||
num_eval_queries = min(args.num_queries, len(queries))
|
||||
@@ -339,9 +316,7 @@ def main():
|
||||
|
||||
for i in range(num_eval_queries):
|
||||
start_time = time.time()
|
||||
new_results = searcher.search(
|
||||
queries[i], top_k=args.top_k, ef=args.ef_search
|
||||
)
|
||||
new_results = searcher.search(queries[i], top_k=args.top_k, ef=args.ef_search)
|
||||
search_times.append(time.time() - start_time)
|
||||
|
||||
# Correct Recall Calculation: Based on TEXT content
|
||||
|
||||
@@ -4,18 +4,25 @@ Run: uv run python examples/simple_demo.py
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from leann import LeannBuilder, LeannSearcher, LeannChat
|
||||
|
||||
from leann import LeannBuilder, LeannChat, LeannSearcher
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Simple demo of Leann with selectable embedding models.")
|
||||
parser.add_argument("--embedding_model", type=str, default="sentence-transformers/all-mpnet-base-v2",
|
||||
help="The embedding model to use, e.g., 'sentence-transformers/all-mpnet-base-v2' or 'text-embedding-ada-002'.")
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Simple demo of Leann with selectable embedding models."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--embedding_model",
|
||||
type=str,
|
||||
default="sentence-transformers/all-mpnet-base-v2",
|
||||
help="The embedding model to use, e.g., 'sentence-transformers/all-mpnet-base-v2' or 'text-embedding-ada-002'.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"=== Leann Simple Demo with {args.embedding_model} ===")
|
||||
print()
|
||||
|
||||
|
||||
# Sample knowledge base
|
||||
chunks = [
|
||||
"Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
|
||||
@@ -27,7 +34,7 @@ def main():
|
||||
"Big data refers to extremely large datasets that require special tools and techniques to process.",
|
||||
"Cloud computing provides on-demand access to computing resources over the internet.",
|
||||
]
|
||||
|
||||
|
||||
print("1. Building index (no embeddings stored)...")
|
||||
builder = LeannBuilder(
|
||||
embedding_model=args.embedding_model,
|
||||
@@ -37,45 +44,45 @@ def main():
|
||||
builder.add_text(chunk)
|
||||
builder.build_index("demo_knowledge.leann")
|
||||
print()
|
||||
|
||||
|
||||
print("2. Searching with real-time embeddings...")
|
||||
searcher = LeannSearcher("demo_knowledge.leann")
|
||||
|
||||
|
||||
queries = [
|
||||
"What is machine learning?",
|
||||
"How does neural network work?",
|
||||
"How does neural network work?",
|
||||
"Tell me about data processing",
|
||||
]
|
||||
|
||||
|
||||
for query in queries:
|
||||
print(f"Query: {query}")
|
||||
results = searcher.search(query, top_k=2)
|
||||
|
||||
|
||||
for i, result in enumerate(results, 1):
|
||||
print(f" {i}. Score: {result.score:.3f}")
|
||||
print(f" Text: {result.text[:100]}...")
|
||||
print()
|
||||
|
||||
|
||||
print("3. Interactive chat demo:")
|
||||
print(" (Note: Requires OpenAI API key for real responses)")
|
||||
|
||||
|
||||
chat = LeannChat("demo_knowledge.leann")
|
||||
|
||||
|
||||
# Demo questions
|
||||
demo_questions: list[str] = [
|
||||
"What is the difference between machine learning and deep learning?",
|
||||
"How is data science related to big data?",
|
||||
]
|
||||
|
||||
|
||||
for question in demo_questions:
|
||||
print(f" Q: {question}")
|
||||
response = chat.ask(question)
|
||||
print(f" A: {response}")
|
||||
print()
|
||||
|
||||
|
||||
print("Demo completed! Try running:")
|
||||
print(" uv run python examples/document_search.py")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
import os
|
||||
import asyncio
|
||||
import dotenv
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Any, Optional
|
||||
from leann.api import LeannBuilder, LeannSearcher, LeannChat
|
||||
|
||||
import dotenv
|
||||
from leann.api import LeannBuilder, LeannChat
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
import requests
|
||||
import time
|
||||
|
||||
dotenv.load_dotenv()
|
||||
|
||||
@@ -16,7 +14,7 @@ DEFAULT_WECHAT_EXPORT_DIR = "./wechat_export_direct"
|
||||
|
||||
|
||||
def create_leann_index_from_multiple_wechat_exports(
|
||||
export_dirs: List[Path],
|
||||
export_dirs: list[Path],
|
||||
index_path: str = "wechat_history_index.leann",
|
||||
max_count: int = -1,
|
||||
):
|
||||
@@ -38,15 +36,13 @@ def create_leann_index_from_multiple_wechat_exports(
|
||||
INDEX_DIR = Path(index_path).parent
|
||||
|
||||
if not INDEX_DIR.exists():
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
all_documents = []
|
||||
total_processed = 0
|
||||
|
||||
# Process each WeChat export directory
|
||||
for i, export_dir in enumerate(export_dirs):
|
||||
print(
|
||||
f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}"
|
||||
)
|
||||
print(f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}")
|
||||
|
||||
try:
|
||||
documents = reader.load_data(
|
||||
@@ -86,7 +82,12 @@ def create_leann_index_from_multiple_wechat_exports(
|
||||
# Split the document into chunks
|
||||
nodes = text_splitter.get_nodes_from_documents([doc])
|
||||
for node in nodes:
|
||||
text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content()
|
||||
text = (
|
||||
"[Contact] means the message is from: "
|
||||
+ doc.metadata["contact_name"]
|
||||
+ "\n"
|
||||
+ node.get_content()
|
||||
)
|
||||
all_texts.append(text)
|
||||
|
||||
print(
|
||||
@@ -94,12 +95,12 @@ def create_leann_index_from_multiple_wechat_exports(
|
||||
)
|
||||
|
||||
# Create LEANN index directory
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"--- Building new LEANN index ---")
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Use HNSW backend for better macOS compatibility
|
||||
builder = LeannBuilder(
|
||||
@@ -125,7 +126,7 @@ def create_leann_index_from_multiple_wechat_exports(
|
||||
|
||||
|
||||
def create_leann_index(
|
||||
export_dir: str = None,
|
||||
export_dir: str | None = None,
|
||||
index_path: str = "wechat_history_index.leann",
|
||||
max_count: int = 1000,
|
||||
):
|
||||
@@ -141,12 +142,12 @@ def create_leann_index(
|
||||
INDEX_DIR = Path(index_path).parent
|
||||
|
||||
if not INDEX_DIR.exists():
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"--- Building new LEANN index ---")
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Load documents using WeChatHistoryReader from history_data
|
||||
from history_data.wechat_history import WeChatHistoryReader
|
||||
@@ -179,12 +180,12 @@ def create_leann_index(
|
||||
print(f"Created {len(all_texts)} text chunks from {len(documents)} documents")
|
||||
|
||||
# Create LEANN index directory
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
print("--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print(f"--- Building new LEANN index ---")
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Use HNSW backend for better macOS compatibility
|
||||
builder = LeannBuilder(
|
||||
@@ -217,7 +218,7 @@ async def query_leann_index(index_path: str, query: str):
|
||||
index_path: Path to the LEANN index
|
||||
query: The query string
|
||||
"""
|
||||
print(f"\n[PHASE 2] Starting Leann chat session...")
|
||||
print("\n[PHASE 2] Starting Leann chat session...")
|
||||
chat = LeannChat(index_path=index_path)
|
||||
|
||||
print(f"You: {query}")
|
||||
@@ -307,7 +308,7 @@ async def main():
|
||||
else:
|
||||
# Example queries
|
||||
queries = [
|
||||
"我想买魔术师约翰逊的球衣,给我一些对应聊天记录?",
|
||||
"我想买魔术师约翰逊的球衣,给我一些对应聊天记录?",
|
||||
]
|
||||
|
||||
for query in queries:
|
||||
|
||||
Reference in New Issue
Block a user