* fix: auto-detect normalized embeddings and use cosine distance
  - Add automatic detection for normalized embedding models (OpenAI, Voyage AI, Cohere)
  - Automatically set distance_metric='cosine' for normalized embeddings
  - Add warnings when using non-optimal distance metrics
  - Implement manual L2 normalization in the HNSW backend (the custom Faiss build lacks normalize_L2)
  - Fix DiskANN zmq_port compatibility with the lazy loading strategy
  - Add documentation for the normalized embeddings feature
  This fixes the low-accuracy issue when using the OpenAI text-embedding-3-small model with the default MIPS metric.
* style: format
* feat: add OpenAI embeddings support to google_history_reader_leann.py
  - Add --embedding-model and --embedding-mode arguments
  - Support automatic detection of normalized embeddings
  - Works correctly with cosine distance for OpenAI embeddings
* feat: add --use-existing-index option to google_history_reader_leann.py
  - Allow using an existing index without rebuilding
  - Useful for testing pre-built indices
* fix: improve OpenAI embeddings handling in the HNSW backend
* fix: improve macOS C++ compatibility and add CI tests
* refactor: improve test structure and fix main_cli example
  - Move pytest configuration from pytest.ini to pyproject.toml
  - Remove the unnecessary run_tests.py script (use test extras instead)
  - Fix main_cli_example.py to properly use command-line arguments for LLM config
  - Add test_readme_examples.py to test code examples from the README
  - Refactor tests to use pytest fixtures and parametrization
  - Update test documentation to reflect the new structure
  - Set proper environment variables in CI for test execution
* fix: add --distance-metric support to the DiskANN embedding server and remove obsolete macOS ABI test markers
  - Add a --distance-metric parameter to diskann_embedding_server.py for consistency with other backends
  - Remove pytest.skip and pytest.xfail markers for macOS C++ ABI issues, as they have been fixed
  - Fix test assertions to handle SearchResult objects correctly
  - All tests now pass on macOS with the C++ ABI compatibility fixes
* chore: update lock file with test dependencies
* docs: remove obsolete C++ ABI compatibility warnings
  - Remove outdated macOS C++ compatibility warnings from the README
  - Simplify the CI workflow by removing macOS-specific failure handling
  - All tests now pass consistently on macOS after the ABI fixes
* fix: update macOS deployment target for DiskANN to 13.3
  - DiskANN uses the sgesdd_ LAPACK function, which is only available on macOS 13.3+
  - Update MACOSX_DEPLOYMENT_TARGET from 11.0 to 13.3 for DiskANN builds
  - This fixes the compilation error on GitHub Actions macOS runners
* fix: align Python version requirements to 3.9
  - Update the root project to support Python 3.9, matching the subpackages
  - Restore macOS Python 3.9 support in CI
  - This fixes the CI failure for Python 3.9 environments
* fix: handle MPS memory issues in CI tests
  - Use the smaller MiniLM-L6-v2 model (384 dimensions) for README tests in CI
  - Skip other memory-intensive tests in the CI environment
  - Add minimal CI tests that don't require model loading
  - Set the CI environment variable and disable MPS fallback
  - Ensure README examples always run correctly in CI
* fix: remove Python 3.10+ dependencies for compatibility
  - Comment out llama-index-readers-docling and llama-index-node-parser-docling
  - These packages require Python >= 3.10 and were causing CI failures on Python 3.9
  - Regenerate the uv.lock file to resolve dependency conflicts
* fix: use a virtual environment in CI instead of system packages
  - uv-managed Python environments don't allow --system installs
  - Create and activate a virtual environment before installing packages
  - Update all CI steps to use the virtual environment
* add some env vars in CI
* fix: use --find-links to install platform-specific wheels
  - Let uv automatically select the correct wheel for the current platform
  - Fixes an error when trying to install macOS wheels on Linux
  - Simplifies the installation logic
* fix: disable OpenMP parallelism in CI to avoid libomp crashes
  - Set OMP_NUM_THREADS=1 to avoid OpenMP thread-synchronization issues
  - Set MKL_NUM_THREADS=1 for single-threaded MKL operations
  - This prevents segfaults in LayerNorm on macOS CI runners
  - Addresses the libomp compatibility issues with PyTorch on Apple Silicon
* skip several macOS tests because of a strange issue on CI

---------

Co-authored-by: yichuan520030910320 <yichuan_wang@berkeley.edu>
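The script below exercises this behavior end to end. As a rough sketch of the idea only (not LEANN's actual implementation; the model list and helper names here are assumptions), auto-detecting a normalized-embedding model and falling back to manual L2 normalization could look like this:

import numpy as np

# Illustrative set of models documented to return unit-length embeddings.
NORMALIZED_EMBEDDING_MODELS = {
    "text-embedding-3-small",
    "text-embedding-3-large",
    "voyage-3",
    "embed-english-v3.0",
}

def pick_distance_metric(embedding_model: str) -> str:
    """Prefer cosine distance for models known to emit normalized vectors."""
    return "cosine" if embedding_model in NORMALIZED_EMBEDDING_MODELS else "mips"

def l2_normalize(vectors: np.ndarray) -> np.ndarray:
    """Manual L2 normalization, standing in for faiss.normalize_L2."""
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / np.clip(norms, 1e-12, None)

The full example script follows.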
import argparse
import asyncio
from pathlib import Path

import dotenv
from leann.api import LeannBuilder, LeannChat
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter

dotenv.load_dotenv()


async def main(args):
    INDEX_DIR = Path(args.index_dir)
    INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")

    if not INDEX_DIR.exists():
        print("--- Index directory not found, building new index ---")
        print("\n[PHASE 1] Building Leann index...")

        node_parser = SentenceSplitter(
            chunk_size=256, chunk_overlap=128, separator=" ", paragraph_separator="\n\n"
        )

        print("Loading documents...")
        documents = SimpleDirectoryReader(
            args.data_dir,
            recursive=True,
            encoding="utf-8",
            required_exts=[".pdf", ".txt", ".md"],
        ).load_data(show_progress=True)
        print("Documents loaded.")

        all_texts = []
        for doc in documents:
            nodes = node_parser.get_nodes_from_documents([doc])
            if nodes:
                all_texts.extend(node.get_content() for node in nodes)
        print(f"Loaded {len(all_texts)} text chunks from documents.")

        # LeannBuilder automatically detects normalized embeddings and sets an appropriate distance metric
        print(f"Using {args.embedding_model} with {args.embedding_mode} mode")

        # Use the HNSW backend for better macOS compatibility
        builder = LeannBuilder(
            backend_name="hnsw",
            embedding_model=args.embedding_model,
            embedding_mode=args.embedding_mode,
            # distance_metric is set automatically based on the embedding model
            graph_degree=32,
            complexity=64,
            is_compact=True,
            is_recompute=True,
            num_threads=1,  # Force single-threaded mode
        )

        for chunk_text in all_texts:
            builder.add_text(chunk_text)

        builder.build_index(INDEX_PATH)
        print(f"\nLeann index built at {INDEX_PATH}!")
    else:
        print(f"--- Using existing index at {INDEX_DIR} ---")

    print("\n[PHASE 2] Starting Leann chat session...")

    # Build llm_config based on command-line arguments
    if args.llm == "simulated":
        llm_config = {"type": "simulated"}
    elif args.llm == "ollama":
        llm_config = {"type": "ollama", "model": args.model, "host": args.host}
    elif args.llm == "hf":
        llm_config = {"type": "hf", "model": args.model}
    elif args.llm == "openai":
        llm_config = {"type": "openai", "model": args.model}
    else:
        raise ValueError(f"Unknown LLM type: {args.llm}")

    print(f"Using LLM: {args.llm} with model: {args.model if args.llm != 'simulated' else 'N/A'}")

    chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)
    # Alternative query (translated from Chinese):
    # query = "What is the Pangu large model, what dark sides were encountered during Pangu's development, and in which city are task orders usually issued?"
    query = args.query

    print(f"You: {query}")
    chat_response = chat.ask(query, top_k=20, recompute_embeddings=True, complexity=32)
    print(f"Leann chat response: \033[36m{chat_response}\033[0m")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run Leann Chat with various LLM backends.")
    parser.add_argument(
        "--llm",
        type=str,
        default="hf",
        choices=["simulated", "ollama", "hf", "openai"],
        help="The LLM backend to use.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="Qwen/Qwen3-0.6B",
        help="The model name to use (e.g., 'llama3:8b' for ollama, 'deepseek-ai/deepseek-llm-7b-chat' for hf, 'gpt-4o' for openai).",
    )
    parser.add_argument(
        "--embedding-model",
        type=str,
        default="facebook/contriever",
        help="The embedding model to use (e.g., 'facebook/contriever', 'text-embedding-3-small').",
    )
    parser.add_argument(
        "--embedding-mode",
        type=str,
        default="sentence-transformers",
        choices=["sentence-transformers", "openai", "mlx"],
        help="The embedding backend mode.",
    )
    parser.add_argument(
        "--host",
        type=str,
        default="http://localhost:11434",
        help="The host for the Ollama API.",
    )
    parser.add_argument(
        "--index-dir",
        type=str,
        default="./test_doc_files",
        help="Directory where the Leann index will be stored.",
    )
    parser.add_argument(
        "--data-dir",
        type=str,
        default="examples/data",
        help="Directory containing documents to index (PDF, TXT, MD files).",
    )
    parser.add_argument(
        "--query",
        type=str,
        default="Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead, and what does DLPM explore to achieve the fairness-efficiency trade-off?",
        help="The query to ask the Leann chat system.",
    )
    args = parser.parse_args()

    asyncio.run(main(args))
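
# Usage sketch (illustrative; assumes this script is saved as main_cli_example.py and
# run from the repository root, with any required API keys available via .env):
#   python main_cli_example.py --llm hf --model Qwen/Qwen3-0.6B
#   python main_cli_example.py --llm openai --model gpt-4o \
#       --embedding-model text-embedding-3-small --embedding-mode openai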