* fix: auto-detect normalized embeddings and use cosine distance - Add automatic detection for normalized embedding models (OpenAI, Voyage AI, Cohere) - Automatically set distance_metric='cosine' for normalized embeddings - Add warnings when using non-optimal distance metrics - Implement manual L2 normalization in HNSW backend (custom Faiss build lacks normalize_L2) - Fix DiskANN zmq_port compatibility with lazy loading strategy - Add documentation for normalized embeddings feature This fixes the low accuracy issue when using OpenAI text-embedding-3-small model with default MIPS metric. * style: format * feat: add OpenAI embeddings support to google_history_reader_leann.py - Add --embedding-model and --embedding-mode arguments - Support automatic detection of normalized embeddings - Works correctly with cosine distance for OpenAI embeddings * feat: add --use-existing-index option to google_history_reader_leann.py - Allow using existing index without rebuilding - Useful for testing pre-built indices * fix: Improve OpenAI embeddings handling in HNSW backend * fix: improve macOS C++ compatibility and add CI tests * refactor: improve test structure and fix main_cli example - Move pytest configuration from pytest.ini to pyproject.toml - Remove unnecessary run_tests.py script (use test extras instead) - Fix main_cli_example.py to properly use command line arguments for LLM config - Add test_readme_examples.py to test code examples from README - Refactor tests to use pytest fixtures and parametrization - Update test documentation to reflect new structure - Set proper environment variables in CI for test execution * fix: add --distance-metric support to DiskANN embedding server and remove obsolete macOS ABI test markers - Add --distance-metric parameter to diskann_embedding_server.py for consistency with other backends - Remove pytest.skip and pytest.xfail markers for macOS C++ ABI issues as they have been fixed - Fix test assertions to handle SearchResult objects 
correctly - All tests now pass on macOS with the C++ ABI compatibility fixes * chore: update lock file with test dependencies * docs: remove obsolete C++ ABI compatibility warnings - Remove outdated macOS C++ compatibility warnings from README - Simplify CI workflow by removing macOS-specific failure handling - All tests now pass consistently on macOS after ABI fixes * fix: update macOS deployment target for DiskANN to 13.3 - DiskANN uses sgesdd_ LAPACK function which is only available on macOS 13.3+ - Update MACOSX_DEPLOYMENT_TARGET from 11.0 to 13.3 for DiskANN builds - This fixes the compilation error on GitHub Actions macOS runners * fix: align Python version requirements to 3.9 - Update root project to support Python 3.9, matching subpackages - Restore macOS Python 3.9 support in CI - This fixes the CI failure for Python 3.9 environments * fix: handle MPS memory issues in CI tests - Use smaller MiniLM-L6-v2 model (384 dimensions) for README tests in CI - Skip other memory-intensive tests in CI environment - Add minimal CI tests that don't require model loading - Set CI environment variable and disable MPS fallback - Ensure README examples always run correctly in CI * fix: remove Python 3.10+ dependencies for compatibility - Comment out llama-index-readers-docling and llama-index-node-parser-docling - These packages require Python >= 3.10 and were causing CI failures on Python 3.9 - Regenerate uv.lock file to resolve dependency conflicts * fix: use virtual environment in CI instead of system packages - uv-managed Python environments don't allow --system installs - Create and activate virtual environment before installing packages - Update all CI steps to use the virtual environment * add some env in ci * fix: use --find-links to install platform-specific wheels - Let uv automatically select the correct wheel for the current platform - Fixes error when trying to install macOS wheels on Linux - Simplifies the installation logic * fix: disable OpenMP parallelism 
in CI to avoid libomp crashes - Set OMP_NUM_THREADS=1 to avoid OpenMP thread synchronization issues - Set MKL_NUM_THREADS=1 for single-threaded MKL operations - This prevents segfaults in LayerNorm on macOS CI runners - Addresses the libomp compatibility issues with PyTorch on Apple Silicon * skip several macOS tests because of a strange issue on CI --------- Co-authored-by: yichuan520030910320 <yichuan_wang@berkeley.edu>
157 lines
3.9 KiB
TOML
157 lines
3.9 KiB
TOML
# Build backend declaration (PEP 517/518): setuptools drives the build.
# NOTE(review): cmake is required at build time, presumably for the native
# backend packages in this workspace; confirm it is needed at the root.
[build-system]
requires = ["setuptools>=61.0", "cmake>=3.24"]
build-backend = "setuptools.build_meta"
# Workspace root package metadata and its runtime dependencies.
[project]
name = "leann-workspace"
version = "0.1.0"
# Ruff's `target-version` under [tool.ruff] should track this lower bound.
requires-python = ">=3.9"

dependencies = [
    # Workspace packages; resolved from local paths via [tool.uv.sources].
    "leann-core",
    "leann-backend-hnsw",
    "numpy>=1.26.0",
    "torch",
    "tqdm",
    "flask",
    "flask_compress",
    "datasets>=2.15.0",
    "evaluate",
    "colorama",
    "boto3",
    # NOTE(review): exact pin; the reason is not recorded in this file.
    "protobuf==4.25.3",
    "sglang",
    "ollama",
    "requests>=2.25.0",
    "sentence-transformers>=2.2.0",
    "openai>=1.0.0",
    # PDF parsing dependencies - essential for document processing
    "PyPDF2>=3.0.0",
    "pdfplumber>=0.11.0",
    "pymupdf>=1.26.0",
    "pypdfium2>=4.30.0",
    # LlamaIndex core and readers - updated versions
    "llama-index>=0.12.44",
    "llama-index-readers-file>=0.4.0", # Essential for PDF parsing
    # "llama-index-readers-docling", # Requires Python >= 3.10
    # "llama-index-node-parser-docling", # Requires Python >= 3.10
    "llama-index-vector-stores-faiss>=0.4.0",
    "llama-index-embeddings-huggingface>=0.5.5",
    # Other dependencies
    # NOTE(review): exact pin; the reason is not recorded in this file.
    "ipykernel==6.29.5",
    "msgpack>=1.1.1",
    # MLX packages are only requested on macOS (environment marker).
    "mlx>=0.26.3; sys_platform == 'darwin'",
    "mlx-lm>=0.26.0; sys_platform == 'darwin'",
    "psutil>=5.8.0",
]
# Optional extras: install with e.g. `pip install ".[test]"`.
[project.optional-dependencies]
# Developer tooling (linting, formatting, coverage, plotting).
dev = [
    "pytest>=7.0",
    "pytest-cov>=4.0",
    "pytest-xdist>=3.0", # For parallel test execution
    "black>=23.0",
    "ruff>=0.1.0",
    "matplotlib",
    "huggingface-hub>=0.20.0",
    "pre-commit>=3.5.0",
]

# Everything needed to run the test suite configured in
# [tool.pytest.ini_options].
test = [
    "pytest>=7.0",
    # Provides the `timeout` ini option used by the pytest configuration.
    "pytest-timeout>=2.0",
    # Provides the `env` ini option used by the pytest configuration; without
    # this plugin pytest treats `env` as an unknown option and the variables
    # are never exported to the test process.
    "pytest-env>=1.0.0",
    "llama-index-core>=0.12.0",
    "llama-index-readers-file>=0.4.0",
    "python-dotenv>=1.0.0",
    "sentence-transformers>=2.2.0",
]

# Optional DiskANN backend (local workspace package, see [tool.uv.sources]).
diskann = [
    "leann-backend-diskann",
]

# Add a new optional dependency group for document processing
documents = [
    "beautifulsoup4>=4.13.0", # For HTML parsing
    "python-docx>=0.8.11", # For Word documents
    "openpyxl>=3.1.0", # For Excel files
    "pandas>=2.2.0", # For data processing
]
# The workspace root declares no Python modules of its own; the empty list
# stops setuptools from auto-discovering top-level modules.
[tool.setuptools]
py-modules = []
# Resolve the workspace packages from local checkouts (editable installs)
# instead of a package index.
[tool.uv.sources]
leann-core = { path = "packages/leann-core", editable = true }
leann-backend-diskann = { path = "packages/leann-backend-diskann", editable = true }
leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true }
# Ruff: global settings shared by the linter and the formatter.
[tool.ruff]
# Must match the lowest interpreter allowed by `requires-python` (>=3.9).
# The pyupgrade ("UP") rules are enabled, so a newer target would let ruff
# rewrite code into syntax that Python 3.9 cannot run.
target-version = "py39"
line-length = 100
# Paths skipped in addition to ruff's built-in excludes.
extend-exclude = [
    "third_party",
    "*.egg-info",
    "__pycache__",
    ".git",
    ".venv",
]
# Ruff: enabled rule families and the individual rules switched off.
[tool.ruff.lint]
select = [
    "E", # pycodestyle errors
    "W", # pycodestyle warnings
    "F", # pyflakes
    "I", # isort
    "B", # flake8-bugbear
    "C4", # flake8-comprehensions
    "UP", # pyupgrade
    "N", # pep8-naming
    "RUF", # ruff-specific rules
]
ignore = [
    "E501", # line too long (handled by formatter)
    "B008", # do not perform function calls in argument defaults
    "B904", # raise without from
    "N812", # lowercase imported as non-lowercase
    "N806", # variable in function should be lowercase
    "RUF012", # mutable class attributes should be annotated with typing.ClassVar
]
# Ruff: per-path rule exemptions.
# E402 = module level import not at top of file.
[tool.ruff.lint.per-file-ignores]
"test/**/*.py" = ["E402"] # module level import not at top of file (common in tests)
# pytest's `testpaths` points at "tests", which the "test/**" glob above does
# not match; cover that directory as well.
"tests/**/*.py" = ["E402"] # module level import not at top of file (common in tests)
"examples/**/*.py" = ["E402"] # module level import not at top of file (common in examples)
# Ruff formatter settings.
[tool.ruff.format]
quote-style = "double"
indent-style = "space"
# false = a trailing comma keeps a collection expanded one item per line.
skip-magic-trailing-comma = false
line-ending = "auto"
# PEP 735 dependency groups (not published as package extras).
# NOTE(review): this `dev` group requires a newer ruff than the `dev` extra
# under [project.optional-dependencies] ("ruff>=0.1.0"); confirm which lower
# bound is intended.
[dependency-groups]
dev = [
    "ruff>=0.12.4",
]
# pytest configuration (replaces a standalone pytest.ini).
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
# Every marker used in the suite must be declared here because
# `--strict-markers` is passed below.
markers = [
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "openai: marks tests that require OpenAI API key",
]
# Per-test timeout in seconds. Read by the pytest-timeout plugin; pytest
# itself does not define this option.
timeout = 600
addopts = [
    "-v",
    "--tb=short",
    "--strict-markers",
    # Also hides pytest's "unknown config option" warning, so a missing plugin
    # for `timeout` or `env` goes unnoticed.
    "--disable-warnings",
]
# Environment variables exported before tests run. Read by the pytest-env
# plugin; pytest itself does not define this option.
# NOTE(review): confirm pytest-env is installed wherever the suite runs.
env = [
    "HF_HUB_DISABLE_SYMLINKS=1",
    "TOKENIZERS_PARALLELISM=false",
]