Compare commits

..

4 Commits

Author SHA1 Message Date
yichuan520030910320
00c44e3980 [cli] fix # 81 2025-09-01 16:53:09 -07:00
yichuan520030910320
e6a542bf4b [cli] better gitignore / better leann list 2025-09-01 16:42:11 -07:00
yichuan520030910320
7e84dae02e [chore] add slack to share use case 2025-08-30 00:32:13 -07:00
yichuan520030910320
2f05ed4535 chore(submodule): bump faiss to latest storage-efficient build 2025-08-23 18:29:11 -07:00
5 changed files with 71 additions and 88 deletions

View File

@@ -54,17 +54,6 @@ jobs:
python: '3.12' python: '3.12'
- os: ubuntu-22.04 - os: ubuntu-22.04
python: '3.13' python: '3.13'
# ARM64 Linux builds
- os: ubuntu-24.04-arm
python: '3.9'
- os: ubuntu-24.04-arm
python: '3.10'
- os: ubuntu-24.04-arm
python: '3.11'
- os: ubuntu-24.04-arm
python: '3.12'
- os: ubuntu-24.04-arm
python: '3.13'
- os: macos-14 - os: macos-14
python: '3.9' python: '3.9'
- os: macos-14 - os: macos-14
@@ -119,46 +108,13 @@ jobs:
pkg-config libabsl-dev libaio-dev libprotobuf-dev \ pkg-config libabsl-dev libaio-dev libprotobuf-dev \
patchelf patchelf
# Debug: Show system information # Install Intel MKL for DiskANN
echo "🔍 System Information:" wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh
echo "Architecture: $(uname -m)" sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
echo "OS: $(uname -a)" source /opt/intel/oneapi/setvars.sh
echo "CPU info: $(lscpu | head -5)" echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin" >> $GITHUB_ENV
# Install math library based on architecture echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/mkl/latest/lib/intel64" >> $GITHUB_ENV
ARCH=$(uname -m)
echo "🔍 Setting up math library for architecture: $ARCH"
if [[ "$ARCH" == "x86_64" ]]; then
# Install Intel MKL for DiskANN on x86_64
echo "📦 Installing Intel MKL for x86_64..."
wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh
sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
source /opt/intel/oneapi/setvars.sh
echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/mkl/latest/lib/intel64" >> $GITHUB_ENV
echo "✅ Intel MKL installed for x86_64"
# Debug: Check MKL installation
echo "🔍 MKL Installation Check:"
ls -la /opt/intel/oneapi/mkl/latest/ || echo "MKL directory not found"
ls -la /opt/intel/oneapi/mkl/latest/lib/ || echo "MKL lib directory not found"
elif [[ "$ARCH" == "aarch64" ]]; then
# Use OpenBLAS for ARM64 (MKL installer not compatible with ARM64)
echo "📦 Installing OpenBLAS for ARM64..."
sudo apt-get install -y libopenblas-dev liblapack-dev liblapacke-dev
echo "✅ OpenBLAS installed for ARM64"
# Debug: Check OpenBLAS installation
echo "🔍 OpenBLAS Installation Check:"
dpkg -l | grep openblas || echo "OpenBLAS package not found"
ls -la /usr/lib/aarch64-linux-gnu/openblas/ || echo "OpenBLAS directory not found"
fi
# Debug: Show final library paths
echo "🔍 Final LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
- name: Install system dependencies (macOS) - name: Install system dependencies (macOS)
if: runner.os == 'macOS' if: runner.os == 'macOS'

View File

@@ -49,28 +49,9 @@ set(BUILD_TESTING OFF CACHE BOOL "" FORCE)
set(FAISS_ENABLE_C_API OFF CACHE BOOL "" FORCE) set(FAISS_ENABLE_C_API OFF CACHE BOOL "" FORCE)
set(FAISS_OPT_LEVEL "generic" CACHE STRING "" FORCE) set(FAISS_OPT_LEVEL "generic" CACHE STRING "" FORCE)
# Disable x86-specific SIMD optimizations (important for ARM64 compatibility) # Disable additional SIMD versions to speed up compilation
set(FAISS_ENABLE_AVX2 OFF CACHE BOOL "" FORCE) set(FAISS_ENABLE_AVX2 OFF CACHE BOOL "" FORCE)
set(FAISS_ENABLE_AVX512 OFF CACHE BOOL "" FORCE) set(FAISS_ENABLE_AVX512 OFF CACHE BOOL "" FORCE)
set(FAISS_ENABLE_SSE4_1 OFF CACHE BOOL "" FORCE)
# ARM64-specific configuration
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
message(STATUS "Configuring Faiss for ARM64 architecture")
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
# Use SVE optimization level for ARM64 Linux (as seen in Faiss conda build)
set(FAISS_OPT_LEVEL "sve" CACHE STRING "" FORCE)
message(STATUS "Setting FAISS_OPT_LEVEL to 'sve' for ARM64 Linux")
else()
# Use generic optimization for other ARM64 platforms (like macOS)
set(FAISS_OPT_LEVEL "generic" CACHE STRING "" FORCE)
message(STATUS "Setting FAISS_OPT_LEVEL to 'generic' for ARM64 ${CMAKE_SYSTEM_NAME}")
endif()
# ARM64 compatibility: Faiss submodule has been modified to fix x86 header inclusion
message(STATUS "Using ARM64-compatible Faiss submodule")
endif()
# Additional optimization options from INSTALL.md # Additional optimization options from INSTALL.md
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)

View File

@@ -322,9 +322,17 @@ Examples:
return basic_matches return basic_matches
def _should_exclude_file(self, relative_path: Path, gitignore_matches) -> bool: def _should_exclude_file(self, file_path: Path, gitignore_matches) -> bool:
"""Check if a file should be excluded using gitignore parser.""" """Check if a file should be excluded using gitignore parser.
return gitignore_matches(str(relative_path))
Always match against absolute, posix-style paths for consistency with
gitignore_parser expectations.
"""
try:
absolute_path = file_path.resolve()
except Exception:
absolute_path = Path(str(file_path))
return gitignore_matches(absolute_path.as_posix())
def _is_git_submodule(self, path: Path) -> bool: def _is_git_submodule(self, path: Path) -> bool:
"""Check if a path is a git submodule.""" """Check if a path is a git submodule."""
@@ -396,7 +404,9 @@ Examples:
print(f" {current_path}") print(f" {current_path}")
print(" " + "─" * 45) print(" " + "─" * 45)
current_indexes = self._discover_indexes_in_project(current_path) current_indexes = self._discover_indexes_in_project(
current_path, exclude_dirs=other_projects
)
if current_indexes: if current_indexes:
for idx in current_indexes: for idx in current_indexes:
total_indexes += 1 total_indexes += 1
@@ -435,9 +445,14 @@ Examples:
print(" leann build my-docs --docs ./documents") print(" leann build my-docs --docs ./documents")
else: else:
# Count only projects that have at least one discoverable index # Count only projects that have at least one discoverable index
projects_count = sum( projects_count = 0
1 for p in valid_projects if len(self._discover_indexes_in_project(p)) > 0 for p in valid_projects:
) if p == current_path:
discovered = self._discover_indexes_in_project(p, exclude_dirs=other_projects)
else:
discovered = self._discover_indexes_in_project(p)
if len(discovered) > 0:
projects_count += 1
print(f"📊 Total: {total_indexes} indexes across {projects_count} projects") print(f"📊 Total: {total_indexes} indexes across {projects_count} projects")
if current_indexes_count > 0: if current_indexes_count > 0:
@@ -454,9 +469,22 @@ Examples:
print("\n💡 Create your first index:") print("\n💡 Create your first index:")
print(" leann build my-docs --docs ./documents") print(" leann build my-docs --docs ./documents")
def _discover_indexes_in_project(self, project_path: Path): def _discover_indexes_in_project(
"""Discover all indexes in a project directory (both CLI and apps formats)""" self, project_path: Path, exclude_dirs: Optional[list[Path]] = None
):
"""Discover all indexes in a project directory (both CLI and apps formats)
exclude_dirs: when provided, skip any APP-format index files that are
located under these directories. This prevents duplicates when the
current project is a parent directory of other registered projects.
"""
indexes = [] indexes = []
exclude_dirs = exclude_dirs or []
# normalize to resolved paths once for comparison
try:
exclude_dirs_resolved = [p.resolve() for p in exclude_dirs]
except Exception:
exclude_dirs_resolved = exclude_dirs
# 1. CLI format: .leann/indexes/index_name/ # 1. CLI format: .leann/indexes/index_name/
cli_indexes_dir = project_path / ".leann" / "indexes" cli_indexes_dir = project_path / ".leann" / "indexes"
@@ -495,6 +523,17 @@ Examples:
continue continue
except Exception: except Exception:
pass pass
# Skip meta files that live under excluded directories
try:
meta_parent_resolved = meta_file.parent.resolve()
if any(
meta_parent_resolved.is_relative_to(ex_dir)
for ex_dir in exclude_dirs_resolved
):
continue
except Exception:
# best effort; if resolve or comparison fails, do not exclude
pass
# Use the parent directory name as the app index display name # Use the parent directory name as the app index display name
display_name = meta_file.parent.name display_name = meta_file.parent.name
# Extract file base used to store files # Extract file base used to store files
@@ -1022,7 +1061,8 @@ Examples:
# Try to use better PDF parsers first, but only if PDFs are requested # Try to use better PDF parsers first, but only if PDFs are requested
documents = [] documents = []
docs_path = Path(docs_dir) # Use resolved absolute paths to avoid mismatches (symlinks, relative vs absolute)
docs_path = Path(docs_dir).resolve()
# Check if we should process PDFs # Check if we should process PDFs
should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
@@ -1031,10 +1071,15 @@ Examples:
for file_path in docs_path.rglob("*.pdf"): for file_path in docs_path.rglob("*.pdf"):
# Check if file matches any exclude pattern # Check if file matches any exclude pattern
try: try:
# Ensure both paths are resolved before computing relativity
file_path_resolved = file_path.resolve()
# Determine directory scope using the non-resolved path to avoid
# misclassifying symlinked entries as outside the docs directory
relative_path = file_path.relative_to(docs_path) relative_path = file_path.relative_to(docs_path)
if not include_hidden and _path_has_hidden_segment(relative_path): if not include_hidden and _path_has_hidden_segment(relative_path):
continue continue
if self._should_exclude_file(relative_path, gitignore_matches): # Use absolute path for gitignore matching
if self._should_exclude_file(file_path_resolved, gitignore_matches):
continue continue
except ValueError: except ValueError:
# Skip files that can't be made relative to docs_path # Skip files that can't be made relative to docs_path
@@ -1077,10 +1122,11 @@ Examples:
) -> bool: ) -> bool:
"""Return True if file should be included (not excluded)""" """Return True if file should be included (not excluded)"""
try: try:
docs_path_obj = Path(docs_dir) docs_path_obj = Path(docs_dir).resolve()
file_path_obj = Path(file_path) file_path_obj = Path(file_path).resolve()
relative_path = file_path_obj.relative_to(docs_path_obj) # Use absolute path for gitignore matching
return not self._should_exclude_file(relative_path, gitignore_matches) _ = file_path_obj.relative_to(docs_path_obj) # validate scope
return not self._should_exclude_file(file_path_obj, gitignore_matches)
except (ValueError, OSError): except (ValueError, OSError):
return True # Include files that can't be processed return True # Include files that can't be processed