Compare commits
22 Commits
v0.1.14 ... fix/manyli
| Author | SHA1 | Date |
|---|---|---|
|  | 971653fa1a |  |
|  | 02672c040d |  |
|  | f55108feda |  |
|  | 74d485c908 |  |
|  | d1fefb6378 |  |
|  | 732384f4f8 |  |
|  | ae38e10d1b |  |
|  | ca0fd88934 |  |
|  | 3c8d32f156 |  |
|  | b8ff00fc6a |  |
|  | 3c836766f8 |  |
|  | b4a1dfb9c7 |  |
|  | a4d66e95d8 |  |
|  | cf58b3e31b |  |
|  | e9c2ca7936 |  |
|  | dab154a77b |  |
|  | 13413dfae5 |  |
|  | 0543cc9816 |  |
|  | fb53ed9a0e |  |
|  | 015f43733a |  |
|  | 2957c8bf5a |  |
|  | a73194c3f6 |  |
.github/workflows/build-cibuildwheel.yml (vendored, new file, 171 lines)
@@ -0,0 +1,171 @@
name: Build with cibuildwheel

on:
  workflow_call:
    inputs:
      ref:
        description: 'Git ref to build'
        required: false
        type: string
        default: ''

jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest] # Focus on Linux/manylinux for Colab compatibility

    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.ref }}
          submodules: recursive

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      # Build pure Python packages separately
      - name: Build pure Python packages (leann-core, leann)
        if: matrix.os == 'ubuntu-latest' # Only build once
        run: |
          python -m pip install --upgrade pip build
          python -m build packages/leann-core --outdir wheelhouse/
          python -m build packages/leann --outdir wheelhouse/

      - name: Build leann-backend-hnsw wheels
        uses: pypa/cibuildwheel@v2.20.0
        with:
          package-dir: packages/leann-backend-hnsw
          output-dir: wheelhouse
        env:
          CIBW_BUILD: cp39-* cp310-* cp311-* cp312-* cp313-*
          CIBW_SKIP: "*-win32 *-manylinux_i686 pp* *musllinux*"

          # Use manylinux_2_35 for Colab compatibility with modern features
          CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_35
          CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_35

          # Linux dependencies - using dnf for manylinux_2_35 (based on AlmaLinux 9)
          CIBW_BEFORE_ALL_LINUX: |
            dnf install -y epel-release
            dnf install -y gcc-c++ boost-devel zeromq-devel openblas-devel cmake python3-devel

          # Install numpy before building
          CIBW_BEFORE_BUILD: |
            pip install numpy
            pip install --upgrade pip setuptools wheel

          CIBW_BEFORE_BUILD_LINUX: |
            pip install numpy
            pip install --upgrade pip setuptools wheel swig

          CIBW_BEFORE_ALL_MACOS: |
            brew install boost zeromq openblas cmake libomp

          # Pre-install test dependencies to avoid compilation
          CIBW_BEFORE_TEST: |
            pip install --only-binary :all: "pyzmq>=23.0.0"

          # Test command to verify the wheel works
          CIBW_TEST_COMMAND: |
            python -c "import leann_backend_hnsw; print('HNSW backend imported successfully')"

          # Skip problematic configurations
          CIBW_TEST_SKIP: "*-macosx_arm64" # Skip ARM64 tests on GitHub Actions

          # Test dependencies
          CIBW_TEST_REQUIRES: "pytest numpy"

          # Environment variables for build
          CIBW_ENVIRONMENT: |
            CMAKE_BUILD_PARALLEL_LEVEL=8
            Python_FIND_VIRTUALENV=ONLY
            Python3_FIND_VIRTUALENV=ONLY

          # Linux-specific environment variables
          CIBW_ENVIRONMENT_LINUX: |
            CMAKE_BUILD_PARALLEL_LEVEL=8

          # macOS-specific environment variables
          CIBW_ENVIRONMENT_MACOS: |
            CMAKE_BUILD_PARALLEL_LEVEL=8
            MACOSX_DEPLOYMENT_TARGET=11.0
            CMAKE_OSX_DEPLOYMENT_TARGET=11.0
            Python_FIND_VIRTUALENV=ONLY
            Python3_FIND_VIRTUALENV=ONLY

      - name: Build leann-backend-diskann wheels
        uses: pypa/cibuildwheel@v2.20.0
        with:
          package-dir: packages/leann-backend-diskann
          output-dir: wheelhouse
        env:
          CIBW_BUILD: cp39-* cp310-* cp311-* cp312-* cp313-*
          CIBW_SKIP: "*-win32 *-manylinux_i686 pp* *musllinux*"

          CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_35
          CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_35

          CIBW_BEFORE_ALL_LINUX: |
            dnf install -y epel-release
            dnf install -y gcc-c++ boost-devel zeromq-devel openblas-devel cmake python3-devel

          # Install numpy before building
          CIBW_BEFORE_BUILD: |
            pip install numpy
            pip install --upgrade pip setuptools wheel

          CIBW_BEFORE_BUILD_LINUX: |
            pip install numpy
            pip install --upgrade pip setuptools wheel swig

          CIBW_BEFORE_ALL_MACOS: |
            brew install boost zeromq openblas cmake libomp

          # Pre-install test dependencies to avoid compilation
          CIBW_BEFORE_TEST: |
            pip install --only-binary :all: "pyzmq>=23.0.0"

          # Test command to verify the wheel works
          CIBW_TEST_COMMAND: |
            python -c "import leann_backend_diskann; print('DiskANN backend imported successfully')"

          # Skip problematic configurations
          CIBW_TEST_SKIP: "*-macosx_arm64" # Skip ARM64 tests on GitHub Actions

          # Test dependencies - avoid pyzmq due to manylinux2014 compatibility issues
          CIBW_TEST_REQUIRES: "pytest numpy"

          CIBW_ENVIRONMENT: |
            CMAKE_BUILD_PARALLEL_LEVEL=8
            Python_FIND_VIRTUALENV=ONLY
            Python3_FIND_VIRTUALENV=ONLY

          # Linux-specific environment variables
          CIBW_ENVIRONMENT_LINUX: |
            CMAKE_BUILD_PARALLEL_LEVEL=8
            CMAKE_PREFIX_PATH=$VIRTUAL_ENV
            Python_FIND_VIRTUALENV=ONLY
            Python3_FIND_VIRTUALENV=ONLY
            Python_FIND_STRATEGY=LOCATION
            Python3_FIND_STRATEGY=LOCATION
            Python_EXECUTABLE=$VIRTUAL_ENV/bin/python
            Python3_EXECUTABLE=$VIRTUAL_ENV/bin/python

          # macOS-specific environment variables
          CIBW_ENVIRONMENT_MACOS: |
            CMAKE_BUILD_PARALLEL_LEVEL=8
            MACOSX_DEPLOYMENT_TARGET=11.0
            CMAKE_OSX_DEPLOYMENT_TARGET=11.0
            Python_FIND_VIRTUALENV=ONLY
            Python3_FIND_VIRTUALENV=ONLY

      - uses: actions/upload-artifact@v4
        with:
          name: wheels-${{ matrix.os }}
          path: ./wheelhouse/*.whl
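The point of pinning `CIBW_MANYLINUX_X86_64_IMAGE` to `manylinux_2_35` is that the resulting wheels stay installable on Google Colab. A quick way to sanity-check a downloaded `wheelhouse/` locally is to parse the platform tags out of the wheel filenames. The snippet below is a minimal sketch, not part of the workflow above; it assumes the `packaging` library is installed, and the example filenames in the comment are hypothetical.

```python
# Sketch: flag any Linux wheel in wheelhouse/ whose manylinux tag is newer than 2.35.
# Assumes `pip install packaging`; filenames such as
# leann_backend_hnsw-X.Y.Z-cp311-cp311-manylinux_2_35_x86_64.whl are illustrative.
import re
from pathlib import Path

from packaging.utils import parse_wheel_filename

MAX_GLIBC_MINOR = 35  # Colab currently accepts manylinux_2_35 and older

for wheel in sorted(Path("wheelhouse").glob("*.whl")):
    _, _, _, tags = parse_wheel_filename(wheel.name)
    for tag in tags:
        m = re.match(r"manylinux_(\d+)_(\d+)_", tag.platform)
        if m and (int(m.group(1)), int(m.group(2))) > (2, MAX_GLIBC_MINOR):
            print(f"TOO NEW for Colab: {wheel.name} ({tag.platform})")
            break
    else:
        print(f"OK: {wheel.name}")
```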
.github/workflows/build-reusable.yml (vendored, 219 lines)
@@ -10,78 +10,110 @@ on:
|
||||
default: ''
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
name: Lint and Format Check
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ inputs.ref }}
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v4
|
||||
|
||||
- name: Install ruff
|
||||
run: |
|
||||
uv tool install ruff
|
||||
|
||||
- name: Run ruff check
|
||||
run: |
|
||||
ruff check .
|
||||
|
||||
- name: Run ruff format check
|
||||
run: |
|
||||
ruff format --check .
|
||||
|
||||
build:
|
||||
needs: lint
|
||||
name: Build ${{ matrix.os }} Python ${{ matrix.python }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
- os: ubuntu-latest
|
||||
python: '3.9'
|
||||
- os: ubuntu-22.04
|
||||
container: 'quay.io/pypa/manylinux2014_x86_64'
|
||||
- os: ubuntu-latest
|
||||
python: '3.10'
|
||||
- os: ubuntu-22.04
|
||||
container: 'quay.io/pypa/manylinux2014_x86_64'
|
||||
- os: ubuntu-latest
|
||||
python: '3.11'
|
||||
- os: ubuntu-22.04
|
||||
container: 'quay.io/pypa/manylinux2014_x86_64'
|
||||
- os: ubuntu-latest
|
||||
python: '3.12'
|
||||
- os: ubuntu-22.04
|
||||
container: 'quay.io/pypa/manylinux2014_x86_64'
|
||||
- os: ubuntu-latest
|
||||
python: '3.13'
|
||||
container: 'quay.io/pypa/manylinux2014_x86_64'
|
||||
- os: macos-latest
|
||||
python: '3.9'
|
||||
container: ''
|
||||
- os: macos-latest
|
||||
python: '3.10'
|
||||
container: ''
|
||||
- os: macos-latest
|
||||
python: '3.11'
|
||||
container: ''
|
||||
- os: macos-latest
|
||||
python: '3.12'
|
||||
container: ''
|
||||
- os: macos-latest
|
||||
python: '3.13'
|
||||
container: ''
|
||||
runs-on: ${{ matrix.os }}
|
||||
container: ${{ matrix.container }}
|
||||
|
||||
steps:
|
||||
# For manylinux2014 compatibility, we'll handle checkout differently
|
||||
- uses: actions/checkout@v4
|
||||
if: matrix.container == ''
|
||||
with:
|
||||
ref: ${{ inputs.ref }}
|
||||
submodules: recursive
|
||||
|
||||
- name: Setup Python
|
||||
# Manual checkout for containers to avoid Node.js compatibility issues
|
||||
- name: Manual checkout in container
|
||||
if: matrix.container != ''
|
||||
run: |
|
||||
# Install git if not available
|
||||
yum install -y git || true
|
||||
|
||||
# Configure git to handle the directory ownership issue
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
git config --global --add safe.directory /__w/LEANN/LEANN
|
||||
git config --global --add safe.directory /github/workspace
|
||||
git config --global --add safe.directory $(pwd)
|
||||
|
||||
# Clone the repository manually in the container
|
||||
git init
|
||||
git remote add origin https://github.com/${GITHUB_REPOSITORY}.git
|
||||
|
||||
# Fetch the appropriate ref
|
||||
if [ -n "${{ inputs.ref }}" ]; then
|
||||
git fetch --depth=1 origin ${{ inputs.ref }}
|
||||
else
|
||||
git fetch --depth=1 origin ${GITHUB_SHA}
|
||||
fi
|
||||
git checkout FETCH_HEAD
|
||||
|
||||
# Initialize submodules
|
||||
git submodule update --init --recursive
|
||||
|
||||
- name: Setup Python (macOS and regular Ubuntu)
|
||||
if: matrix.container == ''
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
|
||||
- name: Install uv
|
||||
- name: Setup Python (manylinux container)
|
||||
if: matrix.container != ''
|
||||
run: |
|
||||
# Use the pre-installed Python version in manylinux container
|
||||
# Convert Python version format (3.9 -> 39, 3.10 -> 310, etc.)
|
||||
PY_VER=$(echo "${{ matrix.python }}" | sed 's/\.//g')
|
||||
/opt/python/cp${PY_VER}-*/bin/python -m pip install --upgrade pip
|
||||
# Create symlinks for convenience
|
||||
ln -sf /opt/python/cp${PY_VER}-*/bin/python /usr/local/bin/python
|
||||
ln -sf /opt/python/cp${PY_VER}-*/bin/pip /usr/local/bin/pip
|
||||
|
||||
- name: Install uv (macOS and regular Ubuntu)
|
||||
if: matrix.container == ''
|
||||
uses: astral-sh/setup-uv@v4
|
||||
|
||||
- name: Install system dependencies (Ubuntu)
|
||||
if: runner.os == 'Linux'
|
||||
- name: Install uv (manylinux container)
|
||||
if: matrix.container != ''
|
||||
run: |
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Install system dependencies (Ubuntu - regular)
|
||||
if: runner.os == 'Linux' && matrix.container == ''
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \
|
||||
@@ -94,6 +126,64 @@ jobs:
|
||||
echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV
|
||||
echo "LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
|
||||
|
||||
- name: Install system dependencies (manylinux container)
|
||||
if: runner.os == 'Linux' && matrix.container != ''
|
||||
run: |
|
||||
# manylinux2014 uses yum instead of apt
|
||||
# Update yum cache first
|
||||
yum clean all
|
||||
yum makecache
|
||||
|
||||
# Install EPEL repository
|
||||
yum install -y epel-release || true
|
||||
|
||||
# Update cache again after EPEL
|
||||
yum makecache || true
|
||||
|
||||
# Install development packages
|
||||
# Note: Some packages might have different names in CentOS 7
|
||||
yum install -y \
|
||||
gcc-c++ \
|
||||
boost-devel \
|
||||
protobuf-compiler \
|
||||
protobuf-devel \
|
||||
zeromq-devel \
|
||||
pkgconfig \
|
||||
openblas-devel \
|
||||
cmake || {
|
||||
echo "Some packages failed to install, trying alternatives..."
|
||||
# Try alternative package names
|
||||
yum install -y libzmq3-devel || true
|
||||
yum install -y libzmq-devel || true
|
||||
}
|
||||
|
||||
# Install optional packages that might not be available
|
||||
yum install -y libaio-devel || echo "libaio-devel not available, continuing..."
|
||||
|
||||
# Verify zmq installation and create pkg-config file if needed
|
||||
if [ ! -f /usr/lib64/pkgconfig/libzmq.pc ] && [ ! -f /usr/lib/pkgconfig/libzmq.pc ]; then
|
||||
echo "Creating libzmq.pc file..."
|
||||
mkdir -p /usr/lib64/pkgconfig
|
||||
cat > /usr/lib64/pkgconfig/libzmq.pc << 'EOF'
|
||||
prefix=/usr
|
||||
exec_prefix=${prefix}
|
||||
libdir=${exec_prefix}/lib64
|
||||
includedir=${prefix}/include
|
||||
|
||||
Name: libzmq
|
||||
Description: ZeroMQ library
|
||||
Version: 4.1.4
|
||||
Libs: -L${libdir} -lzmq
|
||||
Cflags: -I${includedir}
|
||||
EOF
|
||||
fi
|
||||
|
||||
# Update PKG_CONFIG_PATH
|
||||
echo "PKG_CONFIG_PATH=/usr/lib64/pkgconfig:/usr/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
|
||||
|
||||
# Build tools are pre-installed in manylinux
|
||||
# MKL is more complex in container, skip for now and use OpenBLAS
|
||||
|
||||
- name: Install system dependencies (macOS)
|
||||
if: runner.os == 'macOS'
|
||||
run: |
|
||||
@@ -101,44 +191,65 @@ jobs:
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
uv pip install --system scikit-build-core numpy swig Cython pybind11
|
||||
if [[ "$RUNNER_OS" == "Linux" ]]; then
|
||||
uv pip install --system auditwheel
|
||||
if [[ -n "${{ matrix.container }}" ]]; then
|
||||
# In manylinux container, use regular pip
|
||||
pip install scikit-build-core numpy swig Cython pybind11 auditwheel
|
||||
else
|
||||
uv pip install --system delocate
|
||||
# Regular environment, use uv
|
||||
uv pip install --system scikit-build-core numpy swig Cython pybind11
|
||||
if [[ "$RUNNER_OS" == "Linux" ]]; then
|
||||
uv pip install --system auditwheel
|
||||
else
|
||||
uv pip install --system delocate
|
||||
fi
|
||||
fi
|
||||
|
||||
- name: Build packages
|
||||
run: |
|
||||
# Choose build command based on environment
|
||||
if [[ -n "${{ matrix.container }}" ]]; then
|
||||
BUILD_CMD="pip wheel . --no-deps -w dist"
|
||||
else
|
||||
BUILD_CMD="uv build --wheel --python python"
|
||||
fi
|
||||
|
||||
# Build core (platform independent)
|
||||
if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
|
||||
if [ "${{ matrix.os }}" == "ubuntu-latest" ]; then
|
||||
cd packages/leann-core
|
||||
uv build
|
||||
if [[ -n "${{ matrix.container }}" ]]; then
|
||||
pip wheel . --no-deps -w dist
|
||||
else
|
||||
uv build
|
||||
fi
|
||||
cd ../..
|
||||
fi
|
||||
|
||||
# Build HNSW backend
|
||||
cd packages/leann-backend-hnsw
|
||||
if [ "${{ matrix.os }}" == "macos-latest" ]; then
|
||||
CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv build --wheel --python python
|
||||
CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ $BUILD_CMD
|
||||
else
|
||||
uv build --wheel --python python
|
||||
eval $BUILD_CMD
|
||||
fi
|
||||
cd ../..
|
||||
|
||||
# Build DiskANN backend
|
||||
cd packages/leann-backend-diskann
|
||||
if [ "${{ matrix.os }}" == "macos-latest" ]; then
|
||||
CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv build --wheel --python python
|
||||
CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ $BUILD_CMD
|
||||
else
|
||||
uv build --wheel --python python
|
||||
eval $BUILD_CMD
|
||||
fi
|
||||
cd ../..
|
||||
|
||||
# Build meta package (platform independent)
|
||||
if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
|
||||
if [ "${{ matrix.os }}" == "ubuntu-latest" ]; then
|
||||
cd packages/leann
|
||||
uv build
|
||||
if [[ -n "${{ matrix.container }}" ]]; then
|
||||
pip wheel . --no-deps -w dist
|
||||
else
|
||||
uv build
|
||||
fi
|
||||
cd ../..
|
||||
fi
|
||||
|
||||
@@ -148,6 +259,9 @@ jobs:
|
||||
# Repair HNSW wheel
|
||||
cd packages/leann-backend-hnsw
|
||||
if [ -d dist ]; then
|
||||
# Show what platform auditwheel will use
|
||||
auditwheel show dist/*.whl || true
|
||||
# Let auditwheel auto-detect the appropriate manylinux tag
|
||||
auditwheel repair dist/*.whl -w dist_repaired
|
||||
rm -rf dist
|
||||
mv dist_repaired dist
|
||||
@@ -157,6 +271,9 @@ jobs:
|
||||
# Repair DiskANN wheel
|
||||
cd packages/leann-backend-diskann
|
||||
if [ -d dist ]; then
|
||||
# Show what platform auditwheel will use
|
||||
auditwheel show dist/*.whl || true
|
||||
# Let auditwheel auto-detect the appropriate manylinux tag
|
||||
auditwheel repair dist/*.whl -w dist_repaired
|
||||
rm -rf dist
|
||||
mv dist_repaired dist
|
||||
@@ -192,5 +309,5 @@ jobs:
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: packages-${{ matrix.os }}-py${{ matrix.python }}
|
||||
name: packages-${{ matrix.os }}-py${{ matrix.python }}${{ matrix.container && '-manylinux' || '' }}
|
||||
path: packages/*/dist/
|
||||
.github/workflows/release-manual.yml (vendored, 85 lines)
@@ -7,6 +7,11 @@ on:
|
||||
description: 'Version to release (e.g., 0.1.2)'
|
||||
required: true
|
||||
type: string
|
||||
use_cibuildwheel:
|
||||
description: 'Use cibuildwheel for better compatibility (recommended for Colab)'
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
jobs:
|
||||
update-version:
|
||||
@@ -22,50 +27,46 @@ jobs:
|
||||
|
||||
- name: Validate version
|
||||
run: |
|
||||
# Remove 'v' prefix if present for validation
|
||||
VERSION_CLEAN="${{ inputs.version }}"
|
||||
VERSION_CLEAN="${VERSION_CLEAN#v}"
|
||||
if ! [[ "$VERSION_CLEAN" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
|
||||
echo "❌ Invalid version format. Expected format: X.Y.Z or vX.Y.Z"
|
||||
if ! [[ "${{ inputs.version }}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
|
||||
echo "❌ Invalid version format"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version format valid: ${{ inputs.version }}"
|
||||
echo "✅ Version format valid"
|
||||
|
||||
- name: Update versions and push
|
||||
id: push
|
||||
run: |
|
||||
# Check current version
|
||||
CURRENT_VERSION=$(grep "^version" packages/leann-core/pyproject.toml | cut -d'"' -f2)
|
||||
echo "Current version: $CURRENT_VERSION"
|
||||
echo "Target version: ${{ inputs.version }}"
|
||||
|
||||
if [ "$CURRENT_VERSION" = "${{ inputs.version }}" ]; then
|
||||
echo "⚠️ Version is already ${{ inputs.version }}, skipping update"
|
||||
COMMIT_SHA=$(git rev-parse HEAD)
|
||||
else
|
||||
./scripts/bump_version.sh ${{ inputs.version }}
|
||||
git config user.name "GitHub Actions"
|
||||
git config user.email "actions@github.com"
|
||||
git add packages/*/pyproject.toml
|
||||
git commit -m "chore: release v${{ inputs.version }}"
|
||||
git push origin main
|
||||
COMMIT_SHA=$(git rev-parse HEAD)
|
||||
echo "✅ Pushed version update: $COMMIT_SHA"
|
||||
fi
|
||||
./scripts/bump_version.sh ${{ inputs.version }}
|
||||
git config user.name "GitHub Actions"
|
||||
git config user.email "actions@github.com"
|
||||
git add packages/*/pyproject.toml
|
||||
git commit -m "chore: release v${{ inputs.version }}"
|
||||
git push origin main
|
||||
|
||||
COMMIT_SHA=$(git rev-parse HEAD)
|
||||
echo "commit-sha=$COMMIT_SHA" >> $GITHUB_OUTPUT
|
||||
echo "✅ Pushed version update: $COMMIT_SHA"
|
||||
|
||||
build-packages:
|
||||
name: Build packages
|
||||
build-packages-reusable:
|
||||
name: Build packages (Standard)
|
||||
needs: update-version
|
||||
if: ${{ !inputs.use_cibuildwheel }}
|
||||
uses: ./.github/workflows/build-reusable.yml
|
||||
with:
|
||||
ref: 'main'
|
||||
ref: ${{ needs.update-version.outputs.commit-sha }}
|
||||
|
||||
build-packages-cibuildwheel:
|
||||
name: Build packages (cibuildwheel)
|
||||
needs: update-version
|
||||
if: ${{ inputs.use_cibuildwheel }}
|
||||
uses: ./.github/workflows/build-cibuildwheel.yml
|
||||
with:
|
||||
ref: ${{ needs.update-version.outputs.commit-sha }}
|
||||
|
||||
publish:
|
||||
name: Publish and Release
|
||||
needs: [update-version, build-packages]
|
||||
if: always() && needs.update-version.result == 'success' && needs.build-packages.result == 'success'
|
||||
needs: [update-version, build-packages-reusable, build-packages-cibuildwheel]
|
||||
if: always() && needs.update-version.result == 'success' && (needs.build-packages-reusable.result == 'success' || needs.build-packages-cibuildwheel.result == 'success')
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write
|
||||
@@ -73,7 +74,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
ref: 'main'
|
||||
ref: ${{ needs.update-version.outputs.commit-sha }}
|
||||
|
||||
- name: Download all artifacts
|
||||
uses: actions/download-artifact@v4
|
||||
@@ -106,24 +107,12 @@ jobs:
|
||||
|
||||
- name: Create release
|
||||
run: |
|
||||
# Check if tag already exists
|
||||
if git rev-parse "v${{ inputs.version }}" >/dev/null 2>&1; then
|
||||
echo "⚠️ Tag v${{ inputs.version }} already exists, skipping tag creation"
|
||||
else
|
||||
git tag "v${{ inputs.version }}"
|
||||
git push origin "v${{ inputs.version }}"
|
||||
echo "✅ Created and pushed tag v${{ inputs.version }}"
|
||||
fi
|
||||
git tag "v${{ inputs.version }}"
|
||||
git push origin "v${{ inputs.version }}"
|
||||
|
||||
# Check if release already exists
|
||||
if gh release view "v${{ inputs.version }}" >/dev/null 2>&1; then
|
||||
echo "⚠️ Release v${{ inputs.version }} already exists, skipping release creation"
|
||||
else
|
||||
gh release create "v${{ inputs.version }}" \
|
||||
--title "Release v${{ inputs.version }}" \
|
||||
--notes "🚀 Released to PyPI: https://pypi.org/project/leann/${{ inputs.version }}/" \
|
||||
--latest
|
||||
echo "✅ Created GitHub release v${{ inputs.version }}"
|
||||
fi
|
||||
gh release create "v${{ inputs.version }}" \
|
||||
--title "Release v${{ inputs.version }}" \
|
||||
--notes "🚀 Released to PyPI: https://pypi.org/project/leann/${{ inputs.version }}/" \
|
||||
--latest
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
.github/workflows/test-manylinux.yml (vendored, new file, 60 lines)
@@ -0,0 +1,60 @@
name: Test Manylinux Build

on:
  workflow_dispatch:
  pull_request:
    branches: [ main ]
    paths:
      - '.github/workflows/**'
      - 'packages/**'
      - 'pyproject.toml'
  push:
    branches:
      - 'fix/manylinux-*'
      - 'test/build-*'

jobs:
  build:
    uses: ./.github/workflows/build-cibuildwheel.yml

  test-install:
    needs: build
    runs-on: ubuntu-22.04 # Simulating Colab environment
    strategy:
      matrix:
        python-version: ['3.10', '3.11', '3.12']
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: wheels-*
          path: dist
          merge-multiple: true

      - name: Test installation
        run: |
          python -m pip install --upgrade pip
          # Find and install the appropriate wheels
          pip install dist/leann_core-*.whl
          pip install dist/leann_backend_hnsw-*manylinux*.whl
          pip install dist/leann-*.whl

      - name: Test import
        run: |
          python -c "
          import leann
          from leann import LeannBuilder, LeannSearcher
          print('Successfully imported leann modules')

          # Quick functionality test
          builder = LeannBuilder(backend_name='hnsw')
          builder.add_text('Test document')
          print('LeannBuilder created and used successfully')
          "
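For debugging outside CI, roughly the same smoke test can be run locally after installing the downloaded `wheels-*` artifact. The sketch below reuses the `LeannBuilder`/`LeannSearcher` calls that appear in this PR's `demo.ipynb`; the `dist/` install step and the `smoke_test.leann` index name are placeholders, not part of the workflow.

```python
# Local sketch of the CI "Test import" step. Assumes the wheels were installed first, e.g.:
#   pip install dist/leann_core-*.whl dist/leann_backend_hnsw-*manylinux*.whl dist/leann-*.whl
from leann import LeannBuilder, LeannSearcher

builder = LeannBuilder(backend_name="hnsw")  # same backend as the CI test
builder.add_text("Python is a powerful programming language")
builder.add_text("C# is good at game development")
builder.build_index("smoke_test.leann")      # index name is a placeholder

searcher = LeannSearcher("smoke_test.leann")
results = searcher.search("programming languages", top_k=2)
print(f"Smoke test returned {len(results)} results")
```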
@@ -1,23 +0,0 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
      - id: check-merge-conflict
      - id: debug-statements

  - repo: https://github.com/psf/black
    rev: 24.1.1
    hooks:
      - id: black
        language_version: python3

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.2.1
    hooks:
      - id: ruff
        args: [--fix]
      - id: ruff-format
MANYLINUX_BUILD_STRATEGY.md (new file, 50 lines)
@@ -0,0 +1,50 @@
# Manylinux Build Strategy

## Problem
Google Colab requires wheels compatible with `manylinux_2_35_x86_64` or earlier. Our previous builds were producing `manylinux_2_39_x86_64` wheels, which are incompatible.

## Solution
We're using `cibuildwheel` with `manylinux_2_35` images to build wheels that are compatible with Google Colab while maintaining modern toolchain features.

### Key Changes

1. **cibuildwheel Configuration**
   - Using `manylinux2014` images (provides `manylinux_2_17` compatibility)
   - Using `yum` package manager (CentOS 7 based)
   - Installing `cmake3` and creating symlink for compatibility

2. **Build Matrix**
   - Python versions: 3.9, 3.10, 3.11, 3.12, 3.13
   - Platforms: Linux (x86_64), macOS
   - No Windows support (not required)

3. **Dependencies**
   - Linux: gcc-c++, boost-devel, zeromq-devel, openblas-devel, cmake3
   - macOS: boost, zeromq, openblas, cmake (via Homebrew)

4. **Environment Variables**
   - `CMAKE_BUILD_PARALLEL_LEVEL=8`: Speed up builds
   - `Python_FIND_VIRTUALENV=ONLY`: Help CMake find Python in cibuildwheel env
   - `Python3_FIND_VIRTUALENV=ONLY`: Alternative variable for compatibility
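One way to see why `manylinux_2_35` is the cutoff is to ask the target interpreter which platform tags pip will accept. The sketch below is illustrative only and is not part of this PR's workflows; it assumes the `packaging` library is available on the target machine (for example a Colab runtime).

```python
# Sketch: check whether the current interpreter would accept a manylinux_2_35 wheel.
# Assumes `pip install packaging`.
from packaging.tags import sys_tags

accepted_platforms = {tag.platform for tag in sys_tags()}

for candidate in ("manylinux_2_35_x86_64", "manylinux_2_39_x86_64"):
    status = "accepted" if candidate in accepted_platforms else "rejected"
    print(f"{candidate}: {status}")
```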
## Testing Strategy

1. **CI Pipeline**: `test-manylinux.yml`
   - Triggers on PR to main, manual dispatch, or push to `fix/manylinux-*` branches
   - Builds wheels using cibuildwheel
   - Tests installation on Ubuntu 22.04 (simulating Colab)

2. **Local Testing**
   ```bash
   # Download built wheels
   # Test in fresh environment
   python -m venv test_env
   source test_env/bin/activate
   pip install leann_core-*.whl leann_backend_hnsw-*manylinux*.whl leann-*.whl
   python -c "from leann import LeannBuilder; print('Success!')"
   ```

## References
- [cibuildwheel documentation](https://cibuildwheel.readthedocs.io/)
- [manylinux standards](https://github.com/pypa/manylinux)
- [PEP 599 - manylinux2014](https://peps.python.org/pep-0599/)
@@ -12,7 +12,7 @@
The smallest vector index in the world. RAG Everything with LEANN!
</h2>

LEANN is an innovative vector database that democratizes personal AI. Transform your laptop into a powerful RAG system that can index and search through millions of documents while using **97% less storage** than traditional solutions **without accuracy loss**.
LEANN is a revolutionary vector database that democratizes personal AI. Transform your laptop into a powerful RAG system that can index and search through millions of documents while using **97% less storage** than traditional solutions **without accuracy loss**.

LEANN achieves this through *graph-based selective recomputation* with *high-degree preserving pruning*, computing embeddings on-demand instead of storing them all. [Illustration Fig →](#️-architecture--how-it-works) | [Paper →](https://arxiv.org/abs/2506.08276)
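To make the "selective recomputation" idea concrete: rather than storing an embedding for every passage, only the graph is kept, and embeddings are recomputed for just the nodes a query actually visits. The toy sketch below is my own illustration of that idea, not LEANN's implementation; the placeholder embedder and the greedy one-hop walk are assumptions.

```python
# Toy illustration of graph-based selective recomputation (not LEANN's actual code).
# Embeddings are recomputed on demand for visited nodes only, so none are stored.
import numpy as np

def embed(texts):
    # Placeholder embedder; LEANN would call a real model such as facebook/contriever.
    rng = np.random.default_rng(abs(hash(tuple(texts))) % 2**32)
    return rng.normal(size=(len(texts), 8))

def search(query, passages, graph, entry, hops=3):
    q = embed([query])[0]
    current, visited = entry, set()
    for _ in range(hops):
        neighbors = [n for n in graph[current] if n not in visited]
        if not neighbors:
            break
        # Recompute embeddings only for the neighbors we are about to score.
        scores = embed([passages[n] for n in neighbors]) @ q
        visited.update(neighbors)
        current = neighbors[int(np.argmax(scores))]
    return current
```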
@@ -195,7 +195,7 @@ Once the index is built, you can ask questions like:
- "Show me emails about travel expenses"
</details>

### 🔍 Time Machine for the Web: RAG Your Entire Chrome Browser History!
### 🔍 Time Machine for the Web: RAG Your Entire Google Browser History!

<p align="center">
  <img src="videos/google_clear.gif" alt="LEANN Browser History Search Demo" width="600">
demo.ipynb (240 lines)
@@ -4,11 +4,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Quick Start in 30s\n",
|
||||
"\n",
|
||||
"**Home GitHub Repository:** [LEANN on GitHub](https://github.com/yichuan-w/LEANN)\n",
|
||||
"\n",
|
||||
"**Important for Colab users:** Set your runtime type to T4 GPU for optimal performance. Go to Runtime → Change runtime type → Hardware accelerator → T4 GPU."
|
||||
"# Quick Start in 30s"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -17,25 +13,8 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# install this if you are using colab\n",
|
||||
"! uv pip install leann-core leann-backend-hnsw --no-deps\n",
|
||||
"! uv pip install leann --no-deps\n",
|
||||
"# For Colab environment, we need to set some environment variables\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"LEANN_LOG_LEVEL\"] = \"INFO\" # Enable more detailed logging"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"INDEX_DIR = Path(\"./\").resolve()\n",
|
||||
"INDEX_PATH = str(INDEX_DIR / \"demo.leann\")"
|
||||
"# install this if you areusing colab\n",
|
||||
"! pip install leann"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -47,15 +26,26 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO: Registering backend 'hnsw'\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Writing passages: 100%|██████████| 5/5 [00:00<00:00, 17077.79chunk/s]\n",
|
||||
"Batches: 100%|██████████| 1/1 [00:00<00:00, 36.43it/s]\n",
|
||||
"/Users/yichuan/Desktop/code/LEANN/leann/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||||
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: facebook/contriever\n",
|
||||
"WARNING:sentence_transformers.SentenceTransformer:No sentence-transformers model found with name facebook/contriever. Creating a new one with mean pooling.\n",
|
||||
"Writing passages: 100%|██████████| 5/5 [00:00<00:00, 27887.66chunk/s]\n",
|
||||
"Batches: 100%|██████████| 1/1 [00:00<00:00, 13.51it/s]\n",
|
||||
"WARNING:leann_backend_hnsw.hnsw_backend:Converting data to float32, shape: (5, 768)\n",
|
||||
"INFO:leann_backend_hnsw.hnsw_backend:INFO: Converting HNSW index to CSR-pruned format...\n"
|
||||
]
|
||||
@@ -65,41 +55,41 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"M: 64 for level: 0\n",
|
||||
"Starting conversion: index.index -> index.csr.tmp\n",
|
||||
"Starting conversion: knowledge.index -> knowledge.csr.tmp\n",
|
||||
"[0.00s] Reading Index HNSW header...\n",
|
||||
"[0.00s] Header read: d=768, ntotal=5\n",
|
||||
"[0.00s] Reading HNSW struct vectors...\n",
|
||||
" Reading vector (dtype=<class 'numpy.float64'>, fmt='d')... Count=6, Bytes=48\n",
|
||||
"[0.00s] Read assign_probas (6)\n",
|
||||
" Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=7, Bytes=28\n",
|
||||
"[0.14s] Read cum_nneighbor_per_level (7)\n",
|
||||
"[0.11s] Read cum_nneighbor_per_level (7)\n",
|
||||
" Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=5, Bytes=20\n",
|
||||
"[0.24s] Read levels (5)\n",
|
||||
"[0.33s] Probing for compact storage flag...\n",
|
||||
"[0.33s] Found compact flag: False\n",
|
||||
"[0.33s] Compact flag is False, reading original format...\n",
|
||||
"[0.33s] Probing for potential extra byte before non-compact offsets...\n",
|
||||
"[0.33s] Found and consumed an unexpected 0x00 byte.\n",
|
||||
"[0.21s] Read levels (5)\n",
|
||||
"[0.30s] Probing for compact storage flag...\n",
|
||||
"[0.30s] Found compact flag: False\n",
|
||||
"[0.30s] Compact flag is False, reading original format...\n",
|
||||
"[0.30s] Probing for potential extra byte before non-compact offsets...\n",
|
||||
"[0.30s] Found and consumed an unexpected 0x00 byte.\n",
|
||||
" Reading vector (dtype=<class 'numpy.uint64'>, fmt='Q')... Count=6, Bytes=48\n",
|
||||
"[0.33s] Read offsets (6)\n",
|
||||
"[0.41s] Attempting to read neighbors vector...\n",
|
||||
"[0.30s] Read offsets (6)\n",
|
||||
"[0.40s] Attempting to read neighbors vector...\n",
|
||||
" Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=320, Bytes=1280\n",
|
||||
"[0.41s] Read neighbors (320)\n",
|
||||
"[0.54s] Read scalar params (ep=4, max_lvl=0)\n",
|
||||
"[0.54s] Checking for storage data...\n",
|
||||
"[0.54s] Found storage fourcc: 49467849.\n",
|
||||
"[0.54s] Converting to CSR format...\n",
|
||||
"[0.54s] Conversion loop finished. \n",
|
||||
"[0.54s] Running validation checks...\n",
|
||||
"[0.40s] Read neighbors (320)\n",
|
||||
"[0.50s] Read scalar params (ep=4, max_lvl=0)\n",
|
||||
"[0.50s] Checking for storage data...\n",
|
||||
"[0.50s] Found storage fourcc: 49467849.\n",
|
||||
"[0.50s] Converting to CSR format...\n",
|
||||
"[0.50s] Conversion loop finished. \n",
|
||||
"[0.50s] Running validation checks...\n",
|
||||
" Checking total valid neighbor count...\n",
|
||||
" OK: Total valid neighbors = 20\n",
|
||||
" Checking final pointer indices...\n",
|
||||
" OK: Final pointers match data size.\n",
|
||||
"[0.54s] Deleting original neighbors and offsets arrays...\n",
|
||||
"[0.50s] Deleting original neighbors and offsets arrays...\n",
|
||||
" CSR Stats: |data|=20, |level_ptr|=10\n",
|
||||
"[0.63s] Writing CSR HNSW graph data in FAISS-compatible order...\n",
|
||||
"[0.59s] Writing CSR HNSW graph data in FAISS-compatible order...\n",
|
||||
" Pruning embeddings: Writing NULL storage marker.\n",
|
||||
"[0.71s] Conversion complete.\n"
|
||||
"[0.69s] Conversion complete.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -107,7 +97,7 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:leann_backend_hnsw.hnsw_backend:✅ CSR conversion successful.\n",
|
||||
"INFO:leann_backend_hnsw.hnsw_backend:INFO: Replaced original index with CSR-pruned version at 'index.index'\n"
|
||||
"INFO:leann_backend_hnsw.hnsw_backend:INFO: Replaced original index with CSR-pruned version at 'knowledge.index'\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -116,13 +106,11 @@
|
||||
"\n",
|
||||
"builder = LeannBuilder(backend_name=\"hnsw\")\n",
|
||||
"builder.add_text(\"C# is a powerful programming language and it is good at game development\")\n",
|
||||
"builder.add_text(\n",
|
||||
" \"Python is a powerful programming language and it is good at machine learning tasks\"\n",
|
||||
")\n",
|
||||
"builder.add_text(\"Python is a powerful programming language and it is good at machine learning tasks\")\n",
|
||||
"builder.add_text(\"Machine learning transforms industries\")\n",
|
||||
"builder.add_text(\"Neural networks process complex data\")\n",
|
||||
"builder.add_text(\"Leann is a great storage saving engine for RAG on your MacBook\")\n",
|
||||
"builder.build_index(INDEX_PATH)"
|
||||
"builder.build_index(\"knowledge.leann\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -134,7 +122,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@@ -148,16 +136,14 @@
|
||||
"INFO:leann.embedding_server_manager:Port 5557 has incompatible server, trying next port...\n",
|
||||
"INFO:leann.embedding_server_manager:Port 5558 has incompatible server, trying next port...\n",
|
||||
"INFO:leann.embedding_server_manager:Port 5559 has incompatible server, trying next port...\n",
|
||||
"INFO:leann.embedding_server_manager:Port 5560 has incompatible server, trying next port...\n",
|
||||
"INFO:leann.embedding_server_manager:Port 5561 has incompatible server, trying next port...\n",
|
||||
"INFO:leann.embedding_server_manager:Port 5562 has incompatible server, trying next port...\n",
|
||||
"INFO:leann.embedding_server_manager:Starting embedding server on port 5563...\n",
|
||||
"INFO:leann.embedding_server_manager:Command: /Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5563 --model-name facebook/contriever --passages-file /Users/yichuan/Desktop/code/test_leann_pip/LEANN/content/index.meta.json\n",
|
||||
"INFO:leann.embedding_server_manager:Using port 5560 instead of 5557\n",
|
||||
"INFO:leann.embedding_server_manager:Starting embedding server on port 5560...\n",
|
||||
"INFO:leann.embedding_server_manager:Command: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5560 --model-name facebook/contriever --passages-file knowledge.leann.meta.json\n",
|
||||
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
||||
"To disable this warning, you can either:\n",
|
||||
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
||||
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
|
||||
"INFO:leann.embedding_server_manager:Server process started with PID: 31699\n"
|
||||
"INFO:leann.embedding_server_manager:Server process started with PID: 4574\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -183,38 +169,50 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Traceback (most recent call last):\n",
|
||||
" File \"<frozen runpy>\", line 198, in _run_module_as_main\n",
|
||||
" File \"<frozen runpy>\", line 88, in _run_code\n",
|
||||
" File \"/Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann_backend_hnsw/hnsw_embedding_server.py\", line 323, in <module>\n",
|
||||
" create_hnsw_embedding_server(\n",
|
||||
" File \"/Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann_backend_hnsw/hnsw_embedding_server.py\", line 98, in create_hnsw_embedding_server\n",
|
||||
" passages = PassageManager(passage_sources)\n",
|
||||
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
|
||||
" File \"/Users/yichuan/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann/api.py\", line 127, in __init__\n",
|
||||
" raise FileNotFoundError(f\"Passage index file not found: {index_file}\")\n",
|
||||
"FileNotFoundError: Passage index file not found: /Users/yichuan/Desktop/code/test_leann_pip/LEANN/index.passages.idx\n",
|
||||
"ERROR:leann.embedding_server_manager:Server terminated during startup.\n"
|
||||
"INFO:leann.embedding_server_manager:Embedding server is ready!\n",
|
||||
"INFO:leann.api: Launching server time: 1.078078269958496 seconds\n",
|
||||
"INFO:leann.embedding_server_manager:Existing server process (PID 4574) is compatible\n",
|
||||
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: facebook/contriever\n",
|
||||
"WARNING:sentence_transformers.SentenceTransformer:No sentence-transformers model found with name facebook/contriever. Creating a new one with mean pooling.\n",
|
||||
"INFO:leann.api: Generated embedding shape: (1, 768)\n",
|
||||
"INFO:leann.api: Embedding time: 2.9307072162628174 seconds\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "RuntimeError",
|
||||
"evalue": "Failed to start embedding server on port 5563",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mleann\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mapi\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m LeannSearcher\n\u001b[32m 3\u001b[39m searcher = LeannSearcher(\u001b[33m\"\u001b[39m\u001b[33mindex\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m results = \u001b[43msearcher\u001b[49m\u001b[43m.\u001b[49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mprogramming languages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtop_k\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 5\u001b[39m results\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann/api.py:439\u001b[39m, in \u001b[36mLeannSearcher.search\u001b[39m\u001b[34m(self, query, top_k, complexity, beam_width, prune_ratio, recompute_embeddings, pruning_strategy, expected_zmq_port, **kwargs)\u001b[39m\n\u001b[32m 437\u001b[39m start_time = time.time()\n\u001b[32m 438\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m recompute_embeddings:\n\u001b[32m--> \u001b[39m\u001b[32m439\u001b[39m zmq_port = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mbackend_impl\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_ensure_server_running\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 440\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmeta_path_str\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 441\u001b[39m \u001b[43m \u001b[49m\u001b[43mport\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_zmq_port\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 442\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 443\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 444\u001b[39m \u001b[38;5;28;01mdel\u001b[39;00m expected_zmq_port\n\u001b[32m 445\u001b[39m zmq_time = time.time() - start_time\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/code/test_leann_pip/LEANN/.venv/lib/python3.11/site-packages/leann/searcher_base.py:81\u001b[39m, in \u001b[36mBaseSearcher._ensure_server_running\u001b[39m\u001b[34m(self, passages_source_file, port, **kwargs)\u001b[39m\n\u001b[32m 72\u001b[39m server_started, actual_port = \u001b[38;5;28mself\u001b[39m.embedding_server_manager.start_server(\n\u001b[32m 73\u001b[39m port=port,\n\u001b[32m 74\u001b[39m model_name=\u001b[38;5;28mself\u001b[39m.embedding_model,\n\u001b[32m (...)\u001b[39m\u001b[32m 78\u001b[39m enable_warmup=kwargs.get(\u001b[33m\"\u001b[39m\u001b[33menable_warmup\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m),\n\u001b[32m 79\u001b[39m )\n\u001b[32m 80\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m server_started:\n\u001b[32m---> \u001b[39m\u001b[32m81\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[32m 82\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mFailed to start embedding server on port \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mactual_port\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 83\u001b[39m )\n\u001b[32m 85\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m actual_port\n",
|
||||
"\u001b[31mRuntimeError\u001b[39m: Failed to start embedding server on port 5563"
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ZmqDistanceComputer initialized: d=768, metric=0\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:leann.api: Search time: 0.27327895164489746 seconds\n",
|
||||
"INFO:leann.api: Backend returned: labels=2 results\n",
|
||||
"INFO:leann.api: Processing 2 passage IDs:\n",
|
||||
"INFO:leann.api: 1. passage_id='0' -> SUCCESS: C# is a powerful programming language and it is good at game development...\n",
|
||||
"INFO:leann.api: 2. passage_id='1' -> SUCCESS: Python is a powerful programming language and it is good at machine learning tasks...\n",
|
||||
"INFO:leann.api: Final enriched results: 2 passages\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[SearchResult(id='0', score=np.float32(0.9874103), text='C# is a powerful programming language and it is good at game development', metadata={}),\n",
|
||||
" SearchResult(id='1', score=np.float32(0.8922168), text='Python is a powerful programming language and it is good at machine learning tasks', metadata={})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from leann.api import LeannSearcher\n",
|
||||
"\n",
|
||||
"searcher = LeannSearcher(INDEX_PATH)\n",
|
||||
"searcher = LeannSearcher(\"knowledge.leann\")\n",
|
||||
"results = searcher.search(\"programming languages\", top_k=2)\n",
|
||||
"results"
|
||||
]
|
||||
@@ -230,7 +228,79 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:leann.chat:Attempting to create LLM of type='hf' with model='Qwen/Qwen3-0.6B'\n",
|
||||
"INFO:leann.chat:Initializing HFChat with model='Qwen/Qwen3-0.6B'\n",
|
||||
"INFO:leann.chat:MPS is available. Using Apple Silicon GPU.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[read_HNSW - CSR NL v4] Reading metadata & CSR indices (manual offset)...\n",
|
||||
"[read_HNSW NL v4] Read levels vector, size: 5\n",
|
||||
"[read_HNSW NL v4] Reading Compact Storage format indices...\n",
|
||||
"[read_HNSW NL v4] Read compact_level_ptr, size: 10\n",
|
||||
"[read_HNSW NL v4] Read compact_node_offsets, size: 6\n",
|
||||
"[read_HNSW NL v4] Read entry_point: 4, max_level: 0\n",
|
||||
"[read_HNSW NL v4] Read storage fourcc: 0x6c6c756e\n",
|
||||
"[read_HNSW NL v4 FIX] Detected FileIOReader. Neighbors size field offset: 326\n",
|
||||
"[read_HNSW NL v4] Reading neighbors data into memory.\n",
|
||||
"[read_HNSW NL v4] Read neighbors data, size: 20\n",
|
||||
"[read_HNSW NL v4] Finished reading metadata and CSR indices.\n",
|
||||
"INFO: Skipping external storage loading, since is_recompute is true.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:leann.api:🔍 LeannSearcher.search() called:\n",
|
||||
"INFO:leann.api: Query: 'Compare the two retrieved programming languages and tell me their advantages.'\n",
|
||||
"INFO:leann.api: Top_k: 2\n",
|
||||
"INFO:leann.api: Additional kwargs: {}\n",
|
||||
"INFO:leann.embedding_server_manager:Port 5557 has incompatible server, trying next port...\n",
|
||||
"INFO:leann.embedding_server_manager:Port 5558 has incompatible server, trying next port...\n",
|
||||
"INFO:leann.embedding_server_manager:Port 5559 has incompatible server, trying next port...\n",
|
||||
"INFO:leann.embedding_server_manager:Found compatible server on port 5560\n",
|
||||
"INFO:leann.embedding_server_manager:Using existing compatible server on port 5560\n",
|
||||
"INFO:leann.api: Launching server time: 0.04932403564453125 seconds\n",
|
||||
"INFO:leann.embedding_server_manager:Found compatible server on port 5560\n",
|
||||
"INFO:leann.embedding_server_manager:Using existing compatible server on port 5560\n",
|
||||
"INFO:leann.api: Generated embedding shape: (1, 768)\n",
|
||||
"INFO:leann.api: Embedding time: 0.06902289390563965 seconds\n",
|
||||
"INFO:leann.api: Search time: 0.026793241500854492 seconds\n",
|
||||
"INFO:leann.api: Backend returned: labels=2 results\n",
|
||||
"INFO:leann.api: Processing 2 passage IDs:\n",
|
||||
"INFO:leann.api: 1. passage_id='0' -> SUCCESS: C# is a powerful programming language and it is good at game development...\n",
|
||||
"INFO:leann.api: 2. passage_id='1' -> SUCCESS: Python is a powerful programming language and it is good at machine learning tasks...\n",
|
||||
"INFO:leann.api: Final enriched results: 2 passages\n",
|
||||
"INFO:leann.chat:Generating with HuggingFace model, config: {'max_new_tokens': 128, 'temperature': 0.7, 'top_p': 0.9, 'do_sample': True, 'pad_token_id': 151645, 'eos_token_id': 151645}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"ZmqDistanceComputer initialized: d=768, metric=0\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"<think>\\n\\n</think>\\n\\nBased on the context provided, here's a comparison of the two retrieved programming languages:\\n\\n**C#** is known for being a powerful programming language and is well-suited for game development. It is often used in game development and is popular among developers working on Windows applications.\\n\\n**Python**, on the other hand, is also a powerful language and is well-suited for machine learning tasks. It is widely used for data analysis, scientific computing, and other applications that require handling large datasets or performing complex calculations.\\n\\n**Advantages**:\\n- C#: Strong for game development and cross-platform compatibility.\\n- Python: Strong for\""
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from leann.api import LeannChat\n",
|
||||
"\n",
|
||||
@@ -239,11 +309,11 @@
|
||||
" \"model\": \"Qwen/Qwen3-0.6B\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)\n",
|
||||
"chat = LeannChat(index_path=\"knowledge.leann\", llm_config=llm_config)\n",
|
||||
"response = chat.ask(\n",
|
||||
" \"Compare the two retrieved programming languages and tell me their advantages.\",\n",
|
||||
" top_k=2,\n",
|
||||
" llm_kwargs={\"max_tokens\": 128},\n",
|
||||
" llm_kwargs={\"max_tokens\": 128}\n",
|
||||
")\n",
|
||||
"response"
|
||||
]
|
||||
|
||||
@@ -3,15 +3,14 @@
|
||||
Memory comparison between Faiss HNSW and LEANN HNSW backend
|
||||
"""
|
||||
|
||||
import gc
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import psutil
|
||||
import gc
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
# Setup logging
|
||||
@@ -84,7 +83,9 @@ def test_faiss_hnsw():
|
||||
|
||||
for line in lines:
|
||||
if "Peak Memory:" in line:
|
||||
peak_memory = float(line.split("Peak Memory:")[1].split("MB")[0].strip())
|
||||
peak_memory = float(
|
||||
line.split("Peak Memory:")[1].split("MB")[0].strip()
|
||||
)
|
||||
|
||||
return {"peak_memory": peak_memory}
|
||||
|
||||
@@ -110,8 +111,9 @@ def test_leann_hnsw():
|
||||
|
||||
tracker.checkpoint("After imports")
|
||||
|
||||
from leann.api import LeannBuilder
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
|
||||
# Load and parse documents
|
||||
documents = SimpleDirectoryReader(
|
||||
@@ -195,14 +197,16 @@ def test_leann_hnsw():
|
||||
runtime_start_mem = get_memory_usage()
|
||||
print(f"Before load memory: {runtime_start_mem:.1f} MB")
|
||||
tracker.checkpoint("Before load memory")
|
||||
|
||||
|
||||
# Load searcher
|
||||
searcher = LeannSearcher(index_path)
|
||||
tracker.checkpoint("After searcher loading")
|
||||
|
||||
|
||||
|
||||
print("Running search queries...")
|
||||
queries = [
|
||||
"什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
|
||||
"什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
|
||||
"What is LEANN and how does it work?",
|
||||
"华为诺亚方舟实验室的主要研究内容",
|
||||
]
|
||||
@@ -300,15 +304,21 @@ def main():
|
||||
|
||||
print("\nLEANN vs Faiss Performance:")
|
||||
memory_saving = faiss_results["peak_memory"] - leann_results["peak_memory"]
|
||||
print(f" Search Memory: {memory_ratio:.1f}x less ({memory_saving:.1f} MB saved)")
|
||||
print(
|
||||
f" Search Memory: {memory_ratio:.1f}x less ({memory_saving:.1f} MB saved)"
|
||||
)
|
||||
|
||||
# Storage comparison
|
||||
if leann_storage_size > faiss_storage_size:
|
||||
storage_ratio = leann_storage_size / faiss_storage_size
|
||||
print(f" Storage Size: {storage_ratio:.1f}x larger (LEANN uses more storage)")
|
||||
print(
|
||||
f" Storage Size: {storage_ratio:.1f}x larger (LEANN uses more storage)"
|
||||
)
|
||||
elif faiss_storage_size > leann_storage_size:
|
||||
storage_ratio = faiss_storage_size / leann_storage_size
|
||||
print(f" Storage Size: {storage_ratio:.1f}x smaller (LEANN uses less storage)")
|
||||
print(
|
||||
f" Storage Size: {storage_ratio:.1f}x smaller (LEANN uses less storage)"
|
||||
)
|
||||
else:
|
||||
print(" Storage Size: similar")
|
||||
else:
|
||||
|
||||
@@ -3,44 +3,37 @@
Document search demo with recompute mode
"""

import os
from pathlib import Path
import shutil
import time
from pathlib import Path

# Import backend packages to trigger plugin registration
try:
import leann_backend_diskann  # noqa: F401
import leann_backend_hnsw  # noqa: F401

import leann_backend_diskann
import leann_backend_hnsw
print("INFO: Backend packages imported successfully.")
except ImportError as e:
print(f"WARNING: Could not import backend packages. Error: {e}")

# Import upper-level API from leann-core
from leann.api import LeannBuilder, LeannChat, LeannSearcher
from leann.api import LeannBuilder, LeannSearcher, LeannChat


def load_sample_documents():
"""Create sample documents for demonstration"""
docs = [
{
"title": "Intro to Python",
"content": "Python is a high-level, interpreted language known for simplicity.",
},
{"title": "Intro to Python", "content": "Python is a high-level, interpreted language known for simplicity."},
{"title": "ML Basics", "content": "Machine learning builds systems that learn from data."},
{
"title": "Data Structures",
"content": "Data structures like arrays, lists, and graphs organize data.",
},
{"title": "Data Structures", "content": "Data structures like arrays, lists, and graphs organize data."},
]
return docs


def main():
print("==========================================================")
print("=== Leann Document Search Demo (DiskANN + Recompute) ===")
print("==========================================================")

INDEX_DIR = Path("./test_indices")
INDEX_PATH = str(INDEX_DIR / "documents.diskann")
BACKEND_TO_TEST = "diskann"
@@ -51,96 +44,94 @@ def main():

# --- 1. Build index ---
print(f"\n[PHASE 1] Building index using '{BACKEND_TO_TEST}' backend...")

builder = LeannBuilder(backend_name=BACKEND_TO_TEST, graph_degree=32, complexity=64)

builder = LeannBuilder(
backend_name=BACKEND_TO_TEST,
graph_degree=32,
complexity=64
)

documents = load_sample_documents()
print(f"Loaded {len(documents)} sample documents.")
for doc in documents:
builder.add_text(doc["content"], metadata={"title": doc["title"]})

builder.build_index(INDEX_PATH)
print("\nIndex built!")
print(f"\nIndex built!")

# --- 2. Basic search demo ---
print(f"\n[PHASE 2] Basic search using '{BACKEND_TO_TEST}' backend...")
searcher = LeannSearcher(index_path=INDEX_PATH)

query = "What is machine learning?"
print(f"\nQuery: '{query}'")

print("\n--- Basic search mode (PQ computation) ---")
start_time = time.time()
results = searcher.search(query, top_k=2)
basic_time = time.time() - start_time

print(f"⏱️ Basic search time: {basic_time:.3f} seconds")
print(">>> Basic search results <<<")
for i, res in enumerate(results, 1):
print(
f"  {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}"
)
print(f"  {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}")

# --- 3. Recompute search demo ---
print("\n[PHASE 3] Recompute search using embedding server...")

print(f"\n[PHASE 3] Recompute search using embedding server...")

print("\n--- Recompute search mode (get real embeddings via network) ---")

# Configure recompute parameters
recompute_params = {
"recompute_beighbor_embeddings": True,  # Enable network recomputation
"USE_DEFERRED_FETCH": False,  # Don't use deferred fetch
"skip_search_reorder": True,  # Skip search reordering
"dedup_node_dis": True,  # Enable node distance deduplication
"prune_ratio": 0.1,  # Pruning ratio 10%
"batch_recompute": False,  # Don't use batch recomputation
"global_pruning": False,  # Don't use global pruning
"zmq_port": 5555,  # ZMQ port
"embedding_model": "sentence-transformers/all-mpnet-base-v2",
"USE_DEFERRED_FETCH": False,  # Don't use deferred fetch
"skip_search_reorder": True,  # Skip search reordering
"dedup_node_dis": True,  # Enable node distance deduplication
"prune_ratio": 0.1,  # Pruning ratio 10%
"batch_recompute": False,  # Don't use batch recomputation
"global_pruning": False,  # Don't use global pruning
"zmq_port": 5555,  # ZMQ port
"embedding_model": "sentence-transformers/all-mpnet-base-v2"
}

print("Recompute parameter configuration:")
for key, value in recompute_params.items():
print(f"  {key}: {value}")

print("\n🔄 Executing Recompute search...")

print(f"\n🔄 Executing Recompute search...")
try:
start_time = time.time()
recompute_results = searcher.search(query, top_k=2, **recompute_params)
recompute_time = time.time() - start_time

print(f"⏱️ Recompute search time: {recompute_time:.3f} seconds")
print(">>> Recompute search results <<<")
for i, res in enumerate(recompute_results, 1):
print(
f"  {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}"
)

print(f"  {i}. ID: {res.id}, Score: {res.score:.4f}, Text: '{res.text}', Metadata: {res.metadata}")

# Compare results
print("\n--- Result comparison ---")
print(f"\n--- Result comparison ---")
print(f"Basic search time: {basic_time:.3f} seconds")
print(f"Recompute time: {recompute_time:.3f} seconds")

print("\nBasic search vs Recompute results:")
for i in range(min(len(results), len(recompute_results))):
basic_score = results[i].score
recompute_score = recompute_results[i].score
score_diff = abs(basic_score - recompute_score)
print(
f"  Position {i + 1}: PQ={basic_score:.4f}, Recompute={recompute_score:.4f}, Difference={score_diff:.4f}"
)

print(f"  Position {i+1}: PQ={basic_score:.4f}, Recompute={recompute_score:.4f}, Difference={score_diff:.4f}")

if recompute_time > basic_time:
print("✅ Recompute mode working correctly (more accurate but slower)")
print(f"✅ Recompute mode working correctly (more accurate but slower)")
else:
print("ℹ️ Recompute time is unusually fast, network recomputation may not be enabled")

print(f"ℹ️ Recompute time is unusually fast, network recomputation may not be enabled")

except Exception as e:
print(f"❌ Recompute search failed: {e}")
print("This usually indicates an embedding server connection issue")

# --- 4. Chat demo ---
print("\n[PHASE 4] Starting chat session...")
print(f"\n[PHASE 4] Starting chat session...")
chat = LeannChat(index_path=INDEX_PATH)
chat_response = chat.ask(query)
print(f"You: {query}")
@@ -152,4 +143,4 @@ def main():


if __name__ == "__main__":
main()
main()
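The recompute parameters configured in this demo are passed straight through to LeannSearcher.search as keyword arguments. A minimal sketch of that call path, assuming an index already built at ./test_indices/documents.diskann and reusing only names that appear in the demo (including the codebase's recompute_beighbor_embeddings spelling); values are illustrative, and the embedding server must be reachable on the ZMQ port:

    from leann.api import LeannSearcher

    searcher = LeannSearcher(index_path="./test_indices/documents.diskann")
    results = searcher.search(
        "What is machine learning?",
        top_k=2,
        recompute_beighbor_embeddings=True,  # fetch real embeddings over the network
        zmq_port=5555,
        embedding_model="sentence-transformers/all-mpnet-base-v2",
    )
    for hit in results:
        print(f"{hit.score:.4f}  {hit.text}")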
@@ -1,13 +1,11 @@
import email
import os
import email
from pathlib import Path
from typing import Any

from typing import List, Any
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader


def find_all_messages_directories(root: str | None = None) -> list[Path]:
def find_all_messages_directories(root: str = None) -> List[Path]:
"""
Recursively find all 'Messages' directories under the given root.
Returns a list of Path objects.
@@ -16,97 +14,86 @@ def find_all_messages_directories(root: str | None = None) -> list[Path]:
# Auto-detect user's mail path
home_dir = os.path.expanduser("~")
root = os.path.join(home_dir, "Library", "Mail")

messages_dirs = []
for dirpath, _dirnames, _filenames in os.walk(root):
for dirpath, dirnames, filenames in os.walk(root):
if os.path.basename(dirpath) == "Messages":
messages_dirs.append(Path(dirpath))
return messages_dirs


class EmlxReader(BaseReader):
"""
Apple Mail .emlx file reader with embedded metadata.

Reads individual .emlx files from Apple Mail's storage format.
"""

def __init__(self, include_html: bool = False) -> None:
"""
Initialize.

Args:
include_html: Whether to include HTML content in the email body (default: False)
"""
self.include_html = include_html

def load_data(self, input_dir: str, **load_kwargs: Any) -> list[Document]:

def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]:
"""
Load data from the input directory containing .emlx files.

Args:
input_dir: Directory containing .emlx files
**load_kwargs:
max_count (int): Maximum amount of messages to read.
"""
docs: list[Document] = []
max_count = load_kwargs.get("max_count", 1000)
docs: List[Document] = []
max_count = load_kwargs.get('max_count', 1000)
count = 0

# Walk through the directory recursively
for dirpath, dirnames, filenames in os.walk(input_dir):
# Skip hidden directories
dirnames[:] = [d for d in dirnames if not d.startswith(".")]

for filename in filenames:
if count >= max_count:
break

if filename.endswith(".emlx"):
filepath = os.path.join(dirpath, filename)
try:
# Read the .emlx file
with open(filepath, encoding="utf-8", errors="ignore") as f:
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()

# .emlx files have a length prefix followed by the email content
# The first line contains the length, followed by the email
lines = content.split("\n", 1)
lines = content.split('\n', 1)
if len(lines) >= 2:
email_content = lines[1]

# Parse the email using Python's email module
try:
msg = email.message_from_string(email_content)

# Extract email metadata
subject = msg.get("Subject", "No Subject")
from_addr = msg.get("From", "Unknown")
to_addr = msg.get("To", "Unknown")
date = msg.get("Date", "Unknown")

subject = msg.get('Subject', 'No Subject')
from_addr = msg.get('From', 'Unknown')
to_addr = msg.get('To', 'Unknown')
date = msg.get('Date', 'Unknown')

# Extract email body
body = ""
if msg.is_multipart():
for part in msg.walk():
if (
part.get_content_type() == "text/plain"
or part.get_content_type() == "text/html"
):
if (
part.get_content_type() == "text/html"
and not self.include_html
):
if part.get_content_type() == "text/plain" or part.get_content_type() == "text/html":
if part.get_content_type() == "text/html" and not self.include_html:
continue
body += part.get_payload(decode=True).decode(
"utf-8", errors="ignore"
)
body += part.get_payload(decode=True).decode('utf-8', errors='ignore')
# break
else:
body = msg.get_payload(decode=True).decode(
"utf-8", errors="ignore"
)

body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')

# Create document content with metadata embedded in text
doc_content = f"""
[File]: {filename}
@@ -117,19 +104,19 @@ class EmlxReader(BaseReader):
[EMAIL BODY Start]:
{body}
"""

# No separate metadata - everything is in the text
doc = Document(text=doc_content, metadata={})
docs.append(doc)
count += 1

except Exception as e:
print(f"Error parsing email from {filepath}: {e}")
continue

except Exception as e:
print(f"Error reading file {filepath}: {e}")
continue

print(f"Loaded {len(docs)} email documents")
return docs
return docs
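EmlxReader's parsing above hinges on the .emlx layout: the first line holds a byte count, and everything after it is the raw RFC 822 message that Python's email module can parse. A standalone sketch of just that step, with a hypothetical file path:

    import email

    def read_emlx(path: str) -> email.message.Message:
        # First line of an .emlx file is the message length; the rest is the email itself.
        with open(path, encoding="utf-8", errors="ignore") as f:
            raw = f.read()
        _length, _, message_text = raw.partition("\n")
        return email.message_from_string(message_text)

    msg = read_emlx("/path/to/Messages/12345.emlx")  # hypothetical path
    print(msg.get("Subject", "No Subject"), "from", msg.get("From", "Unknown"))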
@@ -7,9 +7,9 @@ Contains simple parser for mbox files.

import logging
from pathlib import Path
from typing import Any

from typing import Any, Dict, List, Optional
from fsspec import AbstractFileSystem

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document

@@ -27,7 +27,11 @@ class MboxReader(BaseReader):
"""

DEFAULT_MESSAGE_FORMAT: str = (
"Date: {_date}\nFrom: {_from}\nTo: {_to}\nSubject: {_subject}\nContent: {_content}"
"Date: {_date}\n"
"From: {_from}\n"
"To: {_to}\n"
"Subject: {_subject}\n"
"Content: {_content}"
)

def __init__(
@@ -41,7 +45,9 @@ class MboxReader(BaseReader):
try:
from bs4 import BeautifulSoup  # noqa
except ImportError:
raise ImportError("`beautifulsoup4` package not found: `pip install beautifulsoup4`")
raise ImportError(
"`beautifulsoup4` package not found: `pip install beautifulsoup4`"
)

super().__init__(*args, **kwargs)
self.max_count = max_count
@@ -50,9 +56,9 @@ class MboxReader(BaseReader):
def load_data(
self,
file: Path,
extra_info: dict | None = None,
fs: AbstractFileSystem | None = None,
) -> list[Document]:
extra_info: Optional[Dict] = None,
fs: Optional[AbstractFileSystem] = None,
) -> List[Document]:
"""Parse file into string."""
# Import required libraries
import mailbox
@@ -68,7 +74,7 @@ class MboxReader(BaseReader):
)

i = 0
results: list[str] = []
results: List[str] = []
# Load file using mailbox
bytes_parser = BytesParser(policy=default).parse
mbox = mailbox.mbox(file, factory=bytes_parser)  # type: ignore
@@ -118,7 +124,7 @@ class MboxReader(BaseReader):
class EmlxMboxReader(MboxReader):
"""
EmlxMboxReader - Modified MboxReader that handles directories of .emlx files.

Extends MboxReader to work with Apple Mail's .emlx format by:
1. Reading .emlx files from a directory
2. Converting them to mbox format in memory
@@ -128,13 +134,13 @@ class EmlxMboxReader(MboxReader):
def load_data(
self,
directory: Path,
extra_info: dict | None = None,
fs: AbstractFileSystem | None = None,
) -> list[Document]:
extra_info: Optional[Dict] = None,
fs: Optional[AbstractFileSystem] = None,
) -> List[Document]:
"""Parse .emlx files from directory into strings using MboxReader logic."""
import os
import tempfile

import os

if fs:
logger.warning(
"fs was specified but EmlxMboxReader doesn't support loading "
@@ -144,37 +150,37 @@ class EmlxMboxReader(MboxReader):
# Find all .emlx files in the directory
emlx_files = list(directory.glob("*.emlx"))
logger.info(f"Found {len(emlx_files)} .emlx files in {directory}")

if not emlx_files:
logger.warning(f"No .emlx files found in {directory}")
return []

# Create a temporary mbox file
with tempfile.NamedTemporaryFile(mode="w", suffix=".mbox", delete=False) as temp_mbox:
with tempfile.NamedTemporaryFile(mode='w', suffix='.mbox', delete=False) as temp_mbox:
temp_mbox_path = temp_mbox.name

# Convert .emlx files to mbox format
for emlx_file in emlx_files:
try:
# Read the .emlx file
with open(emlx_file, encoding="utf-8", errors="ignore") as f:
with open(emlx_file, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()

# .emlx format: first line is length, rest is email content
lines = content.split("\n", 1)
lines = content.split('\n', 1)
if len(lines) >= 2:
email_content = lines[1]  # Skip the length line

# Write to mbox format (each message starts with "From " and ends with blank line)
temp_mbox.write(f"From {emlx_file.name} {email_content}\n\n")

except Exception as e:
logger.warning(f"Failed to process {emlx_file}: {e}")
continue

# Close the temporary file so MboxReader can read it
temp_mbox.close()

try:
# Use the parent MboxReader's logic to parse the mbox file
return super().load_data(Path(temp_mbox_path), extra_info, fs)
@@ -182,5 +188,5 @@ class EmlxMboxReader(MboxReader):
# Clean up temporary file
try:
os.unlink(temp_mbox_path)
except OSError:
pass
except:
pass
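EmlxMboxReader above takes a different route: rather than parsing each .emlx file itself, it concatenates the messages into a temporary mbox file and lets the parent MboxReader do the work. A condensed sketch of that round trip, assuming a directory of .emlx files; the separator line here is simplified relative to the reader's exact format:

    import mailbox
    import tempfile
    from pathlib import Path

    def emlx_dir_to_mbox(directory: Path) -> mailbox.mbox:
        # Write each message after an mbox "From " separator, then reopen with mailbox.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".mbox", delete=False) as tmp:
            for emlx_file in directory.glob("*.emlx"):
                raw = emlx_file.read_text(encoding="utf-8", errors="ignore")
                _length, _, message_text = raw.partition("\n")  # drop the length prefix
                tmp.write(f"From {emlx_file.name}\n{message_text}\n\n")
            tmp_path = tmp.name
        return mailbox.mbox(tmp_path)

    # for msg in emlx_dir_to_mbox(Path("/path/to/Messages")):  # hypothetical directory
    #     print(msg["Subject"])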
@@ -1,11 +1,11 @@
#!/usr/bin/env python3
"""Test only Faiss HNSW"""

import os
import sys
import time

import psutil
import gc
import os


def get_memory_usage():
@@ -37,20 +37,20 @@ def main():
import faiss
except ImportError:
print("Faiss is not installed.")
print(
"Please install it with `uv pip install faiss-cpu` and you can then run this script again"
)
print("Please install it with `uv pip install faiss-cpu` and you can then run this script again")
sys.exit(1)

from llama_index.core import (
Settings,
SimpleDirectoryReader,
StorageContext,
VectorStoreIndex,
StorageContext,
Settings,
node_parser,
Document,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

tracker = MemoryTracker("Faiss HNSW")
tracker.checkpoint("Initial")
@@ -90,9 +90,8 @@ def main():
vector_store=vector_store, persist_dir="./storage_faiss"
)
from llama_index.core import load_index_from_storage

index = load_index_from_storage(storage_context=storage_context)
print("Index loaded from ./storage_faiss")
print(f"Index loaded from ./storage_faiss")
tracker.checkpoint("After loading existing index")
index_loaded = True
except Exception as e:
@@ -100,18 +99,19 @@ def main():
print("Cleaning up corrupted index and building new one...")
# Clean up corrupted index
import shutil

if os.path.exists("./storage_faiss"):
shutil.rmtree("./storage_faiss")

if not index_loaded:
print("Building new Faiss HNSW index...")

# Use the correct Faiss building pattern from the example
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
documents, storage_context=storage_context, transformations=[node_parser]
documents,
storage_context=storage_context,
transformations=[node_parser]
)
tracker.checkpoint("After index building")

@@ -124,10 +124,10 @@ def main():
runtime_start_mem = get_memory_usage()
print(f"Before load memory: {runtime_start_mem:.1f} MB")
tracker.checkpoint("Before load memory")

query_engine = index.as_query_engine(similarity_top_k=20)
queries = [
"什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
"什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发",
"What is LEANN and how does it work?",
"华为诺亚方舟实验室的主要研究内容",
]
@@ -141,7 +141,7 @@ def main():

runtime_end_mem = get_memory_usage()
runtime_overhead = runtime_end_mem - runtime_start_mem

peak_memory = tracker.summary()
print(f"Peak Memory: {peak_memory:.1f} MB")
print(f"Runtime Memory Overhead: {runtime_overhead:.1f} MB")

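The memory figures printed by this test come from sampling the process's resident set size with psutil. A simplified stand-in for the get_memory_usage helper and MemoryTracker used above (not the script's exact implementation), assuming psutil is installed:

    import psutil

    def get_memory_usage() -> float:
        # Resident set size of the current process, in MB.
        return psutil.Process().memory_info().rss / 1024 / 1024

    class MemoryTracker:
        def __init__(self, name: str) -> None:
            self.name = name
            self.peak = 0.0

        def checkpoint(self, label: str) -> float:
            mem = get_memory_usage()
            self.peak = max(self.peak, mem)
            print(f"[{self.name}] {label}: {mem:.1f} MB")
            return mem

        def summary(self) -> float:
            return self.peak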
@@ -1,17 +1,15 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
import asyncio
|
||||
import argparse
|
||||
try:
|
||||
import dotenv
|
||||
|
||||
dotenv.load_dotenv()
|
||||
except ModuleNotFoundError:
|
||||
# python-dotenv is not installed; skip loading environment variables
|
||||
dotenv = None
|
||||
from pathlib import Path
|
||||
|
||||
from leann.api import LeannBuilder, LeannChat
|
||||
from typing import List, Any
|
||||
from leann.api import LeannBuilder, LeannSearcher, LeannChat
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
# dotenv.load_dotenv() # handled above if python-dotenv is available
|
||||
@@ -19,45 +17,42 @@ from llama_index.core.node_parser import SentenceSplitter
|
||||
# Default Chrome profile path
|
||||
DEFAULT_CHROME_PROFILE = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
|
||||
|
||||
|
||||
def create_leann_index_from_multiple_chrome_profiles(
|
||||
profile_dirs: list[Path], index_path: str = "chrome_history_index.leann", max_count: int = -1
|
||||
):
|
||||
def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], index_path: str = "chrome_history_index.leann", max_count: int = -1):
|
||||
"""
|
||||
Create LEANN index from multiple Chrome profile data sources.
|
||||
|
||||
|
||||
Args:
|
||||
profile_dirs: List of Path objects pointing to Chrome profile directories
|
||||
index_path: Path to save the LEANN index
|
||||
max_count: Maximum number of history entries to process per profile
|
||||
"""
|
||||
print("Creating LEANN index from multiple Chrome profile data sources...")
|
||||
|
||||
|
||||
# Load documents using ChromeHistoryReader from history_data
|
||||
from history_data.history import ChromeHistoryReader
|
||||
|
||||
reader = ChromeHistoryReader()
|
||||
|
||||
|
||||
INDEX_DIR = Path(index_path).parent
|
||||
|
||||
|
||||
if not INDEX_DIR.exists():
|
||||
print("--- Index directory not found, building new index ---")
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
all_documents = []
|
||||
total_processed = 0
|
||||
|
||||
|
||||
# Process each Chrome profile directory
|
||||
for i, profile_dir in enumerate(profile_dirs):
|
||||
print(f"\nProcessing Chrome profile {i + 1}/{len(profile_dirs)}: {profile_dir}")
|
||||
|
||||
print(f"\nProcessing Chrome profile {i+1}/{len(profile_dirs)}: {profile_dir}")
|
||||
|
||||
try:
|
||||
documents = reader.load_data(
|
||||
chrome_profile_path=str(profile_dir), max_count=max_count
|
||||
chrome_profile_path=str(profile_dir),
|
||||
max_count=max_count
|
||||
)
|
||||
if documents:
|
||||
print(f"Loaded {len(documents)} history documents from {profile_dir}")
|
||||
all_documents.extend(documents)
|
||||
total_processed += len(documents)
|
||||
|
||||
|
||||
# Check if we've reached the max count
|
||||
if max_count > 0 and total_processed >= max_count:
|
||||
print(f"Reached max count of {max_count} documents")
|
||||
@@ -67,22 +62,18 @@ def create_leann_index_from_multiple_chrome_profiles(
|
||||
except Exception as e:
|
||||
print(f"Error processing {profile_dir}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
if not all_documents:
|
||||
print("No documents loaded from any source. Exiting.")
|
||||
# highlight info that you need to close all chrome browser before running this script and high light the instruction!!
|
||||
print(
|
||||
"\033[91mYou need to close or quit all chrome browser before running this script\033[0m"
|
||||
)
|
||||
print("\033[91mYou need to close or quit all chrome browser before running this script\033[0m")
|
||||
return None
|
||||
|
||||
print(
|
||||
f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles"
|
||||
)
|
||||
|
||||
|
||||
print(f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles")
|
||||
|
||||
# Create text splitter with 256 chunk size
|
||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
|
||||
|
||||
|
||||
# Convert Documents to text strings and chunk them
|
||||
all_texts = []
|
||||
for doc in all_documents:
|
||||
@@ -92,48 +83,43 @@ def create_leann_index_from_multiple_chrome_profiles(
|
||||
text = node.get_content()
|
||||
# text = '[Title] ' + doc.metadata["title"] + '\n' + text
|
||||
all_texts.append(text)
|
||||
|
||||
|
||||
print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
|
||||
|
||||
|
||||
# Create LEANN index directory
|
||||
print("--- Index directory not found, building new index ---")
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Use HNSW backend for better macOS compatibility
|
||||
builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
embedding_model="facebook/contriever",
|
||||
graph_degree=32,
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
is_compact=True,
|
||||
is_recompute=True,
|
||||
num_threads=1, # Force single-threaded mode
|
||||
num_threads=1 # Force single-threaded mode
|
||||
)
|
||||
|
||||
print(f"Adding {len(all_texts)} history chunks to index...")
|
||||
for chunk_text in all_texts:
|
||||
builder.add_text(chunk_text)
|
||||
|
||||
|
||||
builder.build_index(index_path)
|
||||
print(f"\nLEANN index built at {index_path}!")
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
|
||||
return index_path
|
||||
|
||||
|
||||
def create_leann_index(
|
||||
profile_path: str | None = None,
|
||||
index_path: str = "chrome_history_index.leann",
|
||||
max_count: int = 1000,
|
||||
):
|
||||
def create_leann_index(profile_path: str = None, index_path: str = "chrome_history_index.leann", max_count: int = 1000):
|
||||
"""
|
||||
Create LEANN index from Chrome history data.
|
||||
|
||||
|
||||
Args:
|
||||
profile_path: Path to the Chrome profile directory (optional, uses default if None)
|
||||
index_path: Path to save the LEANN index
|
||||
@@ -141,31 +127,33 @@ def create_leann_index(
|
||||
"""
|
||||
print("Creating LEANN index from Chrome history data...")
|
||||
INDEX_DIR = Path(index_path).parent
|
||||
|
||||
|
||||
if not INDEX_DIR.exists():
|
||||
print("--- Index directory not found, building new index ---")
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Load documents using ChromeHistoryReader from history_data
|
||||
from history_data.history import ChromeHistoryReader
|
||||
|
||||
reader = ChromeHistoryReader()
|
||||
|
||||
documents = reader.load_data(chrome_profile_path=profile_path, max_count=max_count)
|
||||
|
||||
|
||||
documents = reader.load_data(
|
||||
chrome_profile_path=profile_path,
|
||||
max_count=max_count
|
||||
)
|
||||
|
||||
if not documents:
|
||||
print("No documents loaded. Exiting.")
|
||||
return None
|
||||
|
||||
|
||||
print(f"Loaded {len(documents)} history documents")
|
||||
|
||||
|
||||
# Create text splitter with 256 chunk size
|
||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
|
||||
|
||||
|
||||
# Convert Documents to text strings and chunk them
|
||||
all_texts = []
|
||||
for doc in documents:
|
||||
@@ -173,55 +161,54 @@ def create_leann_index(
|
||||
nodes = text_splitter.get_nodes_from_documents([doc])
|
||||
for node in nodes:
|
||||
all_texts.append(node.get_content())
|
||||
|
||||
|
||||
print(f"Created {len(all_texts)} text chunks from {len(documents)} documents")
|
||||
|
||||
|
||||
# Create LEANN index directory
|
||||
print("--- Index directory not found, building new index ---")
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Use HNSW backend for better macOS compatibility
|
||||
builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
embedding_model="facebook/contriever",
|
||||
graph_degree=32,
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
is_compact=True,
|
||||
is_recompute=True,
|
||||
num_threads=1, # Force single-threaded mode
|
||||
num_threads=1 # Force single-threaded mode
|
||||
)
|
||||
|
||||
print(f"Adding {len(all_texts)} history chunks to index...")
|
||||
for chunk_text in all_texts:
|
||||
builder.add_text(chunk_text)
|
||||
|
||||
|
||||
builder.build_index(index_path)
|
||||
print(f"\nLEANN index built at {index_path}!")
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
|
||||
return index_path
|
||||
|
||||
|
||||
async def query_leann_index(index_path: str, query: str):
|
||||
"""
|
||||
Query the LEANN index.
|
||||
|
||||
|
||||
Args:
|
||||
index_path: Path to the LEANN index
|
||||
query: The query string
|
||||
"""
|
||||
print("\n[PHASE 2] Starting Leann chat session...")
|
||||
print(f"\n[PHASE 2] Starting Leann chat session...")
|
||||
chat = LeannChat(index_path=index_path)
|
||||
|
||||
|
||||
print(f"You: {query}")
|
||||
chat_response = chat.ask(
|
||||
query,
|
||||
top_k=10,
|
||||
query,
|
||||
top_k=10,
|
||||
recompute_beighbor_embeddings=True,
|
||||
complexity=32,
|
||||
beam_width=1,
|
||||
@@ -230,60 +217,40 @@ async def query_leann_index(index_path: str, query: str):
|
||||
"model": "gpt-4o",
|
||||
"api_key": os.getenv("OPENAI_API_KEY"),
|
||||
},
|
||||
llm_kwargs={"temperature": 0.0, "max_tokens": 1000},
|
||||
llm_kwargs={
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 1000
|
||||
}
|
||||
)
|
||||
|
||||
print(f"Leann chat response: \033[36m{chat_response}\033[0m")
|
||||
|
||||
|
||||
async def main():
|
||||
# Parse command line arguments
|
||||
parser = argparse.ArgumentParser(
|
||||
description="LEANN Chrome History Reader - Create and query browser history index"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chrome-profile",
|
||||
type=str,
|
||||
default=DEFAULT_CHROME_PROFILE,
|
||||
help=f"Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index-dir",
|
||||
type=str,
|
||||
default="./google_history_index",
|
||||
help="Directory to store the LEANN index (default: ./chrome_history_index_leann_test)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-entries",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Maximum number of history entries to process (default: 1000)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--query",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Single query to run (default: runs example queries)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--auto-find-profiles",
|
||||
action="store_true",
|
||||
default=True,
|
||||
help="Automatically find all Chrome profiles (default: True)",
|
||||
)
|
||||
|
||||
parser = argparse.ArgumentParser(description='LEANN Chrome History Reader - Create and query browser history index')
|
||||
parser.add_argument('--chrome-profile', type=str, default=DEFAULT_CHROME_PROFILE,
|
||||
help=f'Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this')
|
||||
parser.add_argument('--index-dir', type=str, default="./google_history_index",
|
||||
help='Directory to store the LEANN index (default: ./chrome_history_index_leann_test)')
|
||||
parser.add_argument('--max-entries', type=int, default=1000,
|
||||
help='Maximum number of history entries to process (default: 1000)')
|
||||
parser.add_argument('--query', type=str, default=None,
|
||||
help='Single query to run (default: runs example queries)')
|
||||
parser.add_argument('--auto-find-profiles', action='store_true', default=True,
|
||||
help='Automatically find all Chrome profiles (default: True)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
INDEX_DIR = Path(args.index_dir)
|
||||
INDEX_PATH = str(INDEX_DIR / "chrome_history.leann")
|
||||
|
||||
|
||||
print(f"Using Chrome profile: {args.chrome_profile}")
|
||||
print(f"Index directory: {INDEX_DIR}")
|
||||
print(f"Max entries: {args.max_entries}")
|
||||
|
||||
|
||||
# Find Chrome profile directories
|
||||
from history_data.history import ChromeHistoryReader
|
||||
|
||||
|
||||
if args.auto_find_profiles:
|
||||
profile_dirs = ChromeHistoryReader.find_chrome_profiles()
|
||||
if not profile_dirs:
|
||||
@@ -296,12 +263,10 @@ async def main():
|
||||
print(f"Chrome profile not found: {profile_path}")
|
||||
return
|
||||
profile_dirs = [profile_path]
|
||||
|
||||
|
||||
# Create or load the LEANN index from all sources
|
||||
index_path = create_leann_index_from_multiple_chrome_profiles(
|
||||
profile_dirs, INDEX_PATH, args.max_entries
|
||||
)
|
||||
|
||||
index_path = create_leann_index_from_multiple_chrome_profiles(profile_dirs, INDEX_PATH, args.max_entries)
|
||||
|
||||
if index_path:
|
||||
if args.query:
|
||||
# Run single query
|
||||
@@ -310,13 +275,12 @@ async def main():
|
||||
# Example queries
|
||||
queries = [
|
||||
"What websites did I visit about machine learning?",
|
||||
"Find my search history about programming",
|
||||
"Find my search history about programming"
|
||||
]
|
||||
|
||||
|
||||
for query in queries:
|
||||
print("\n" + "=" * 60)
|
||||
print("\n" + "="*60)
|
||||
await query_leann_index(index_path, query)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
asyncio.run(main())
|
||||
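Both index-building paths in this script follow the same recipe: split each Document into roughly 256-token chunks with SentenceSplitter, add the chunk text to a LeannBuilder configured for the HNSW backend, then build. A condensed sketch using the same calls as the diff; the empty documents list is a placeholder to be filled via ChromeHistoryReader().load_data(...) as in the script:

    from leann.api import LeannBuilder
    from llama_index.core.node_parser import SentenceSplitter

    documents: list = []  # populate via ChromeHistoryReader().load_data(...)
    splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
    builder = LeannBuilder(
        backend_name="hnsw",
        embedding_model="facebook/contriever",
        graph_degree=32,
        complexity=64,
        is_compact=True,
        is_recompute=True,
        num_threads=1,  # single-threaded mode, as in the script
    )
    for doc in documents:
        for node in splitter.get_nodes_from_documents([doc]):
            builder.add_text(node.get_content())
    builder.build_index("./google_history_index/chrome_history.leann")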
@@ -1,3 +1,3 @@
|
||||
from .history import ChromeHistoryReader
|
||||
|
||||
__all__ = ["ChromeHistoryReader"]
|
||||
__all__ = ['ChromeHistoryReader']
|
||||
@@ -1,81 +1,77 @@
|
||||
import os
|
||||
import sqlite3
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from typing import List, Any
|
||||
from llama_index.core import Document
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
|
||||
|
||||
class ChromeHistoryReader(BaseReader):
|
||||
"""
|
||||
Chrome browser history reader that extracts browsing data from SQLite database.
|
||||
|
||||
|
||||
Reads Chrome history from the default Chrome profile location and creates documents
|
||||
with embedded metadata similar to the email reader structure.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize."""
|
||||
pass
|
||||
|
||||
def load_data(self, input_dir: str | None = None, **load_kwargs: Any) -> list[Document]:
|
||||
|
||||
def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
|
||||
"""
|
||||
Load Chrome history data from the default Chrome profile location.
|
||||
|
||||
|
||||
Args:
|
||||
input_dir: Not used for Chrome history (kept for compatibility)
|
||||
**load_kwargs:
|
||||
max_count (int): Maximum amount of history entries to read.
|
||||
chrome_profile_path (str): Custom path to Chrome profile directory.
|
||||
"""
|
||||
docs: list[Document] = []
|
||||
max_count = load_kwargs.get("max_count", 1000)
|
||||
chrome_profile_path = load_kwargs.get("chrome_profile_path", None)
|
||||
|
||||
docs: List[Document] = []
|
||||
max_count = load_kwargs.get('max_count', 1000)
|
||||
chrome_profile_path = load_kwargs.get('chrome_profile_path', None)
|
||||
|
||||
# Default Chrome profile path on macOS
|
||||
if chrome_profile_path is None:
|
||||
chrome_profile_path = os.path.expanduser(
|
||||
"~/Library/Application Support/Google/Chrome/Default"
|
||||
)
|
||||
|
||||
chrome_profile_path = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
|
||||
|
||||
history_db_path = os.path.join(chrome_profile_path, "History")
|
||||
|
||||
|
||||
if not os.path.exists(history_db_path):
|
||||
print(f"Chrome history database not found at: {history_db_path}")
|
||||
return docs
|
||||
|
||||
|
||||
try:
|
||||
# Connect to the Chrome history database
|
||||
print(f"Connecting to database: {history_db_path}")
|
||||
conn = sqlite3.connect(history_db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
|
||||
# Query to get browsing history with metadata (removed created_time column)
|
||||
query = """
|
||||
SELECT
|
||||
SELECT
|
||||
datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime') as last_visit,
|
||||
url,
|
||||
title,
|
||||
visit_count,
|
||||
typed_count,
|
||||
url,
|
||||
title,
|
||||
visit_count,
|
||||
typed_count,
|
||||
hidden
|
||||
FROM urls
|
||||
FROM urls
|
||||
ORDER BY last_visit_time DESC
|
||||
"""
|
||||
|
||||
|
||||
print(f"Executing query on database: {history_db_path}")
|
||||
cursor.execute(query)
|
||||
rows = cursor.fetchall()
|
||||
print(f"Query returned {len(rows)} rows")
|
||||
|
||||
|
||||
count = 0
|
||||
for row in rows:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
|
||||
last_visit, url, title, visit_count, typed_count, hidden = row
|
||||
|
||||
|
||||
# Create document content with metadata embedded in text
|
||||
doc_content = f"""
|
||||
[Title]: {title}
|
||||
@@ -84,38 +80,38 @@ class ChromeHistoryReader(BaseReader):
|
||||
[Visit times]: {visit_count}
|
||||
[Typed times]: {typed_count}
|
||||
"""
|
||||
|
||||
|
||||
# Create document with embedded metadata
|
||||
doc = Document(text=doc_content, metadata={"title": title[0:150]})
|
||||
doc = Document(text=doc_content, metadata={ "title": title[0:150]})
|
||||
# if len(title) > 150:
|
||||
# print(f"Title is too long: {title}")
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
|
||||
conn.close()
|
||||
print(f"Loaded {len(docs)} Chrome history documents")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading Chrome history: {e}")
|
||||
return docs
|
||||
|
||||
|
||||
return docs
|
||||
|
||||
@staticmethod
|
||||
def find_chrome_profiles() -> list[Path]:
|
||||
def find_chrome_profiles() -> List[Path]:
|
||||
"""
|
||||
Find all Chrome profile directories.
|
||||
|
||||
|
||||
Returns:
|
||||
List of Path objects pointing to Chrome profile directories
|
||||
"""
|
||||
chrome_base_path = Path(os.path.expanduser("~/Library/Application Support/Google/Chrome"))
|
||||
profile_dirs = []
|
||||
|
||||
|
||||
if not chrome_base_path.exists():
|
||||
print(f"Chrome directory not found at: {chrome_base_path}")
|
||||
return profile_dirs
|
||||
|
||||
|
||||
# Find all profile directories
|
||||
for profile_dir in chrome_base_path.iterdir():
|
||||
if profile_dir.is_dir() and profile_dir.name != "System Profile":
|
||||
@@ -123,59 +119,53 @@ class ChromeHistoryReader(BaseReader):
|
||||
if history_path.exists():
|
||||
profile_dirs.append(profile_dir)
|
||||
print(f"Found Chrome profile: {profile_dir}")
|
||||
|
||||
|
||||
print(f"Found {len(profile_dirs)} Chrome profiles")
|
||||
return profile_dirs
|
||||
|
||||
@staticmethod
|
||||
def export_history_to_file(
|
||||
output_file: str = "chrome_history_export.txt", max_count: int = 1000
|
||||
):
|
||||
def export_history_to_file(output_file: str = "chrome_history_export.txt", max_count: int = 1000):
|
||||
"""
|
||||
Export Chrome history to a text file using the same SQL query format.
|
||||
|
||||
|
||||
Args:
|
||||
output_file: Path to the output file
|
||||
max_count: Maximum number of entries to export
|
||||
"""
|
||||
chrome_profile_path = os.path.expanduser(
|
||||
"~/Library/Application Support/Google/Chrome/Default"
|
||||
)
|
||||
chrome_profile_path = os.path.expanduser("~/Library/Application Support/Google/Chrome/Default")
|
||||
history_db_path = os.path.join(chrome_profile_path, "History")
|
||||
|
||||
|
||||
if not os.path.exists(history_db_path):
|
||||
print(f"Chrome history database not found at: {history_db_path}")
|
||||
return
|
||||
|
||||
|
||||
try:
|
||||
conn = sqlite3.connect(history_db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
|
||||
query = """
|
||||
SELECT
|
||||
SELECT
|
||||
datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime') as last_visit,
|
||||
url,
|
||||
title,
|
||||
visit_count,
|
||||
typed_count,
|
||||
url,
|
||||
title,
|
||||
visit_count,
|
||||
typed_count,
|
||||
hidden
|
||||
FROM urls
|
||||
FROM urls
|
||||
ORDER BY last_visit_time DESC
|
||||
LIMIT ?
|
||||
"""
|
||||
|
||||
|
||||
cursor.execute(query, (max_count,))
|
||||
rows = cursor.fetchall()
|
||||
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
for row in rows:
|
||||
last_visit, url, title, visit_count, typed_count, hidden = row
|
||||
f.write(
|
||||
f"{last_visit}\t{url}\t{title}\t{visit_count}\t{typed_count}\t{hidden}\n"
|
||||
)
|
||||
|
||||
f.write(f"{last_visit}\t{url}\t{title}\t{visit_count}\t{typed_count}\t{hidden}\n")
|
||||
|
||||
conn.close()
|
||||
print(f"Exported {len(rows)} history entries to {output_file}")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error exporting Chrome history: {e}")
|
||||
print(f"Error exporting Chrome history: {e}")
|
||||
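The SQL in ChromeHistoryReader converts Chrome's timestamps inline: last_visit_time is stored as microseconds since 1601-01-01 (the Windows epoch), so dividing by 1,000,000 and subtracting 11,644,473,600 seconds yields a Unix timestamp, which SQLite's datetime(...,'unixepoch','localtime') renders as local time. The same conversion in Python, as a small sketch with a hypothetical raw value:

    from datetime import datetime

    WINDOWS_TO_UNIX_EPOCH_SECONDS = 11_644_473_600  # seconds from 1601-01-01 to 1970-01-01

    def chrome_time_to_datetime(last_visit_time: int) -> datetime:
        # Chrome stores visit times as microseconds since 1601-01-01 UTC.
        unix_seconds = last_visit_time / 1_000_000 - WINDOWS_TO_UNIX_EPOCH_SECONDS
        return datetime.fromtimestamp(unix_seconds)

    print(chrome_time_to_datetime(13_390_000_000_000_000))  # hypothetical value from the History DB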
@@ -2,31 +2,30 @@ import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from typing import List, Any, Dict, Optional
|
||||
from llama_index.core import Document
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
class WeChatHistoryReader(BaseReader):
|
||||
"""
|
||||
WeChat chat history reader that extracts chat data from exported JSON files.
|
||||
|
||||
|
||||
Reads WeChat chat history from exported JSON files (from wechat-exporter tool)
|
||||
and creates documents with embedded metadata similar to the Chrome history reader structure.
|
||||
|
||||
|
||||
Also includes utilities for automatic WeChat chat history export.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize."""
|
||||
self.packages_dir = Path(__file__).parent.parent.parent / "packages"
|
||||
self.wechat_exporter_dir = self.packages_dir / "wechat-exporter"
|
||||
self.wechat_decipher_dir = self.packages_dir / "wechat-decipher-macos"
|
||||
|
||||
|
||||
def check_wechat_running(self) -> bool:
|
||||
"""Check if WeChat is currently running."""
|
||||
try:
|
||||
@@ -34,30 +33,24 @@ class WeChatHistoryReader(BaseReader):
|
||||
return result.returncode == 0
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def install_wechattweak(self) -> bool:
|
||||
"""Install WeChatTweak CLI tool."""
|
||||
try:
|
||||
# Create wechat-exporter directory if it doesn't exist
|
||||
self.wechat_exporter_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
wechattweak_path = self.wechat_exporter_dir / "wechattweak-cli"
|
||||
if not wechattweak_path.exists():
|
||||
print("Downloading WeChatTweak CLI...")
|
||||
subprocess.run(
|
||||
[
|
||||
"curl",
|
||||
"-L",
|
||||
"-o",
|
||||
str(wechattweak_path),
|
||||
"https://github.com/JettChenT/WeChatTweak-CLI/releases/latest/download/wechattweak-cli",
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
|
||||
subprocess.run([
|
||||
"curl", "-L", "-o", str(wechattweak_path),
|
||||
"https://github.com/JettChenT/WeChatTweak-CLI/releases/latest/download/wechattweak-cli"
|
||||
], check=True)
|
||||
|
||||
# Make executable
|
||||
wechattweak_path.chmod(0o755)
|
||||
|
||||
|
||||
# Install WeChatTweak
|
||||
print("Installing WeChatTweak...")
|
||||
subprocess.run(["sudo", str(wechattweak_path), "install"], check=True)
|
||||
@@ -65,7 +58,7 @@ class WeChatHistoryReader(BaseReader):
|
||||
except Exception as e:
|
||||
print(f"Error installing WeChatTweak: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def restart_wechat(self):
|
||||
"""Restart WeChat to apply WeChatTweak."""
|
||||
try:
|
||||
@@ -76,325 +69,302 @@ class WeChatHistoryReader(BaseReader):
|
||||
time.sleep(5) # Wait for WeChat to start
|
||||
except Exception as e:
|
||||
print(f"Error restarting WeChat: {e}")
|
||||
|
||||
|
||||
def check_api_available(self) -> bool:
|
||||
"""Check if WeChatTweak API is available."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["curl", "-s", "http://localhost:48065/wechat/allcontacts"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
result = subprocess.run([
|
||||
"curl", "-s", "http://localhost:48065/wechat/allcontacts"
|
||||
], capture_output=True, text=True, timeout=5)
|
||||
return result.returncode == 0 and result.stdout.strip()
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
|
||||
|
||||
def _extract_readable_text(self, content: str) -> str:
|
||||
"""
|
||||
Extract readable text from message content, removing XML and system messages.
|
||||
|
||||
|
||||
Args:
|
||||
content: The raw message content (can be string or dict)
|
||||
|
||||
|
||||
Returns:
|
||||
Cleaned, readable text
|
||||
"""
|
||||
if not content:
|
||||
return ""
|
||||
|
||||
|
||||
# Handle dictionary content (like quoted messages)
|
||||
if isinstance(content, dict):
|
||||
# Extract text from dictionary structure
|
||||
text_parts = []
|
||||
if "title" in content:
|
||||
text_parts.append(str(content["title"]))
|
||||
if "quoted" in content:
|
||||
text_parts.append(str(content["quoted"]))
|
||||
if "content" in content:
|
||||
text_parts.append(str(content["content"]))
|
||||
if "text" in content:
|
||||
text_parts.append(str(content["text"]))
|
||||
|
||||
if 'title' in content:
|
||||
text_parts.append(str(content['title']))
|
||||
if 'quoted' in content:
|
||||
text_parts.append(str(content['quoted']))
|
||||
if 'content' in content:
|
||||
text_parts.append(str(content['content']))
|
||||
if 'text' in content:
|
||||
text_parts.append(str(content['text']))
|
||||
|
||||
if text_parts:
|
||||
return " | ".join(text_parts)
|
||||
else:
|
||||
# If we can't extract meaningful text from dict, return empty
|
||||
return ""
|
||||
|
||||
|
||||
# Handle string content
|
||||
if not isinstance(content, str):
|
||||
return ""
|
||||
|
||||
|
||||
# Remove common prefixes like "wxid_xxx:\n"
|
||||
clean_content = re.sub(r"^wxid_[^:]+:\s*", "", content)
|
||||
clean_content = re.sub(r"^[^:]+:\s*", "", clean_content)
|
||||
|
||||
clean_content = re.sub(r'^wxid_[^:]+:\s*', '', content)
|
||||
clean_content = re.sub(r'^[^:]+:\s*', '', clean_content)
|
||||
|
||||
# If it's just XML or system message, return empty
|
||||
if clean_content.strip().startswith("<") or "recalled a message" in clean_content:
|
||||
if clean_content.strip().startswith('<') or 'recalled a message' in clean_content:
|
||||
return ""
|
||||
|
||||
|
||||
return clean_content.strip()
|
||||
|
||||
|
||||
def _is_text_message(self, content: str) -> bool:
|
||||
"""
|
||||
Check if a message contains readable text content.
|
||||
|
||||
|
||||
Args:
|
||||
content: The message content (can be string or dict)
|
||||
|
||||
|
||||
Returns:
|
||||
True if the message contains readable text, False otherwise
|
||||
"""
|
||||
if not content:
|
||||
return False
|
||||
|
||||
|
||||
# Handle dictionary content
|
||||
if isinstance(content, dict):
|
||||
# Check if dict has any readable text fields
|
||||
text_fields = ["title", "quoted", "content", "text"]
|
||||
text_fields = ['title', 'quoted', 'content', 'text']
|
||||
for field in text_fields:
|
||||
if content.get(field):
|
||||
if field in content and content[field]:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
# Handle string content
|
||||
if not isinstance(content, str):
|
||||
return False
|
||||
|
||||
|
||||
# Skip image messages (contain XML with img tags)
|
||||
if "<img" in content and "cdnurl" in content:
|
||||
if '<img' in content and 'cdnurl' in content:
|
||||
return False
|
||||
|
||||
|
||||
# Skip emoji messages (contain emoji XML tags)
|
||||
if "<emoji" in content and "productid" in content:
|
||||
if '<emoji' in content and 'productid' in content:
|
||||
return False
|
||||
|
||||
|
||||
# Skip voice messages
|
||||
if "<voice" in content:
|
||||
if '<voice' in content:
|
||||
return False
|
||||
|
||||
|
||||
# Skip video messages
|
||||
if "<video" in content:
|
||||
if '<video' in content:
|
||||
return False
|
||||
|
||||
|
||||
# Skip file messages
|
||||
if "<appmsg" in content and "appid" in content:
|
||||
if '<appmsg' in content and 'appid' in content:
|
||||
return False
|
||||
|
||||
|
||||
# Skip system messages (like "recalled a message")
|
||||
if "recalled a message" in content:
|
||||
if 'recalled a message' in content:
|
||||
return False
|
||||
|
||||
|
||||
# Check if there's actual readable text (not just XML or system messages)
|
||||
# Remove common prefixes like "wxid_xxx:\n" and check for actual content
|
||||
clean_content = re.sub(r"^wxid_[^:]+:\s*", "", content)
|
||||
clean_content = re.sub(r"^[^:]+:\s*", "", clean_content)
|
||||
|
||||
clean_content = re.sub(r'^wxid_[^:]+:\s*', '', content)
|
||||
clean_content = re.sub(r'^[^:]+:\s*', '', clean_content)
|
||||
|
||||
# If after cleaning we have meaningful text, consider it readable
|
||||
if len(clean_content.strip()) > 0 and not clean_content.strip().startswith("<"):
|
||||
if len(clean_content.strip()) > 0 and not clean_content.strip().startswith('<'):
|
||||
return True
|
||||
|
||||
|
||||
return False
|
||||
|
||||
def _concatenate_messages(
|
||||
self,
|
||||
messages: list[dict],
|
||||
max_length: int = 128,
|
||||
time_window_minutes: int = 30,
|
||||
overlap_messages: int = 0,
|
||||
) -> list[dict]:
|
||||
|
||||
def _concatenate_messages(self, messages: List[Dict], max_length: int = 128,
|
||||
time_window_minutes: int = 30, overlap_messages: int = 0) -> List[Dict]:
|
||||
"""
|
||||
Concatenate messages based on length and time rules.
|
||||
|
||||
|
||||
Args:
|
||||
messages: List of message dictionaries
|
||||
max_length: Maximum length for concatenated message groups. Use -1 to disable length constraint.
|
||||
time_window_minutes: Time window in minutes to group messages together. Use -1 to disable time constraint.
|
||||
overlap_messages: Number of messages to overlap between consecutive groups
|
||||
|
||||
|
||||
Returns:
|
||||
List of concatenated message groups
|
||||
"""
|
||||
if not messages:
|
||||
return []
|
||||
|
||||
|
||||
concatenated_groups = []
|
||||
current_group = []
|
||||
current_length = 0
|
||||
last_timestamp = None
|
||||
|
||||
|
||||
for message in messages:
|
||||
# Extract message info
|
||||
content = message.get("content", "")
|
||||
message_text = message.get("message", "")
|
||||
create_time = message.get("createTime", 0)
|
||||
message.get("fromUser", "")
|
||||
message.get("toUser", "")
|
||||
message.get("isSentFromSelf", False)
|
||||
|
||||
content = message.get('content', '')
|
||||
message_text = message.get('message', '')
|
||||
create_time = message.get('createTime', 0)
|
||||
from_user = message.get('fromUser', '')
|
||||
to_user = message.get('toUser', '')
|
||||
is_sent_from_self = message.get('isSentFromSelf', False)
|
||||
|
||||
# Extract readable text
|
||||
readable_text = self._extract_readable_text(content)
|
||||
if not readable_text:
|
||||
readable_text = message_text
|
||||
|
||||
|
||||
# Skip empty messages
|
||||
if not readable_text.strip():
|
||||
continue
|
||||
|
||||
|
||||
# Check time window constraint (only if time_window_minutes != -1)
|
||||
if time_window_minutes != -1 and last_timestamp is not None and create_time > 0:
|
||||
time_diff_minutes = (create_time - last_timestamp) / 60
|
||||
if time_diff_minutes > time_window_minutes:
|
||||
# Time gap too large, start new group
|
||||
if current_group:
|
||||
concatenated_groups.append(
|
||||
{
|
||||
"messages": current_group,
|
||||
"total_length": current_length,
|
||||
"start_time": current_group[0].get("createTime", 0),
|
||||
"end_time": current_group[-1].get("createTime", 0),
|
||||
}
|
||||
)
|
||||
concatenated_groups.append({
|
||||
'messages': current_group,
|
||||
'total_length': current_length,
|
||||
'start_time': current_group[0].get('createTime', 0),
|
||||
'end_time': current_group[-1].get('createTime', 0)
|
||||
})
|
||||
# Keep last few messages for overlap
|
||||
if overlap_messages > 0 and len(current_group) > overlap_messages:
|
||||
current_group = current_group[-overlap_messages:]
|
||||
current_length = sum(
|
||||
len(
|
||||
self._extract_readable_text(msg.get("content", ""))
|
||||
or msg.get("message", "")
|
||||
)
|
||||
for msg in current_group
|
||||
)
|
||||
current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
|
||||
else:
|
||||
current_group = []
|
||||
current_length = 0
|
||||
|
||||
|
||||
# Check length constraint (only if max_length != -1)
|
||||
message_length = len(readable_text)
|
||||
if max_length != -1 and current_length + message_length > max_length and current_group:
|
||||
# Current group would exceed max length, save it and start new
|
||||
concatenated_groups.append(
|
||||
{
|
||||
"messages": current_group,
|
||||
"total_length": current_length,
|
||||
"start_time": current_group[0].get("createTime", 0),
|
||||
"end_time": current_group[-1].get("createTime", 0),
|
||||
}
|
||||
)
|
||||
concatenated_groups.append({
|
||||
'messages': current_group,
|
||||
'total_length': current_length,
|
||||
'start_time': current_group[0].get('createTime', 0),
|
||||
'end_time': current_group[-1].get('createTime', 0)
|
||||
})
|
||||
# Keep last few messages for overlap
|
||||
if overlap_messages > 0 and len(current_group) > overlap_messages:
|
||||
current_group = current_group[-overlap_messages:]
|
||||
current_length = sum(
|
||||
len(
|
||||
self._extract_readable_text(msg.get("content", ""))
|
||||
or msg.get("message", "")
|
||||
)
|
||||
for msg in current_group
|
||||
)
|
||||
current_length = sum(len(self._extract_readable_text(msg.get('content', '')) or msg.get('message', '')) for msg in current_group)
|
||||
else:
|
||||
current_group = []
|
||||
current_length = 0
|
||||
|
||||
|
||||
# Add message to current group
|
||||
current_group.append(message)
|
||||
current_length += message_length
|
||||
last_timestamp = create_time
|
||||
|
||||
|
||||
# Add the last group if it exists
|
||||
if current_group:
|
||||
concatenated_groups.append(
|
||||
{
|
||||
"messages": current_group,
|
||||
"total_length": current_length,
|
||||
"start_time": current_group[0].get("createTime", 0),
|
||||
"end_time": current_group[-1].get("createTime", 0),
|
||||
}
|
||||
)
|
||||
|
||||
concatenated_groups.append({
|
||||
'messages': current_group,
|
||||
'total_length': current_length,
|
||||
'start_time': current_group[0].get('createTime', 0),
|
||||
'end_time': current_group[-1].get('createTime', 0)
|
||||
})
|
||||
|
||||
return concatenated_groups
|
||||
|
||||
def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str:
|
||||
|
||||
def _create_concatenated_content(self, message_group: Dict, contact_name: str) -> str:
|
||||
"""
|
||||
Create concatenated content from a group of messages.
|
||||
|
||||
|
||||
Args:
|
||||
message_group: Dictionary containing messages and metadata
|
||||
contact_name: Name of the contact
|
||||
|
||||
|
||||
Returns:
|
||||
Formatted concatenated content
|
||||
"""
|
||||
messages = message_group["messages"]
|
||||
start_time = message_group["start_time"]
|
||||
end_time = message_group["end_time"]
|
||||
|
||||
messages = message_group['messages']
|
||||
start_time = message_group['start_time']
|
||||
end_time = message_group['end_time']
|
||||
|
||||
# Format timestamps
|
||||
if start_time:
|
||||
try:
|
||||
start_timestamp = datetime.fromtimestamp(start_time)
|
||||
start_time_str = start_timestamp.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, OSError):
|
||||
start_time_str = start_timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
start_time_str = str(start_time)
|
||||
else:
|
||||
start_time_str = "Unknown"
|
||||
|
||||
|
||||
if end_time:
|
||||
try:
|
||||
end_timestamp = datetime.fromtimestamp(end_time)
|
||||
end_time_str = end_timestamp.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, OSError):
|
||||
end_time_str = end_timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
end_time_str = str(end_time)
|
||||
else:
|
||||
end_time_str = "Unknown"
|
||||
|
||||
|
||||
# Build concatenated message content
|
||||
message_parts = []
|
||||
for message in messages:
|
||||
content = message.get("content", "")
|
||||
message_text = message.get("message", "")
|
||||
create_time = message.get("createTime", 0)
|
||||
is_sent_from_self = message.get("isSentFromSelf", False)
|
||||
|
||||
content = message.get('content', '')
|
||||
message_text = message.get('message', '')
|
||||
create_time = message.get('createTime', 0)
|
||||
is_sent_from_self = message.get('isSentFromSelf', False)
|
||||
|
||||
# Extract readable text
|
||||
readable_text = self._extract_readable_text(content)
|
||||
if not readable_text:
|
||||
readable_text = message_text
|
||||
|
||||
|
||||
# Format individual message
|
||||
if create_time:
|
||||
try:
|
||||
timestamp = datetime.fromtimestamp(create_time)
|
||||
# change to YYYY-MM-DD HH:MM:SS
|
||||
time_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, OSError):
|
||||
time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
time_str = str(create_time)
|
||||
else:
|
||||
time_str = "Unknown"
|
||||
|
||||
|
||||
sender = "[Me]" if is_sent_from_self else "[Contact]"
|
||||
message_parts.append(f"({time_str}) {sender}: {readable_text}")
|
||||
|
||||
|
||||
concatenated_text = "\n".join(message_parts)
|
||||
|
||||
|
||||
# Create final document content
|
||||
doc_content = f"""
|
||||
Contact: {contact_name}
|
||||
Time Range: {start_time_str} - {end_time_str}
|
||||
Messages ({len(messages)} messages, {message_group["total_length"]} chars):
|
||||
Messages ({len(messages)} messages, {message_group['total_length']} chars):
|
||||
|
||||
{concatenated_text}
|
||||
"""
|
||||
# TODO @yichuan give better format and rich info here!
|
||||
# TODO @yichuan give better format and rich info here!
|
||||
doc_content = f"""
|
||||
{concatenated_text}
|
||||
"""
|
||||
return doc_content, contact_name
|
||||
|
||||
def load_data(self, input_dir: str | None = None, **load_kwargs: Any) -> list[Document]:
|
||||
|
||||
def load_data(self, input_dir: str = None, **load_kwargs: Any) -> List[Document]:
|
||||
"""
|
||||
Load WeChat chat history data from exported JSON files.
|
||||
|
||||
|
||||
Args:
|
||||
input_dir: Directory containing exported WeChat JSON files
|
||||
**load_kwargs:
|
||||
@@ -406,103 +376,97 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars):
            time_window_minutes (int): Time window in minutes to group messages together (default: 30).
            overlap_messages (int): Number of messages to overlap between consecutive groups (default: 2).
    """
    docs: list[Document] = []
    max_count = load_kwargs.get("max_count", 1000)
    wechat_export_dir = load_kwargs.get("wechat_export_dir", None)
    include_non_text = load_kwargs.get("include_non_text", False)
    concatenate_messages = load_kwargs.get("concatenate_messages", False)
    # max_length and time_window_minutes are accepted but unused in this path;
    # the later _concatenate_messages call hard-codes its own values.
    load_kwargs.get("max_length", 1000)
    load_kwargs.get("time_window_minutes", 30)

# Default WeChat export path
|
||||
if wechat_export_dir is None:
|
||||
wechat_export_dir = "./wechat_export_test"
|
||||
|
||||
|
||||
if not os.path.exists(wechat_export_dir):
|
||||
print(f"WeChat export directory not found at: {wechat_export_dir}")
|
||||
return docs
|
||||
|
||||
|
||||
try:
|
||||
# Find all JSON files in the export directory
|
||||
json_files = list(Path(wechat_export_dir).glob("*.json"))
|
||||
print(f"Found {len(json_files)} WeChat chat history files")
|
||||
|
||||
|
||||
count = 0
|
||||
for json_file in json_files:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
|
||||
try:
|
||||
with open(json_file, encoding="utf-8") as f:
|
||||
with open(json_file, 'r', encoding='utf-8') as f:
|
||||
chat_data = json.load(f)
|
||||
|
||||
|
||||
# Extract contact name from filename
|
||||
contact_name = json_file.stem
|
||||
|
||||
|
||||
if concatenate_messages:
|
||||
# Filter messages to only include readable text messages
|
||||
readable_messages = []
|
||||
for message in chat_data:
|
||||
try:
|
||||
content = message.get("content", "")
|
||||
content = message.get('content', '')
|
||||
if not include_non_text and not self._is_text_message(content):
|
||||
continue
|
||||
|
||||
|
||||
readable_text = self._extract_readable_text(content)
|
||||
if not readable_text and not include_non_text:
|
||||
continue
|
||||
|
||||
|
||||
readable_messages.append(message)
|
||||
except Exception as e:
|
||||
print(f"Error processing message in {json_file}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
# Concatenate messages based on rules
|
||||
message_groups = self._concatenate_messages(
|
||||
readable_messages,
|
||||
max_length=-1,
|
||||
readable_messages,
|
||||
max_length=-1,
|
||||
time_window_minutes=-1,
|
||||
overlap_messages=0, # Keep 2 messages overlap between groups
|
||||
overlap_messages=0 # Keep 2 messages overlap between groups
|
||||
)
|
||||
|
||||
|
||||
# Create documents from concatenated groups
|
||||
for message_group in message_groups:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
doc_content, contact_name = self._create_concatenated_content(
|
||||
message_group, contact_name
|
||||
)
|
||||
doc = Document(
|
||||
text=doc_content, metadata={"contact_name": contact_name}
|
||||
)
|
||||
|
||||
doc_content, contact_name = self._create_concatenated_content(message_group, contact_name)
|
||||
doc = Document(text=doc_content, metadata={"contact_name": contact_name})
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
print(
|
||||
f"Created {len(message_groups)} concatenated message groups for {contact_name}"
|
||||
)
|
||||
|
||||
|
||||
print(f"Created {len(message_groups)} concatenated message groups for {contact_name}")
|
||||
|
||||
else:
|
||||
# Original single-message processing
|
||||
for message in chat_data:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
|
||||
# Extract message information
|
||||
message.get("fromUser", "")
|
||||
message.get("toUser", "")
|
||||
content = message.get("content", "")
|
||||
message_text = message.get("message", "")
|
||||
create_time = message.get("createTime", 0)
|
||||
is_sent_from_self = message.get("isSentFromSelf", False)
|
||||
|
||||
from_user = message.get('fromUser', '')
|
||||
to_user = message.get('toUser', '')
|
||||
content = message.get('content', '')
|
||||
message_text = message.get('message', '')
|
||||
create_time = message.get('createTime', 0)
|
||||
is_sent_from_self = message.get('isSentFromSelf', False)
|
||||
|
||||
# Handle content that might be dict or string
|
||||
try:
|
||||
# Check if this is a readable text message
|
||||
if not include_non_text and not self._is_text_message(content):
|
||||
continue
|
||||
|
||||
|
||||
# Extract readable text
|
||||
readable_text = self._extract_readable_text(content)
|
||||
if not readable_text and not include_non_text:
|
||||
@@ -511,17 +475,17 @@ Messages ({len(messages)} messages, {message_group["total_length"]} chars):
|
||||
# Skip messages that cause processing errors
|
||||
print(f"Error processing message in {json_file}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
# Convert timestamp to readable format
|
||||
if create_time:
|
||||
try:
|
||||
timestamp = datetime.fromtimestamp(create_time)
|
||||
time_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, OSError):
|
||||
time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
time_str = str(create_time)
|
||||
else:
|
||||
time_str = "Unknown"
|
||||
|
||||
|
||||
# Create document content with metadata header and contact info
|
||||
doc_content = f"""
|
||||
Contact: {contact_name}
|
||||
@@ -529,64 +493,57 @@ Is sent from self: {is_sent_from_self}
|
||||
Time: {time_str}
|
||||
Message: {readable_text if readable_text else message_text}
|
||||
"""
|
||||
|
||||
|
||||
# Create document with embedded metadata
|
||||
doc = Document(text=doc_content, metadata={})
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading {json_file}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
print(f"Loaded {len(docs)} WeChat chat documents")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading WeChat history: {e}")
|
||||
return docs
|
||||
|
||||
|
||||
return docs
|
||||
|
||||
@staticmethod
|
||||
def find_wechat_export_dirs() -> list[Path]:
|
||||
def find_wechat_export_dirs() -> List[Path]:
|
||||
"""
|
||||
Find all WeChat export directories.
|
||||
|
||||
|
||||
Returns:
|
||||
List of Path objects pointing to WeChat export directories
|
||||
"""
|
||||
export_dirs = []
|
||||
|
||||
|
||||
# Look for common export directory names
|
||||
possible_dirs = [
|
||||
Path("./wechat_export_test"),
|
||||
Path("./wechat_export"),
|
||||
Path("./wechat_chat_history"),
|
||||
Path("./chat_export"),
|
||||
Path("./chat_export")
|
||||
]
|
||||
|
||||
|
||||
for export_dir in possible_dirs:
|
||||
if export_dir.exists() and export_dir.is_dir():
|
||||
json_files = list(export_dir.glob("*.json"))
|
||||
if json_files:
|
||||
export_dirs.append(export_dir)
|
||||
print(
|
||||
f"Found WeChat export directory: {export_dir} with {len(json_files)} files"
|
||||
)
|
||||
|
||||
print(f"Found WeChat export directory: {export_dir} with {len(json_files)} files")
|
||||
|
||||
print(f"Found {len(export_dirs)} WeChat export directories")
|
||||
return export_dirs
|
||||
|
||||
@staticmethod
|
||||
def export_chat_to_file(
|
||||
output_file: str = "wechat_chat_export.txt",
|
||||
max_count: int = 1000,
|
||||
export_dir: str | None = None,
|
||||
include_non_text: bool = False,
|
||||
):
|
||||
def export_chat_to_file(output_file: str = "wechat_chat_export.txt", max_count: int = 1000, export_dir: str = None, include_non_text: bool = False):
|
||||
"""
|
||||
Export WeChat chat history to a text file.
|
||||
|
||||
|
||||
Args:
|
||||
output_file: Path to the output file
|
||||
max_count: Maximum number of entries to export
|
||||
@@ -595,36 +552,36 @@ Message: {readable_text if readable_text else message_text}
|
||||
"""
|
||||
if export_dir is None:
|
||||
export_dir = "./wechat_export_test"
|
||||
|
||||
|
||||
if not os.path.exists(export_dir):
|
||||
print(f"WeChat export directory not found at: {export_dir}")
|
||||
return
|
||||
|
||||
|
||||
try:
|
||||
json_files = list(Path(export_dir).glob("*.json"))
|
||||
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
count = 0
|
||||
for json_file in json_files:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
|
||||
try:
|
||||
with open(json_file, encoding="utf-8") as json_f:
|
||||
with open(json_file, 'r', encoding='utf-8') as json_f:
|
||||
chat_data = json.load(json_f)
|
||||
|
||||
|
||||
contact_name = json_file.stem
|
||||
f.write(f"\n=== Chat with {contact_name} ===\n")
|
||||
|
||||
|
||||
for message in chat_data:
|
||||
if count >= max_count and max_count > 0:
|
||||
break
|
||||
|
||||
from_user = message.get("fromUser", "")
|
||||
content = message.get("content", "")
|
||||
message_text = message.get("message", "")
|
||||
create_time = message.get("createTime", 0)
|
||||
|
||||
|
||||
from_user = message.get('fromUser', '')
|
||||
content = message.get('content', '')
|
||||
message_text = message.get('message', '')
|
||||
create_time = message.get('createTime', 0)
|
||||
|
||||
# Skip non-text messages unless requested
|
||||
if not include_non_text:
|
||||
reader = WeChatHistoryReader()
|
||||
@@ -634,90 +591,83 @@ Message: {readable_text if readable_text else message_text}
|
||||
if not readable_text:
|
||||
continue
|
||||
message_text = readable_text
|
||||
|
||||
|
||||
if create_time:
|
||||
try:
|
||||
timestamp = datetime.fromtimestamp(create_time)
|
||||
time_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except (ValueError, OSError):
|
||||
time_str = timestamp.strftime('%Y-%m-%d %H:%M:%S')
|
||||
except:
|
||||
time_str = str(create_time)
|
||||
else:
|
||||
time_str = "Unknown"
|
||||
|
||||
|
||||
f.write(f"[{time_str}] {from_user}: {message_text}\n")
|
||||
count += 1
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing {json_file}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
print(f"Exported {count} chat entries to {output_file}")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error exporting WeChat chat history: {e}")
|
||||
|
||||
def export_wechat_chat_history(self, export_dir: str = "./wechat_export_direct") -> Path | None:
|
||||
def export_wechat_chat_history(self, export_dir: str = "./wechat_export_direct") -> Optional[Path]:
|
||||
"""
|
||||
Export WeChat chat history using wechat-exporter tool.
|
||||
|
||||
|
||||
Args:
|
||||
export_dir: Directory to save exported chat history
|
||||
|
||||
|
||||
Returns:
|
||||
Path to export directory if successful, None otherwise
|
||||
"""
|
||||
try:
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
# Create export directory
|
||||
export_path = Path(export_dir)
|
||||
export_path.mkdir(exist_ok=True)
|
||||
|
||||
|
||||
print(f"Exporting WeChat chat history to {export_path}...")
|
||||
|
||||
|
||||
# Check if wechat-exporter directory exists
|
||||
if not self.wechat_exporter_dir.exists():
|
||||
print(f"wechat-exporter directory not found at: {self.wechat_exporter_dir}")
|
||||
return None
|
||||
|
||||
|
||||
# Install requirements if needed
|
||||
requirements_file = self.wechat_exporter_dir / "requirements.txt"
|
||||
if requirements_file.exists():
|
||||
print("Installing wechat-exporter requirements...")
|
||||
subprocess.run(["uv", "pip", "install", "-r", str(requirements_file)], check=True)
|
||||
|
||||
subprocess.run([
|
||||
"uv", "pip", "install", "-r", str(requirements_file)
|
||||
], check=True)
|
||||
|
||||
# Run the export command
|
||||
print("Running wechat-exporter...")
|
||||
result = subprocess.run(
|
||||
[
|
||||
sys.executable,
|
||||
str(self.wechat_exporter_dir / "main.py"),
|
||||
"export-all",
|
||||
str(export_path),
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run([
|
||||
sys.executable, str(self.wechat_exporter_dir / "main.py"),
|
||||
"export-all", str(export_path)
|
||||
], capture_output=True, text=True, check=True)
|
||||
|
||||
print("Export command output:")
|
||||
print(result.stdout)
|
||||
if result.stderr:
|
||||
print("Export errors:")
|
||||
print(result.stderr)
|
||||
|
||||
|
||||
# Check if export was successful
|
||||
if export_path.exists() and any(export_path.glob("*.json")):
|
||||
json_files = list(export_path.glob("*.json"))
|
||||
print(
|
||||
f"Successfully exported {len(json_files)} chat history files to {export_path}"
|
||||
)
|
||||
print(f"Successfully exported {len(json_files)} chat history files to {export_path}")
|
||||
return export_path
|
||||
else:
|
||||
print("Export completed but no JSON files found")
|
||||
return None
|
||||
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Export command failed: {e}")
|
||||
print(f"Command output: {e.stdout}")
|
||||
@@ -728,18 +678,18 @@ Message: {readable_text if readable_text else message_text}
|
||||
print("Please ensure WeChat is running and WeChatTweak is installed.")
|
||||
return None
|
||||
|
||||
def find_or_export_wechat_data(self, export_dir: str = "./wechat_export_direct") -> list[Path]:
|
||||
def find_or_export_wechat_data(self, export_dir: str = "./wechat_export_direct") -> List[Path]:
|
||||
"""
|
||||
Find existing WeChat exports or create new ones.
|
||||
|
||||
|
||||
Args:
|
||||
export_dir: Directory to save exported chat history if needed
|
||||
|
||||
|
||||
Returns:
|
||||
List of Path objects pointing to WeChat export directories
|
||||
"""
|
||||
export_dirs = []
|
||||
|
||||
|
||||
# Look for existing exports in common locations
|
||||
possible_export_dirs = [
|
||||
Path("./wechat_database_export"),
|
||||
@@ -747,25 +697,23 @@ Message: {readable_text if readable_text else message_text}
|
||||
Path("./wechat_export"),
|
||||
Path("./wechat_export_direct"),
|
||||
Path("./wechat_chat_history"),
|
||||
Path("./chat_export"),
|
||||
Path("./chat_export")
|
||||
]
|
||||
|
||||
|
||||
for export_dir_path in possible_export_dirs:
|
||||
if export_dir_path.exists() and any(export_dir_path.glob("*.json")):
|
||||
export_dirs.append(export_dir_path)
|
||||
print(f"Found existing export: {export_dir_path}")
|
||||
|
||||
|
||||
# If no existing exports, try to export automatically
|
||||
if not export_dirs:
|
||||
print("No existing WeChat exports found. Starting direct export...")
|
||||
|
||||
|
||||
# Try to export using wechat-exporter
|
||||
exported_path = self.export_wechat_chat_history(export_dir)
|
||||
if exported_path:
|
||||
export_dirs = [exported_path]
|
||||
else:
|
||||
print(
|
||||
"Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed."
|
||||
)
|
||||
|
||||
return export_dirs
|
||||
print("Failed to export WeChat data. Please ensure WeChat is running and WeChatTweak is installed.")
|
||||
|
||||
return export_dirs
|
||||
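A minimal usage sketch of the WeChatHistoryReader defined above (hedged: it assumes the class is importable from wherever this module lives in the repo; the export directory names are the defaults used in the code, and the load_data kwargs are the ones documented in its docstring):

reader = WeChatHistoryReader()
export_dirs = reader.find_or_export_wechat_data("./wechat_export_direct")
docs = reader.load_data(
    wechat_export_dir=str(export_dirs[0]) if export_dirs else "./wechat_export_test",
    concatenate_messages=True,  # group messages into time-window documents
    include_non_text=False,     # keep only readable text messages
    max_count=1000,
)
print(f"Loaded {len(docs)} WeChat documents")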
@@ -1,42 +1,33 @@
import argparse
import asyncio
import os
import sys
from pathlib import Path

import dotenv

# Add the project root to Python path so we can import from examples
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from leann.api import LeannBuilder, LeannChat
from llama_index.core.node_parser import SentenceSplitter

dotenv.load_dotenv()

# Auto-detect user's mail path
|
||||
def get_mail_path():
|
||||
"""Get the mail path for the current user"""
|
||||
home_dir = os.path.expanduser("~")
|
||||
return os.path.join(home_dir, "Library", "Mail")
|
||||
|
||||
|
||||
# Default mail path for macOS
|
||||
DEFAULT_MAIL_PATH = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data"
|
||||
|
||||
|
||||
def create_leann_index_from_multiple_sources(
|
||||
messages_dirs: list[Path],
|
||||
index_path: str = "mail_index.leann",
|
||||
max_count: int = -1,
|
||||
include_html: bool = False,
|
||||
embedding_model: str = "facebook/contriever",
|
||||
):
|
||||
def create_leann_index_from_multiple_sources(messages_dirs: List[Path], index_path: str = "mail_index.leann", max_count: int = -1, include_html: bool = False, embedding_model: str = "facebook/contriever"):
|
||||
"""
|
||||
Create LEANN index from multiple mail data sources.
|
||||
|
||||
|
||||
Args:
|
||||
messages_dirs: List of Path objects pointing to Messages directories
|
||||
index_path: Path to save the LEANN index
|
||||
@@ -44,32 +35,31 @@ def create_leann_index_from_multiple_sources(
|
||||
include_html: Whether to include HTML content in email processing
|
||||
"""
|
||||
print("Creating LEANN index from multiple mail data sources...")
|
||||
|
||||
|
||||
# Load documents using EmlxReader from LEANN_email_reader
|
||||
from examples.email_data.LEANN_email_reader import EmlxReader
|
||||
|
||||
reader = EmlxReader(include_html=include_html)
|
||||
# from email_data.email import EmlxMboxReader
|
||||
# from pathlib import Path
|
||||
# reader = EmlxMboxReader()
|
||||
INDEX_DIR = Path(index_path).parent
|
||||
|
||||
|
||||
if not INDEX_DIR.exists():
|
||||
print("--- Index directory not found, building new index ---")
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
all_documents = []
|
||||
total_processed = 0
|
||||
|
||||
|
||||
# Process each Messages directory
|
||||
for i, messages_dir in enumerate(messages_dirs):
|
||||
print(f"\nProcessing Messages directory {i + 1}/{len(messages_dirs)}: {messages_dir}")
|
||||
|
||||
print(f"\nProcessing Messages directory {i+1}/{len(messages_dirs)}: {messages_dir}")
|
||||
|
||||
try:
|
||||
documents = reader.load_data(messages_dir)
|
||||
if documents:
|
||||
print(f"Loaded {len(documents)} email documents from {messages_dir}")
|
||||
all_documents.extend(documents)
|
||||
total_processed += len(documents)
|
||||
|
||||
|
||||
# Check if we've reached the max count
|
||||
if max_count > 0 and total_processed >= max_count:
|
||||
print(f"Reached max count of {max_count} documents")
|
||||
@@ -79,18 +69,16 @@ def create_leann_index_from_multiple_sources(
|
||||
except Exception as e:
|
||||
print(f"Error processing {messages_dir}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
if not all_documents:
|
||||
print("No documents loaded from any source. Exiting.")
|
||||
return None
|
||||
|
||||
print(
|
||||
f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories and starting to split them into chunks"
|
||||
)
|
||||
|
||||
|
||||
print(f"\nTotal loaded {len(all_documents)} email documents from {len(messages_dirs)} directories and starting to split them into chunks")
|
||||
|
||||
# Create text splitter with 256 chunk size
|
||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
|
||||
|
||||
|
||||
# Convert Documents to text strings and chunk them
|
||||
all_texts = []
|
||||
for doc in all_documents:
|
||||
@@ -100,53 +88,44 @@ def create_leann_index_from_multiple_sources(
|
||||
text = node.get_content()
|
||||
# text = '[subject] ' + doc.metadata["subject"] + '\n' + text
|
||||
all_texts.append(text)
|
||||
|
||||
print(
|
||||
f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks"
|
||||
)
|
||||
|
||||
|
||||
print(f"Finished splitting {len(all_documents)} documents into {len(all_texts)} text chunks")
|
||||
|
||||
# Create LEANN index directory
|
||||
|
||||
print("--- Index directory not found, building new index ---")
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Use HNSW backend for better macOS compatibility
|
||||
builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
embedding_model=embedding_model,
|
||||
graph_degree=32,
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
is_compact=True,
|
||||
is_recompute=True,
|
||||
num_threads=1, # Force single-threaded mode
|
||||
num_threads=1 # Force single-threaded mode
|
||||
)
|
||||
|
||||
print(f"Adding {len(all_texts)} email chunks to index...")
|
||||
for chunk_text in all_texts:
|
||||
builder.add_text(chunk_text)
|
||||
|
||||
|
||||
builder.build_index(index_path)
|
||||
print(f"\nLEANN index built at {index_path}!")
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
|
||||
return index_path
|
||||
|
||||
|
||||
def create_leann_index(
|
||||
mail_path: str,
|
||||
index_path: str = "mail_index.leann",
|
||||
max_count: int = 1000,
|
||||
include_html: bool = False,
|
||||
embedding_model: str = "facebook/contriever",
|
||||
):
|
||||
def create_leann_index(mail_path: str, index_path: str = "mail_index.leann", max_count: int = 1000, include_html: bool = False, embedding_model: str = "facebook/contriever"):
|
||||
"""
|
||||
Create LEANN index from mail data.
|
||||
|
||||
|
||||
Args:
|
||||
mail_path: Path to the mail directory
|
||||
index_path: Path to save the LEANN index
|
||||
@@ -155,33 +134,32 @@ def create_leann_index(
|
||||
"""
|
||||
print("Creating LEANN index from mail data...")
|
||||
INDEX_DIR = Path(index_path).parent
|
||||
|
||||
|
||||
if not INDEX_DIR.exists():
|
||||
print("--- Index directory not found, building new index ---")
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Load documents using EmlxReader from LEANN_email_reader
|
||||
from examples.email_data.LEANN_email_reader import EmlxReader
|
||||
|
||||
reader = EmlxReader(include_html=include_html)
|
||||
# from email_data.email import EmlxMboxReader
|
||||
# from pathlib import Path
|
||||
# reader = EmlxMboxReader()
|
||||
documents = reader.load_data(Path(mail_path))
|
||||
|
||||
|
||||
if not documents:
|
||||
print("No documents loaded. Exiting.")
|
||||
return None
|
||||
|
||||
|
||||
print(f"Loaded {len(documents)} email documents")
|
||||
|
||||
|
||||
# Create text splitter with 256 chunk size
|
||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
|
||||
|
||||
|
||||
# Convert Documents to text strings and chunk them
|
||||
all_texts = []
|
||||
for doc in documents:
|
||||
@@ -189,135 +167,111 @@ def create_leann_index(
|
||||
nodes = text_splitter.get_nodes_from_documents([doc])
|
||||
for node in nodes:
|
||||
all_texts.append(node.get_content())
|
||||
|
||||
|
||||
print(f"Created {len(all_texts)} text chunks from {len(documents)} documents")
|
||||
|
||||
|
||||
# Create LEANN index directory
|
||||
|
||||
print("--- Index directory not found, building new index ---")
|
||||
print(f"--- Index directory not found, building new index ---")
|
||||
INDEX_DIR.mkdir(exist_ok=True)
|
||||
|
||||
print("--- Building new LEANN index ---")
|
||||
|
||||
print("\n[PHASE 1] Building Leann index...")
|
||||
print(f"--- Building new LEANN index ---")
|
||||
|
||||
print(f"\n[PHASE 1] Building Leann index...")
|
||||
|
||||
# Use HNSW backend for better macOS compatibility
|
||||
builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
embedding_model=embedding_model,
|
||||
graph_degree=32,
|
||||
graph_degree=32,
|
||||
complexity=64,
|
||||
is_compact=True,
|
||||
is_recompute=True,
|
||||
num_threads=1, # Force single-threaded mode
|
||||
num_threads=1 # Force single-threaded mode
|
||||
)
|
||||
|
||||
print(f"Adding {len(all_texts)} email chunks to index...")
|
||||
for chunk_text in all_texts:
|
||||
builder.add_text(chunk_text)
|
||||
|
||||
|
||||
builder.build_index(index_path)
|
||||
print(f"\nLEANN index built at {index_path}!")
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
|
||||
return index_path
|
||||
|
||||
|
||||
async def query_leann_index(index_path: str, query: str):
    """
    Query the LEANN index.

    Args:
        index_path: Path to the LEANN index
        query: The query string
    """
    print("\n[PHASE 2] Starting Leann chat session...")
    chat = LeannChat(index_path=index_path, llm_config={"type": "openai", "model": "gpt-4o"})

    print(f"You: {query}")
    import time

    # Timing calls retained from the original; their return values are currently discarded.
    time.time()
    chat_response = chat.ask(
        query,
        top_k=20,
        recompute_beighbor_embeddings=True,
        complexity=32,
        beam_width=1,
    )
    time.time()
    # print(f"Time taken: {end_time - start_time} seconds")
    # highlight the answer
    print(f"Leann chat response: \033[36m{chat_response}\033[0m")

|
||||
|
||||
|
||||
async def main():
|
||||
# Parse command line arguments
|
||||
parser = argparse.ArgumentParser(description="LEANN Mail Reader - Create and query email index")
|
||||
parser = argparse.ArgumentParser(description='LEANN Mail Reader - Create and query email index')
|
||||
# Remove --mail-path argument and auto-detect all Messages directories
|
||||
# Remove DEFAULT_MAIL_PATH
|
||||
parser.add_argument(
|
||||
"--index-dir",
|
||||
type=str,
|
||||
default="./mail_index",
|
||||
help="Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-emails",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Maximum number of emails to process (-1 means all)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--query",
|
||||
type=str,
|
||||
default="Give me some funny advertisement about apple or other companies",
|
||||
help="Single query to run (default: runs example queries)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--include-html",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Include HTML content in email processing (default: False)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--embedding-model",
|
||||
type=str,
|
||||
default="facebook/contriever",
|
||||
help="Embedding model to use (default: facebook/contriever)",
|
||||
)
|
||||
|
||||
parser.add_argument('--index-dir', type=str, default="./mail_index",
|
||||
help='Directory to store the LEANN index (default: ./mail_index_leann_raw_text_all_dicts)')
|
||||
parser.add_argument('--max-emails', type=int, default=1000,
|
||||
help='Maximum number of emails to process (-1 means all)')
|
||||
parser.add_argument('--query', type=str, default="Give me some funny advertisement about apple or other companies",
|
||||
help='Single query to run (default: runs example queries)')
|
||||
parser.add_argument('--include-html', action='store_true', default=False,
|
||||
help='Include HTML content in email processing (default: False)')
|
||||
parser.add_argument('--embedding-model', type=str, default="facebook/contriever",
|
||||
help='Embedding model to use (default: facebook/contriever)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"args: {args}")
|
||||
|
||||
|
||||
# Automatically find all Messages directories under the current user's Mail directory
|
||||
from examples.email_data.LEANN_email_reader import find_all_messages_directories
|
||||
|
||||
mail_path = get_mail_path()
|
||||
print(f"Searching for email data in: {mail_path}")
|
||||
messages_dirs = find_all_messages_directories(mail_path)
|
||||
# messages_dirs = find_all_messages_directories(DEFAULT_MAIL_PATH)
|
||||
# messages_dirs = [DEFAULT_MAIL_PATH]
|
||||
# messages_dirs = messages_dirs[:1]
|
||||
|
||||
print("len(messages_dirs): ", len(messages_dirs))
|
||||
|
||||
|
||||
print('len(messages_dirs): ', len(messages_dirs))
|
||||
|
||||
|
||||
if not messages_dirs:
|
||||
print("No Messages directories found. Exiting.")
|
||||
return
|
||||
|
||||
|
||||
INDEX_DIR = Path(args.index_dir)
|
||||
INDEX_PATH = str(INDEX_DIR / "mail_documents.leann")
|
||||
print(f"Index directory: {INDEX_DIR}")
|
||||
print(f"Found {len(messages_dirs)} Messages directories.")
|
||||
|
||||
|
||||
# Create or load the LEANN index from all sources
|
||||
index_path = create_leann_index_from_multiple_sources(
|
||||
messages_dirs, INDEX_PATH, args.max_emails, args.include_html, args.embedding_model
|
||||
)
|
||||
|
||||
index_path = create_leann_index_from_multiple_sources(messages_dirs, INDEX_PATH, args.max_emails, args.include_html, args.embedding_model)
|
||||
|
||||
if index_path:
|
||||
if args.query:
|
||||
# Run single query
|
||||
@@ -327,12 +281,11 @@ async def main():
|
||||
queries = [
|
||||
"Hows Berkeley Graduate Student Instructor",
|
||||
"how's the icloud related advertisement saying",
|
||||
"Whats the number of class recommend to take per semester for incoming EECS students",
|
||||
"Whats the number of class recommend to take per semester for incoming EECS students"
|
||||
]
|
||||
for query in queries:
|
||||
print("\n" + "=" * 60)
|
||||
print("\n" + "="*60)
|
||||
await query_leann_index(index_path, query)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
asyncio.run(main())
|
||||
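A hedged sketch of driving the mail pipeline above programmatically rather than through argparse (it reuses only names defined or imported in this script; the index path and query string are placeholders):

import asyncio

messages_dirs = find_all_messages_directories(get_mail_path())
index_path = create_leann_index_from_multiple_sources(
    messages_dirs,
    index_path="./mail_index/mail_documents.leann",  # placeholder location
    max_count=1000,
    embedding_model="facebook/contriever",
)
if index_path:
    asyncio.run(query_leann_index(index_path, "how's the icloud related advertisement saying"))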
@@ -1,30 +1,26 @@
import argparse
import os
import sys
from pathlib import Path

# Add the project root to Python path so we can import from examples
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

import torch
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter

# --- EMBEDDING MODEL ---
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# --- END EMBEDDING MODEL ---

# Import EmlxReader from the new module
from examples.email_data.LEANN_email_reader import EmlxReader


def create_and_save_index(
|
||||
mail_path: str,
|
||||
save_dir: str = "mail_index_embedded",
|
||||
max_count: int = 1000,
|
||||
include_html: bool = False,
|
||||
):
|
||||
def create_and_save_index(mail_path: str, save_dir: str = "mail_index_embedded", max_count: int = 1000, include_html: bool = False):
|
||||
print("Creating index from mail data with embedded metadata...")
|
||||
documents = EmlxReader(include_html=include_html).load_data(mail_path, max_count=max_count)
|
||||
if not documents:
|
||||
@@ -34,7 +30,7 @@ def create_and_save_index(
|
||||
# Use facebook/contriever as the embedder
|
||||
embed_model = HuggingFaceEmbedding(model_name="facebook/contriever")
|
||||
# set on device
|
||||
|
||||
import torch
|
||||
if torch.cuda.is_available():
|
||||
embed_model._model.to("cuda")
|
||||
# set mps
|
||||
@@ -43,19 +39,21 @@ def create_and_save_index(
|
||||
else:
|
||||
embed_model._model.to("cpu")
|
||||
index = VectorStoreIndex.from_documents(
|
||||
documents, transformations=[text_splitter], embed_model=embed_model
|
||||
documents,
|
||||
transformations=[text_splitter],
|
||||
embed_model=embed_model
|
||||
)
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
index.storage_context.persist(persist_dir=save_dir)
|
||||
print(f"Index saved to {save_dir}")
|
||||
return index
|
||||
|
||||
|
||||
def load_index(save_dir: str = "mail_index_embedded"):
|
||||
try:
|
||||
storage_context = StorageContext.from_defaults(persist_dir=save_dir)
|
||||
index = VectorStoreIndex.from_vector_store(
|
||||
storage_context.vector_store, storage_context=storage_context
|
||||
storage_context.vector_store,
|
||||
storage_context=storage_context
|
||||
)
|
||||
print(f"Index loaded from {save_dir}")
|
||||
return index
|
||||
@@ -63,7 +61,6 @@ def load_index(save_dir: str = "mail_index_embedded"):
|
||||
print(f"Error loading index: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def query_index(index, query: str):
|
||||
if index is None:
|
||||
print("No index available for querying.")
|
||||
@@ -73,57 +70,39 @@ def query_index(index, query: str):
|
||||
print(f"Query: {query}")
|
||||
print(f"Response: {response}")
|
||||
|
||||
|
||||
def main():
|
||||
# Parse command line arguments
|
||||
parser = argparse.ArgumentParser(
|
||||
description="LlamaIndex Mail Reader - Create and query email index"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mail-path",
|
||||
type=str,
|
||||
default="/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages",
|
||||
help="Path to mail data directory",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save-dir",
|
||||
type=str,
|
||||
default="mail_index_embedded",
|
||||
help="Directory to store the index (default: mail_index_embedded)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-emails", type=int, default=10000, help="Maximum number of emails to process"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--include-html",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Include HTML content in email processing (default: False)",
|
||||
)
|
||||
|
||||
parser = argparse.ArgumentParser(description='LlamaIndex Mail Reader - Create and query email index')
|
||||
parser.add_argument('--mail-path', type=str,
|
||||
default="/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages",
|
||||
help='Path to mail data directory')
|
||||
parser.add_argument('--save-dir', type=str, default="mail_index_embedded",
|
||||
help='Directory to store the index (default: mail_index_embedded)')
|
||||
parser.add_argument('--max-emails', type=int, default=10000,
|
||||
help='Maximum number of emails to process')
|
||||
parser.add_argument('--include-html', action='store_true', default=False,
|
||||
help='Include HTML content in email processing (default: False)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
mail_path = args.mail_path
|
||||
save_dir = args.save_dir
|
||||
|
||||
|
||||
if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")):
|
||||
print("Loading existing index...")
|
||||
index = load_index(save_dir)
|
||||
else:
|
||||
print("Creating new index...")
|
||||
index = create_and_save_index(
|
||||
mail_path, save_dir, max_count=args.max_emails, include_html=args.include_html
|
||||
)
|
||||
index = create_and_save_index(mail_path, save_dir, max_count=args.max_emails, include_html=args.include_html)
|
||||
if index:
|
||||
queries = [
|
||||
"Hows Berkeley Graduate Student Instructor",
|
||||
"how's the icloud related advertisement saying",
|
||||
"Whats the number of class recommend to take per semester for incoming EECS students",
|
||||
"Whats the number of class recommend to take per semester for incoming EECS students"
|
||||
]
|
||||
for query in queries:
|
||||
print("\n" + "=" * 50)
|
||||
print("\n" + "="*50)
|
||||
query_index(index, query)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
@@ -1,11 +1,10 @@
import argparse
import asyncio
from pathlib import Path

import dotenv
from leann.api import LeannBuilder, LeannChat
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter

dotenv.load_dotenv()
|
||||
|
||||
@@ -57,7 +56,7 @@ async def main(args):
|
||||
else:
|
||||
print(f"--- Using existing index at {INDEX_DIR} ---")
|
||||
|
||||
print("\n[PHASE 2] Starting Leann chat session...")
|
||||
print(f"\n[PHASE 2] Starting Leann chat session...")
|
||||
|
||||
llm_config = {"type": "hf", "model": "Qwen/Qwen3-4B"}
|
||||
llm_config = {"type": "ollama", "model": "qwen3:8b"}
|
||||
@@ -65,7 +64,7 @@ async def main(args):
|
||||
|
||||
chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config)
|
||||
# query = (
|
||||
# "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发"
|
||||
# "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发"
|
||||
# )
|
||||
query = args.query
|
||||
|
||||
@@ -75,7 +74,9 @@ async def main(args):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Run Leann Chat with various LLM backends.")
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run Leann Chat with various LLM backends."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--llm",
|
||||
type=str,
|
||||
|
||||
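For reference, the three llm_config shapes that appear across this PR, collected in one hedged sketch (only the type and model keys are shown because those are the only keys visible in the diffs):

llm_config_hf = {"type": "hf", "model": "Qwen/Qwen3-4B"}      # local HuggingFace model
llm_config_ollama = {"type": "ollama", "model": "qwen3:8b"}   # local Ollama server
llm_config_openai = {"type": "openai", "model": "gpt-4o"}     # OpenAI API
chat = LeannChat(index_path=INDEX_PATH, llm_config=llm_config_ollama)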
@@ -14,55 +14,48 @@ Key features:
- Document-level result consolidation
"""

from collections import defaultdict
from dataclasses import dataclass
from typing import Any

import numpy as np


@dataclass
class PatchResult:
    """Represents a single patch search result."""

    patch_id: int
    image_name: str
    image_path: str
    coordinates: tuple[int, int, int, int]  # (x1, y1, x2, y2)
    score: float
    attention_score: float
    scale: float
    metadata: dict[str, Any]


@dataclass
class AggregatedResult:
    """Represents an aggregated document-level result."""

    image_name: str
    image_path: str
    doc_score: float
    patch_count: int
    best_patch: PatchResult
    all_patches: list[PatchResult]
    aggregation_method: str
    spatial_clusters: list[list[PatchResult]] | None = None


class MultiVectorAggregator:
|
||||
"""
|
||||
Aggregates multiple patch-level results into document-level results.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
aggregation_method: str = "maxsim",
|
||||
spatial_clustering: bool = True,
|
||||
cluster_distance_threshold: float = 100.0,
|
||||
):
|
||||
|
||||
def __init__(self,
|
||||
aggregation_method: str = "maxsim",
|
||||
spatial_clustering: bool = True,
|
||||
cluster_distance_threshold: float = 100.0):
|
||||
"""
|
||||
Initialize the aggregator.
|
||||
|
||||
|
||||
Args:
|
||||
aggregation_method: "maxsim", "voting", "weighted", or "mean"
|
||||
spatial_clustering: Whether to cluster spatially close patches
|
||||
@@ -71,23 +64,23 @@ class MultiVectorAggregator:
|
||||
self.aggregation_method = aggregation_method
|
||||
self.spatial_clustering = spatial_clustering
|
||||
self.cluster_distance_threshold = cluster_distance_threshold
|
||||
|
||||
def aggregate_results(
|
||||
self, search_results: list[dict[str, Any]], top_k: int = 10
|
||||
) -> list[AggregatedResult]:
|
||||
|
||||
def aggregate_results(self,
|
||||
search_results: List[Dict[str, Any]],
|
||||
top_k: int = 10) -> List[AggregatedResult]:
|
||||
"""
|
||||
Aggregate patch-level search results into document-level results.
|
||||
|
||||
|
||||
Args:
|
||||
search_results: List of search results from LeannSearcher
|
||||
top_k: Number of top documents to return
|
||||
|
||||
|
||||
Returns:
|
||||
List of aggregated document results
|
||||
"""
|
||||
# Group results by image
|
||||
image_groups = defaultdict(list)
|
||||
|
||||
|
||||
for result in search_results:
|
||||
metadata = result.metadata
|
||||
if "image_name" in metadata and "patch_id" in metadata:
|
||||
@@ -99,57 +92,55 @@ class MultiVectorAggregator:
|
||||
score=result.score,
|
||||
attention_score=metadata.get("attention_score", 0.0),
|
||||
scale=metadata.get("scale", 1.0),
|
||||
metadata=metadata,
|
||||
metadata=metadata
|
||||
)
|
||||
image_groups[metadata["image_name"]].append(patch_result)
|
||||
|
||||
|
||||
# Aggregate each image group
|
||||
aggregated_results = []
|
||||
for image_name, patches in image_groups.items():
|
||||
if len(patches) == 0:
|
||||
continue
|
||||
|
||||
|
||||
agg_result = self._aggregate_image_patches(image_name, patches)
|
||||
aggregated_results.append(agg_result)
|
||||
|
||||
|
||||
# Sort by aggregated score and return top-k
|
||||
aggregated_results.sort(key=lambda x: x.doc_score, reverse=True)
|
||||
return aggregated_results[:top_k]
|
||||
|
||||
def _aggregate_image_patches(
|
||||
self, image_name: str, patches: list[PatchResult]
|
||||
) -> AggregatedResult:
|
||||
|
||||
def _aggregate_image_patches(self, image_name: str, patches: List[PatchResult]) -> AggregatedResult:
|
||||
"""Aggregate patches for a single image."""
|
||||
|
||||
|
||||
if self.aggregation_method == "maxsim":
|
||||
doc_score = max(patch.score for patch in patches)
|
||||
best_patch = max(patches, key=lambda p: p.score)
|
||||
|
||||
|
||||
elif self.aggregation_method == "voting":
|
||||
# Count patches above threshold
|
||||
threshold = np.percentile([p.score for p in patches], 75)
|
||||
doc_score = sum(1 for patch in patches if patch.score >= threshold)
|
||||
best_patch = max(patches, key=lambda p: p.score)
|
||||
|
||||
|
||||
elif self.aggregation_method == "weighted":
|
||||
# Weight by attention scores
|
||||
total_weighted_score = sum(p.score * p.attention_score for p in patches)
|
||||
total_weights = sum(p.attention_score for p in patches)
|
||||
doc_score = total_weighted_score / max(total_weights, 1e-8)
|
||||
best_patch = max(patches, key=lambda p: p.score * p.attention_score)
|
||||
|
||||
|
||||
elif self.aggregation_method == "mean":
|
||||
doc_score = np.mean([patch.score for patch in patches])
|
||||
best_patch = max(patches, key=lambda p: p.score)
|
||||
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unknown aggregation method: {self.aggregation_method}")
|
||||
|
||||
|
||||
# Spatial clustering if enabled
|
||||
spatial_clusters = None
|
||||
if self.spatial_clustering:
|
||||
spatial_clusters = self._cluster_patches_spatially(patches)
|
||||
|
||||
|
||||
return AggregatedResult(
|
||||
image_name=image_name,
|
||||
image_path=patches[0].image_path,
|
||||
@@ -158,23 +149,23 @@ class MultiVectorAggregator:
|
||||
best_patch=best_patch,
|
||||
all_patches=sorted(patches, key=lambda p: p.score, reverse=True),
|
||||
aggregation_method=self.aggregation_method,
|
||||
spatial_clusters=spatial_clusters,
|
||||
spatial_clusters=spatial_clusters
|
||||
)
|
||||
|
||||
def _cluster_patches_spatially(self, patches: list[PatchResult]) -> list[list[PatchResult]]:
|
||||
|
||||
def _cluster_patches_spatially(self, patches: List[PatchResult]) -> List[List[PatchResult]]:
|
||||
"""Cluster patches that are spatially close to each other."""
|
||||
if len(patches) <= 1:
|
||||
return [patches]
|
||||
|
||||
|
||||
clusters = []
|
||||
remaining_patches = patches.copy()
|
||||
|
||||
|
||||
while remaining_patches:
|
||||
# Start new cluster with highest scoring remaining patch
|
||||
seed_patch = max(remaining_patches, key=lambda p: p.score)
|
||||
current_cluster = [seed_patch]
|
||||
remaining_patches.remove(seed_patch)
|
||||
|
||||
|
||||
# Add nearby patches to cluster
|
||||
added_to_cluster = True
|
||||
while added_to_cluster:
|
||||
@@ -184,175 +175,145 @@ class MultiVectorAggregator:
|
||||
current_cluster.append(patch)
|
||||
remaining_patches.remove(patch)
|
||||
added_to_cluster = True
|
||||
|
||||
|
||||
clusters.append(current_cluster)
|
||||
|
||||
|
||||
return sorted(clusters, key=lambda cluster: max(p.score for p in cluster), reverse=True)
|
||||
|
||||
def _is_patch_nearby(self, patch: PatchResult, cluster: list[PatchResult]) -> bool:
|
||||
|
||||
def _is_patch_nearby(self, patch: PatchResult, cluster: List[PatchResult]) -> bool:
|
||||
"""Check if a patch is spatially close to any patch in the cluster."""
|
||||
patch_center = self._get_patch_center(patch.coordinates)
|
||||
|
||||
|
||||
for cluster_patch in cluster:
|
||||
cluster_center = self._get_patch_center(cluster_patch.coordinates)
|
||||
distance = np.sqrt(
|
||||
(patch_center[0] - cluster_center[0]) ** 2
|
||||
+ (patch_center[1] - cluster_center[1]) ** 2
|
||||
)
|
||||
|
||||
distance = np.sqrt((patch_center[0] - cluster_center[0])**2 +
|
||||
(patch_center[1] - cluster_center[1])**2)
|
||||
|
||||
if distance <= self.cluster_distance_threshold:
|
||||
return True
|
||||
|
||||
|
||||
return False
|
||||
|
||||
def _get_patch_center(self, coordinates: tuple[int, int, int, int]) -> tuple[float, float]:
|
||||
|
||||
def _get_patch_center(self, coordinates: Tuple[int, int, int, int]) -> Tuple[float, float]:
|
||||
"""Get center point of a patch."""
|
||||
x1, y1, x2, y2 = coordinates
|
||||
return ((x1 + x2) / 2, (y1 + y2) / 2)
|
||||
|
||||
def print_aggregated_results(
|
||||
self, results: list[AggregatedResult], max_patches_per_doc: int = 3
|
||||
):
|
||||
|
||||
def print_aggregated_results(self, results: List[AggregatedResult], max_patches_per_doc: int = 3):
|
||||
"""Pretty print aggregated results."""
|
||||
print(f"\n🔍 Aggregated Results (method: {self.aggregation_method})")
|
||||
print("=" * 80)
|
||||
|
||||
|
||||
for i, result in enumerate(results):
|
||||
print(f"\n{i + 1}. {result.image_name}")
|
||||
print(f"\n{i+1}. {result.image_name}")
|
||||
print(f" Doc Score: {result.doc_score:.4f} | Patches: {result.patch_count}")
|
||||
print(f" Path: {result.image_path}")
|
||||
|
||||
|
||||
# Show best patch
|
||||
best = result.best_patch
|
||||
print(
|
||||
f" 🌟 Best Patch: #{best.patch_id} at {best.coordinates} (score: {best.score:.4f})"
|
||||
)
|
||||
|
||||
print(f" 🌟 Best Patch: #{best.patch_id} at {best.coordinates} (score: {best.score:.4f})")
|
||||
|
||||
# Show top patches
|
||||
print(" 📍 Top Patches:")
|
||||
print(f" 📍 Top Patches:")
|
||||
for j, patch in enumerate(result.all_patches[:max_patches_per_doc]):
|
||||
print(
|
||||
f" {j + 1}. Patch #{patch.patch_id}: {patch.score:.4f} at {patch.coordinates}"
|
||||
)
|
||||
|
||||
print(f" {j+1}. Patch #{patch.patch_id}: {patch.score:.4f} at {patch.coordinates}")
|
||||
|
||||
# Show spatial clusters if available
|
||||
if result.spatial_clusters and len(result.spatial_clusters) > 1:
|
||||
print(f" 🗂️ Spatial Clusters: {len(result.spatial_clusters)}")
|
||||
for j, cluster in enumerate(result.spatial_clusters[:2]): # Show top 2 clusters
|
||||
cluster_score = max(p.score for p in cluster)
|
||||
print(
|
||||
f" Cluster {j + 1}: {len(cluster)} patches (best: {cluster_score:.4f})"
|
||||
)
|
||||
|
||||
print(f" Cluster {j+1}: {len(cluster)} patches (best: {cluster_score:.4f})")
|
||||
|
||||
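A short usage sketch of MultiVectorAggregator on real search output (hedged: it assumes LeannSearcher.search() returns results whose metadata carries the image_name/patch_id/coordinates fields shown above, which is the same assumption the demo below makes; the index path and query are placeholders):

from leann.api import LeannSearcher

searcher = LeannSearcher("patch_index.leann")  # placeholder index path
patch_hits = searcher.search("kitchen with cats", top_k=50)

aggregator = MultiVectorAggregator(aggregation_method="maxsim", spatial_clustering=True)
doc_results = aggregator.aggregate_results(patch_hits, top_k=5)
aggregator.print_aggregated_results(doc_results, max_patches_per_doc=3)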
def demo_aggregation():
|
||||
"""Demonstrate the multi-vector aggregation functionality."""
|
||||
print("=== Multi-Vector Aggregation Demo ===")
|
||||
|
||||
|
||||
# Simulate some patch-level search results
|
||||
# In real usage, these would come from LeannSearcher.search()
|
||||
|
||||
|
||||
class MockResult:
|
||||
def __init__(self, score, metadata):
|
||||
self.score = score
|
||||
self.metadata = metadata
|
||||
|
||||
|
||||
# Simulate results for 2 images with multiple patches each
|
||||
mock_results = [
|
||||
# Image 1: cats_and_kitchen.jpg - 4 patches
|
||||
MockResult(
|
||||
0.85,
|
||||
{
|
||||
"image_name": "cats_and_kitchen.jpg",
|
||||
"image_path": "/path/to/cats_and_kitchen.jpg",
|
||||
"patch_id": 3,
|
||||
"coordinates": [100, 50, 224, 174], # Kitchen area
|
||||
"attention_score": 0.92,
|
||||
"scale": 1.0,
|
||||
},
|
||||
),
|
||||
MockResult(
|
||||
0.78,
|
||||
{
|
||||
"image_name": "cats_and_kitchen.jpg",
|
||||
"image_path": "/path/to/cats_and_kitchen.jpg",
|
||||
"patch_id": 7,
|
||||
"coordinates": [200, 300, 324, 424], # Cat area
|
||||
"attention_score": 0.88,
|
||||
"scale": 1.0,
|
||||
},
|
||||
),
|
||||
MockResult(
|
||||
0.72,
|
||||
{
|
||||
"image_name": "cats_and_kitchen.jpg",
|
||||
"image_path": "/path/to/cats_and_kitchen.jpg",
|
||||
"patch_id": 12,
|
||||
"coordinates": [150, 100, 274, 224], # Appliances
|
||||
"attention_score": 0.75,
|
||||
"scale": 1.0,
|
||||
},
|
||||
),
|
||||
MockResult(
|
||||
0.65,
|
||||
{
|
||||
"image_name": "cats_and_kitchen.jpg",
|
||||
"image_path": "/path/to/cats_and_kitchen.jpg",
|
||||
"patch_id": 15,
|
||||
"coordinates": [50, 250, 174, 374], # Furniture
|
||||
"attention_score": 0.70,
|
||||
"scale": 1.0,
|
||||
},
|
||||
),
|
||||
# Image 2: city_street.jpg - 3 patches
|
||||
MockResult(
|
||||
0.68,
|
||||
{
|
||||
"image_name": "city_street.jpg",
|
||||
"image_path": "/path/to/city_street.jpg",
|
||||
"patch_id": 2,
|
||||
"coordinates": [300, 100, 424, 224], # Buildings
|
||||
"attention_score": 0.80,
|
||||
"scale": 1.0,
|
||||
},
|
||||
),
|
||||
MockResult(
|
||||
0.62,
|
||||
{
|
||||
"image_name": "city_street.jpg",
|
||||
"image_path": "/path/to/city_street.jpg",
|
||||
"patch_id": 8,
|
||||
"coordinates": [100, 350, 224, 474], # Street level
|
||||
"attention_score": 0.75,
|
||||
"scale": 1.0,
|
||||
},
|
||||
),
|
||||
MockResult(
|
||||
0.55,
|
||||
{
|
||||
"image_name": "city_street.jpg",
|
||||
"image_path": "/path/to/city_street.jpg",
|
||||
"patch_id": 11,
|
||||
"coordinates": [400, 200, 524, 324], # Sky area
|
||||
"attention_score": 0.60,
|
||||
"scale": 1.0,
|
||||
},
|
||||
),
|
||||
MockResult(0.85, {
|
||||
"image_name": "cats_and_kitchen.jpg",
|
||||
"image_path": "/path/to/cats_and_kitchen.jpg",
|
||||
"patch_id": 3,
|
||||
"coordinates": [100, 50, 224, 174], # Kitchen area
|
||||
"attention_score": 0.92,
|
||||
"scale": 1.0
|
||||
}),
|
||||
MockResult(0.78, {
|
||||
"image_name": "cats_and_kitchen.jpg",
|
||||
"image_path": "/path/to/cats_and_kitchen.jpg",
|
||||
"patch_id": 7,
|
||||
"coordinates": [200, 300, 324, 424], # Cat area
|
||||
"attention_score": 0.88,
|
||||
"scale": 1.0
|
||||
}),
|
||||
MockResult(0.72, {
|
||||
"image_name": "cats_and_kitchen.jpg",
|
||||
"image_path": "/path/to/cats_and_kitchen.jpg",
|
||||
"patch_id": 12,
|
||||
"coordinates": [150, 100, 274, 224], # Appliances
|
||||
"attention_score": 0.75,
|
||||
"scale": 1.0
|
||||
}),
|
||||
MockResult(0.65, {
|
||||
"image_name": "cats_and_kitchen.jpg",
|
||||
"image_path": "/path/to/cats_and_kitchen.jpg",
|
||||
"patch_id": 15,
|
||||
"coordinates": [50, 250, 174, 374], # Furniture
|
||||
"attention_score": 0.70,
|
||||
"scale": 1.0
|
||||
}),
|
||||
|
||||
# Image 2: city_street.jpg - 3 patches
|
||||
MockResult(0.68, {
|
||||
"image_name": "city_street.jpg",
|
||||
"image_path": "/path/to/city_street.jpg",
|
||||
"patch_id": 2,
|
||||
"coordinates": [300, 100, 424, 224], # Buildings
|
||||
"attention_score": 0.80,
|
||||
"scale": 1.0
|
||||
}),
|
||||
MockResult(0.62, {
|
||||
"image_name": "city_street.jpg",
|
||||
"image_path": "/path/to/city_street.jpg",
|
||||
"patch_id": 8,
|
||||
"coordinates": [100, 350, 224, 474], # Street level
|
||||
"attention_score": 0.75,
|
||||
"scale": 1.0
|
||||
}),
|
||||
MockResult(0.55, {
|
||||
"image_name": "city_street.jpg",
|
||||
"image_path": "/path/to/city_street.jpg",
|
||||
"patch_id": 11,
|
||||
"coordinates": [400, 200, 524, 324], # Sky area
|
||||
"attention_score": 0.60,
|
||||
"scale": 1.0
|
||||
}),
|
||||
]
|
||||
|
||||
|
||||
# Test different aggregation methods
methods = ["maxsim", "voting", "weighted", "mean"]

for method in methods:
    print(f"\n{'=' * 20} {method.upper()} AGGREGATION {'=' * 20}")

    aggregator = MultiVectorAggregator(
        aggregation_method=method, spatial_clustering=True, cluster_distance_threshold=100.0
    )

    aggregated = aggregator.aggregate_results(mock_results, top_k=5)
    aggregator.print_aggregated_results(aggregated)


if __name__ == "__main__":
    demo_aggregation()

@@ -6,24 +6,22 @@ Complete example showing how to build and search with OpenAI embeddings using HN
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import dotenv
|
||||
from pathlib import Path
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
# Load environment variables
dotenv.load_dotenv()
def main():
# Check if OpenAI API key is available
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
print("ERROR: OPENAI_API_KEY environment variable not set")
return False
print(f"✅ OpenAI API key found: {api_key[:10]}...")
# Sample texts
sample_texts = [
"Machine learning is a powerful technology that enables computers to learn from data.",
@@ -35,15 +33,15 @@ def main():
"Artificial intelligence aims to create machines that can perform human-like tasks.",
"Python is a popular programming language used extensively in data science and AI.",
"Neural networks are inspired by the structure and function of the human brain.",
"Big data refers to extremely large datasets that require special tools to process.",
"Big data refers to extremely large datasets that require special tools to process."
]
INDEX_DIR = Path("./simple_openai_test_index")
INDEX_PATH = str(INDEX_DIR / "simple_test.leann")
print("\n=== Building Index with OpenAI Embeddings ===")
print(f"\n=== Building Index with OpenAI Embeddings ===")
print(f"Index path: {INDEX_PATH}")
try:
# Use proper configuration for OpenAI embeddings
builder = LeannBuilder(
@@ -51,63 +49,60 @@ def main():
embedding_model="text-embedding-3-small",
embedding_mode="openai",
# HNSW settings for OpenAI embeddings
M=16, # Smaller graph degree
efConstruction=64, # Smaller construction complexity
is_compact=True, # Enable compact storage for recompute
is_recompute=True, # MUST enable for OpenAI embeddings
M=16, # Smaller graph degree
efConstruction=64, # Smaller construction complexity
is_compact=True, # Enable compact storage for recompute
is_recompute=True, # MUST enable for OpenAI embeddings
num_threads=1,
)
print(f"Adding {len(sample_texts)} texts to the index...")
for i, text in enumerate(sample_texts):
metadata = {"id": f"doc_{i}", "topic": "AI"}
builder.add_text(text, metadata)
print("Building index...")
builder.build_index(INDEX_PATH)
print("✅ Index built successfully!")
print(f"✅ Index built successfully!")
except Exception as e:
print(f"❌ Error building index: {e}")
import traceback
traceback.print_exc()
return False
print("\n=== Testing Search ===")
print(f"\n=== Testing Search ===")
try:
searcher = LeannSearcher(INDEX_PATH)
test_queries = [
"What is machine learning?",
"How do neural networks work?",
"Programming languages for data science",
"Programming languages for data science"
]
for query in test_queries:
print(f"\n🔍 Query: '{query}'")
results = searcher.search(query, top_k=3)
print(f" Found {len(results)} results:")
for i, result in enumerate(results):
print(f" {i + 1}. Score: {result.score:.4f}")
print(f" {i+1}. Score: {result.score:.4f}")
print(f" Text: {result.text[:80]}...")
print("\n✅ Search test completed successfully!")
print(f"\n✅ Search test completed successfully!")
return True
except Exception as e:
print(f"❌ Error during search: {e}")
import traceback
traceback.print_exc()
return False
if __name__ == "__main__":
success = main()
if success:
print("\n🎉 Simple OpenAI index test completed successfully!")
print(f"\n🎉 Simple OpenAI index test completed successfully!")
else:
print("\n💥 Simple OpenAI index test failed!")
print(f"\n💥 Simple OpenAI index test failed!")
@@ -1,23 +1,18 @@
import asyncio
from pathlib import Path
from leann.api import LeannChat
from pathlib import Path
INDEX_DIR = Path("./test_pdf_index_huawei")
INDEX_PATH = str(INDEX_DIR / "pdf_documents.leann")
async def main():
print("\n[PHASE 2] Starting Leann chat session...")
print(f"\n[PHASE 2] Starting Leann chat session...")
chat = LeannChat(index_path=INDEX_PATH)
query = "What is the main idea of RL and give me 5 exapmle of classic RL algorithms?"
query = "Based on the paper, what are the main techniques LEANN explores to reduce the storage overhead and DLPM explore to achieve Fairness and Efiiciency trade-off?"
# query = "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发"
response = chat.ask(
query, top_k=20, recompute_beighbor_embeddings=True, complexity=32, beam_width=1
)
# query = "什么是盘古大模型以及盘古开发过程中遇到了什么阴暗面,任务令一般在什么城市颁发"
response = chat.ask(query,top_k=20,recompute_beighbor_embeddings=True,complexity=32,beam_width=1)
print(f"\n[PHASE 2] Response: {response}")
if __name__ == "__main__":
asyncio.run(main())
asyncio.run(main())
@@ -5,21 +5,24 @@ It correctly compares results by fetching the text content for both the new sear
results and the golden standard results, making the comparison robust to ID changes.
"""
import argparse
import json
import sys
import argparse
import time
from pathlib import Path
import sys
import numpy as np
from leann.api import LeannBuilder, LeannSearcher
from typing import List
from leann.api import LeannSearcher, LeannBuilder
def download_data_if_needed(data_root: Path, download_embeddings: bool = False):
"""Checks if the data directory exists, and if not, downloads it from HF Hub."""
if not data_root.exists():
print(f"Data directory '{data_root}' not found.")
print("Downloading evaluation data from Hugging Face Hub... (this may take a moment)")
print(
"Downloading evaluation data from Hugging Face Hub... (this may take a moment)"
)
try:
from huggingface_hub import snapshot_download
@@ -60,7 +63,7 @@ def download_data_if_needed(data_root: Path, download_embeddings: bool = False):
sys.exit(1)
def download_embeddings_if_needed(data_root: Path, dataset_type: str | None = None):
def download_embeddings_if_needed(data_root: Path, dataset_type: str = None):
"""Download embeddings files specifically."""
embeddings_dir = data_root / "embeddings"
@@ -98,7 +101,7 @@ def download_embeddings_if_needed(data_root: Path, dataset_type: str | None = No
# --- Helper Function to get Golden Passages ---
def get_golden_texts(searcher: LeannSearcher, golden_ids: list[int]) -> set:
def get_golden_texts(searcher: LeannSearcher, golden_ids: List[int]) -> set:
"""
Retrieves the text for golden passage IDs directly from the LeannSearcher's
passage manager.
@@ -110,20 +113,24 @@ def get_golden_texts(searcher: LeannSearcher, golden_ids: list[int]) -> set:
passage_data = searcher.passage_manager.get_passage(str(gid))
golden_texts.add(passage_data["text"])
except KeyError:
print(f"Warning: Golden passage ID '{gid}' not found in the index's passage data.")
print(
f"Warning: Golden passage ID '{gid}' not found in the index's passage data."
)
return golden_texts
def load_queries(file_path: Path) -> list[str]:
def load_queries(file_path: Path) -> List[str]:
queries = []
with open(file_path, encoding="utf-8") as f:
with open(file_path, "r", encoding="utf-8") as f:
for line in f:
data = json.loads(line)
queries.append(data["query"])
return queries
def build_index_from_embeddings(embeddings_file: str, output_path: str, backend: str = "hnsw"):
def build_index_from_embeddings(
embeddings_file: str, output_path: str, backend: str = "hnsw"
):
"""
Build a LEANN index from pre-computed embeddings.
@@ -166,7 +173,9 @@ def build_index_from_embeddings(embeddings_file: str, output_path: str, backend:
def main():
parser = argparse.ArgumentParser(description="Run recall evaluation on a LEANN index.")
parser = argparse.ArgumentParser(
description="Run recall evaluation on a LEANN index."
)
parser.add_argument(
"index_path",
type=str,
@@ -193,7 +202,9 @@ def main():
parser.add_argument(
"--num-queries", type=int, default=10, help="Number of queries to evaluate."
)
parser.add_argument("--top-k", type=int, default=3, help="The 'k' value for recall@k.")
parser.add_argument(
"--top-k", type=int, default=3, help="The 'k' value for recall@k."
)
parser.add_argument(
"--ef-search", type=int, default=120, help="The 'efSearch' parameter for HNSW."
)
@@ -208,7 +219,9 @@ def main():
# Download data based on mode
if args.mode == "build":
# For building mode, we need embeddings
download_data_if_needed(data_root, download_embeddings=False) # Basic data first
download_data_if_needed(
data_root, download_embeddings=False
) # Basic data first
# Auto-detect dataset type and download embeddings
if args.embeddings_file:
@@ -249,7 +262,9 @@ def main():
print(f"Index built successfully: {built_index_path}")
# Ask if user wants to run evaluation
eval_response = input("Run evaluation on the built index? (y/n): ").strip().lower()
eval_response = (
input("Run evaluation on the built index? (y/n): ").strip().lower()
)
if eval_response != "y":
print("Index building complete. Exiting.")
return
@@ -278,8 +293,12 @@ def main():
break
if not args.index_path:
print("No indices found. The data download should have included pre-built indices.")
print("Please check the data/indices/ directory or provide --index-path manually.")
print(
"No indices found. The data download should have included pre-built indices."
)
print(
"Please check the data/indices/ directory or provide --index-path manually."
)
sys.exit(1)
# Detect dataset type from index path to select the correct ground truth
@@ -291,10 +310,14 @@ def main():
else:
# Fallback: try to infer from the index directory name
dataset_type = Path(args.index_path).name
print(f"WARNING: Could not detect dataset type from path, inferred '{dataset_type}'.")
print(
f"WARNING: Could not detect dataset type from path, inferred '{dataset_type}'."
)
queries_file = data_root / "queries" / "nq_open.jsonl"
golden_results_file = data_root / "ground_truth" / dataset_type / "flat_results_nq_k3.json"
golden_results_file = (
data_root / "ground_truth" / dataset_type / "flat_results_nq_k3.json"
)
print(f"INFO: Detected dataset type: {dataset_type}")
print(f"INFO: Using queries file: {queries_file}")
@@ -304,7 +327,7 @@ def main():
searcher = LeannSearcher(args.index_path)
queries = load_queries(queries_file)
with open(golden_results_file) as f:
with open(golden_results_file, "r") as f:
golden_results_data = json.load(f)
num_eval_queries = min(args.num_queries, len(queries))
@@ -316,7 +339,9 @@ def main():
for i in range(num_eval_queries):
start_time = time.time()
new_results = searcher.search(queries[i], top_k=args.top_k, ef=args.ef_search)
new_results = searcher.search(
queries[i], top_k=args.top_k, ef=args.ef_search
)
search_times.append(time.time() - start_time)
# Correct Recall Calculation: Based on TEXT content
@@ -4,25 +4,18 @@ Run: uv run python examples/simple_demo.py
"""
import argparse
from leann import LeannBuilder, LeannChat, LeannSearcher
from leann import LeannBuilder, LeannSearcher, LeannChat
def main():
parser = argparse.ArgumentParser(
description="Simple demo of Leann with selectable embedding models."
)
parser.add_argument(
"--embedding_model",
type=str,
default="sentence-transformers/all-mpnet-base-v2",
help="The embedding model to use, e.g., 'sentence-transformers/all-mpnet-base-v2' or 'text-embedding-ada-002'.",
)
parser = argparse.ArgumentParser(description="Simple demo of Leann with selectable embedding models.")
parser.add_argument("--embedding_model", type=str, default="sentence-transformers/all-mpnet-base-v2",
help="The embedding model to use, e.g., 'sentence-transformers/all-mpnet-base-v2' or 'text-embedding-ada-002'.")
args = parser.parse_args()
print(f"=== Leann Simple Demo with {args.embedding_model} ===")
print()
# Sample knowledge base
chunks = [
"Machine learning is a subset of artificial intelligence that enables computers to learn without being explicitly programmed.",
@@ -34,7 +27,7 @@ def main():
"Big data refers to extremely large datasets that require special tools and techniques to process.",
"Cloud computing provides on-demand access to computing resources over the internet.",
]
print("1. Building index (no embeddings stored)...")
builder = LeannBuilder(
embedding_model=args.embedding_model,
@@ -44,45 +37,45 @@ def main():
builder.add_text(chunk)
builder.build_index("demo_knowledge.leann")
print()
print("2. Searching with real-time embeddings...")
searcher = LeannSearcher("demo_knowledge.leann")
queries = [
"What is machine learning?",
"How does neural network work?",
"How does neural network work?",
"Tell me about data processing",
]
for query in queries:
print(f"Query: {query}")
results = searcher.search(query, top_k=2)
for i, result in enumerate(results, 1):
print(f" {i}. Score: {result.score:.3f}")
print(f" Text: {result.text[:100]}...")
print()
print("3. Interactive chat demo:")
print(" (Note: Requires OpenAI API key for real responses)")
chat = LeannChat("demo_knowledge.leann")
# Demo questions
demo_questions: list[str] = [
"What is the difference between machine learning and deep learning?",
"How is data science related to big data?",
]
for question in demo_questions:
print(f" Q: {question}")
response = chat.ask(question)
print(f" A: {response}")
print()
print("Demo completed! Try running:")
print(" uv run python examples/document_search.py")
if __name__ == "__main__":
main()
main()
@@ -1,11 +1,13 @@
import argparse
import asyncio
import os
from pathlib import Path
import asyncio
import dotenv
from leann.api import LeannBuilder, LeannChat
import argparse
from pathlib import Path
from typing import List, Any, Optional
from leann.api import LeannBuilder, LeannSearcher, LeannChat
from llama_index.core.node_parser import SentenceSplitter
import requests
import time
dotenv.load_dotenv()
@@ -14,7 +16,7 @@ DEFAULT_WECHAT_EXPORT_DIR = "./wechat_export_direct"
def create_leann_index_from_multiple_wechat_exports(
export_dirs: list[Path],
export_dirs: List[Path],
index_path: str = "wechat_history_index.leann",
max_count: int = -1,
):
@@ -36,13 +38,15 @@ def create_leann_index_from_multiple_wechat_exports(
INDEX_DIR = Path(index_path).parent
if not INDEX_DIR.exists():
print("--- Index directory not found, building new index ---")
print(f"--- Index directory not found, building new index ---")
all_documents = []
total_processed = 0
# Process each WeChat export directory
for i, export_dir in enumerate(export_dirs):
print(f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}")
print(
f"\nProcessing WeChat export {i + 1}/{len(export_dirs)}: {export_dir}"
)
try:
documents = reader.load_data(
@@ -82,12 +86,7 @@ def create_leann_index_from_multiple_wechat_exports(
# Split the document into chunks
nodes = text_splitter.get_nodes_from_documents([doc])
for node in nodes:
text = (
"[Contact] means the message is from: "
+ doc.metadata["contact_name"]
+ "\n"
+ node.get_content()
)
text = '[Contact] means the message is from: ' + doc.metadata["contact_name"] + '\n' + node.get_content()
all_texts.append(text)
print(
@@ -95,12 +94,12 @@ def create_leann_index_from_multiple_wechat_exports(
)
# Create LEANN index directory
print("--- Index directory not found, building new index ---")
print(f"--- Index directory not found, building new index ---")
INDEX_DIR.mkdir(exist_ok=True)
print("--- Building new LEANN index ---")
print(f"--- Building new LEANN index ---")
print("\n[PHASE 1] Building Leann index...")
print(f"\n[PHASE 1] Building Leann index...")
# Use HNSW backend for better macOS compatibility
builder = LeannBuilder(
@@ -126,7 +125,7 @@ def create_leann_index_from_multiple_wechat_exports(
def create_leann_index(
export_dir: str | None = None,
export_dir: str = None,
index_path: str = "wechat_history_index.leann",
max_count: int = 1000,
):
@@ -142,12 +141,12 @@ def create_leann_index(
INDEX_DIR = Path(index_path).parent
if not INDEX_DIR.exists():
print("--- Index directory not found, building new index ---")
print(f"--- Index directory not found, building new index ---")
INDEX_DIR.mkdir(exist_ok=True)
print("--- Building new LEANN index ---")
print(f"--- Building new LEANN index ---")
print("\n[PHASE 1] Building Leann index...")
print(f"\n[PHASE 1] Building Leann index...")
# Load documents using WeChatHistoryReader from history_data
from history_data.wechat_history import WeChatHistoryReader
@@ -180,12 +179,12 @@ def create_leann_index(
print(f"Created {len(all_texts)} text chunks from {len(documents)} documents")
# Create LEANN index directory
print("--- Index directory not found, building new index ---")
print(f"--- Index directory not found, building new index ---")
INDEX_DIR.mkdir(exist_ok=True)
print("--- Building new LEANN index ---")
print(f"--- Building new LEANN index ---")
print("\n[PHASE 1] Building Leann index...")
print(f"\n[PHASE 1] Building Leann index...")
# Use HNSW backend for better macOS compatibility
builder = LeannBuilder(
@@ -218,7 +217,7 @@ async def query_leann_index(index_path: str, query: str):
index_path: Path to the LEANN index
query: The query string
"""
print("\n[PHASE 2] Starting Leann chat session...")
print(f"\n[PHASE 2] Starting Leann chat session...")
chat = LeannChat(index_path=index_path)
print(f"You: {query}")
@@ -308,7 +307,7 @@ async def main():
else:
# Example queries
queries = [
"我想买魔术师约翰逊的球衣,给我一些对应聊天记录?",
"我想买魔术师约翰逊的球衣,给我一些对应聊天记录?",
]
for query in queries:
@@ -3,6 +3,34 @@
cmake_minimum_required(VERSION 3.20)
project(leann_backend_diskann_wrapper)
# Find Python - scikit-build-core should provide this
find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
# Print Python information for debugging
message(STATUS "Python_FOUND: ${Python_FOUND}")
message(STATUS "Python_VERSION: ${Python_VERSION}")
message(STATUS "Python_EXECUTABLE: ${Python_EXECUTABLE}")
message(STATUS "Python_INCLUDE_DIRS: ${Python_INCLUDE_DIRS}")
# Pass Python information to DiskANN through cache variables
set(Python_EXECUTABLE ${Python_EXECUTABLE} CACHE FILEPATH "Python executable" FORCE)
set(Python_INCLUDE_DIRS ${Python_INCLUDE_DIRS} CACHE PATH "Python include dirs" FORCE)
set(Python_LIBRARIES ${Python_LIBRARIES} CACHE FILEPATH "Python libraries" FORCE)
set(Python_VERSION ${Python_VERSION} CACHE STRING "Python version" FORCE)
set(Python_FOUND ${Python_FOUND} CACHE BOOL "Python found" FORCE)
# Also set Python3 variables for compatibility
set(Python3_EXECUTABLE ${Python_EXECUTABLE} CACHE FILEPATH "Python3 executable" FORCE)
set(Python3_INCLUDE_DIRS ${Python_INCLUDE_DIRS} CACHE PATH "Python3 include dirs" FORCE)
set(Python3_LIBRARIES ${Python_LIBRARIES} CACHE FILEPATH "Python3 libraries" FORCE)
set(Python3_VERSION ${Python_VERSION} CACHE STRING "Python3 version" FORCE)
set(Python3_FOUND ${Python_FOUND} CACHE BOOL "Python3 found" FORCE)
set(Python3_Development_FOUND TRUE CACHE BOOL "Python3 development found" FORCE)
# Set Python finding strategy
set(Python_FIND_VIRTUALENV ONLY CACHE STRING "" FORCE)
set(Python3_FIND_VIRTUALENV ONLY CACHE STRING "" FORCE)
# Tell CMake to directly enter the DiskANN submodule and execute its own CMakeLists.txt
# DiskANN will handle everything itself, including compiling Python bindings
add_subdirectory(src/third_party/DiskANN)
add_subdirectory(third_party/DiskANN)
@@ -1 +1 @@
# This file makes the directory a Python package
# This file makes the directory a Python package
@@ -1 +1 @@
from . import diskann_backend as diskann_backend
from . import diskann_backend
@@ -1,19 +1,20 @@
import contextlib
import logging
import numpy as np
import os
import struct
import sys
from pathlib import Path
from typing import Any, Literal
from typing import Dict, Any, List, Literal, Optional
import contextlib
import numpy as np
import logging
from leann.searcher_base import BaseSearcher
from leann.registry import register_backend
from leann.interface import (
LeannBackendBuilderInterface,
LeannBackendFactoryInterface,
LeannBackendBuilderInterface,
LeannBackendSearcherInterface,
)
from leann.registry import register_backend
from leann.searcher_base import BaseSearcher
logger = logging.getLogger(__name__)
@@ -99,7 +100,7 @@ class DiskannBuilder(LeannBackendBuilderInterface):
def __init__(self, **kwargs):
self.build_params = kwargs
def build(self, data: np.ndarray, ids: list[str], index_path: str, **kwargs):
def build(self, data: np.ndarray, ids: List[str], index_path: str, **kwargs):
path = Path(index_path)
index_dir = path.parent
index_prefix = path.stem
@@ -185,11 +186,11 @@ class DiskannSearcher(BaseSearcher):
prune_ratio: float = 0.0,
recompute_embeddings: bool = False,
pruning_strategy: Literal["global", "local", "proportional"] = "global",
zmq_port: int | None = None,
zmq_port: Optional[int] = None,
batch_recompute: bool = False,
dedup_node_dis: bool = False,
**kwargs,
) -> dict[str, Any]:
) -> Dict[str, Any]:
"""
Search for nearest neighbors using DiskANN index.
@@ -215,10 +216,14 @@ class DiskannSearcher(BaseSearcher):
# Handle zmq_port compatibility: DiskANN can now update port at runtime
if recompute_embeddings:
if zmq_port is None:
raise ValueError("zmq_port must be provided if recompute_embeddings is True")
raise ValueError(
"zmq_port must be provided if recompute_embeddings is True"
)
current_port = self._index.get_zmq_port()
if zmq_port != current_port:
logger.debug(f"Updating DiskANN zmq_port from {current_port} to {zmq_port}")
logger.debug(
f"Updating DiskANN zmq_port from {current_port} to {zmq_port}"
)
self._index.set_zmq_port(zmq_port)
# DiskANN doesn't support "proportional" strategy
@@ -254,6 +259,8 @@ class DiskannSearcher(BaseSearcher):
use_global_pruning,
)
string_labels = [[str(int_label) for int_label in batch_labels] for batch_labels in labels]
string_labels = [
[str(int_label) for int_label in batch_labels] for batch_labels in labels
]
return {"labels": string_labels, "distances": distances}
@@ -3,16 +3,16 @@ DiskANN-specific embedding server
"""
import argparse
import json
import logging
import os
import sys
import threading
import time
from pathlib import Path
import numpy as np
import os
import zmq
import numpy as np
import json
from pathlib import Path
from typing import Optional
import sys
import logging
# Set up logging based on environment variable
LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
@@ -32,7 +32,7 @@ if not logger.handlers:
def create_diskann_embedding_server(
passages_file: str | None = None,
passages_file: Optional[str] = None,
zmq_port: int = 5555,
model_name: str = "sentence-transformers/all-mpnet-base-v2",
embedding_mode: str = "sentence-transformers",
@@ -50,8 +50,8 @@ def create_diskann_embedding_server(
sys.path.insert(0, str(leann_core_path))
try:
from leann.api import PassageManager
from leann.embedding_compute import compute_embeddings
from leann.api import PassageManager
logger.info("Successfully imported unified embedding computation module")
except ImportError as e:
@@ -76,7 +76,7 @@ def create_diskann_embedding_server(
raise ValueError("Only metadata files (.meta.json) are supported")
# Load metadata to get passage sources
with open(passages_file) as f:
with open(passages_file, "r") as f:
meta = json.load(f)
passages = PassageManager(meta["passage_sources"])
@@ -150,7 +150,9 @@ def create_diskann_embedding_server(
):
texts = request
is_text_request = True
logger.info(f"✅ MSGPACK: Direct text request for {len(texts)} texts")
logger.info(
f"✅ MSGPACK: Direct text request for {len(texts)} texts"
)
else:
raise ValueError("Not a valid msgpack text request")
except Exception as msgpack_error:
@@ -165,7 +167,9 @@ def create_diskann_embedding_server(
passage_data = passages.get_passage(str(nid))
txt = passage_data["text"]
if not txt:
raise RuntimeError(f"FATAL: Empty text for passage ID {nid}")
raise RuntimeError(
f"FATAL: Empty text for passage ID {nid}"
)
texts.append(txt)
except KeyError as e:
logger.error(f"Passage ID {nid} not found: {e}")
@@ -176,7 +180,9 @@ def create_diskann_embedding_server(
# Debug logging
logger.debug(f"Processing {len(texts)} texts")
logger.debug(f"Text lengths: {[len(t) for t in texts[:5]]}") # Show first 5
logger.debug(
f"Text lengths: {[len(t) for t in texts[:5]]}"
) # Show first 5
# Process embeddings using unified computation
embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
@@ -193,7 +199,9 @@ def create_diskann_embedding_server(
else:
# For DiskANN C++ compatibility: return protobuf format
resp_proto = embedding_pb2.NodeEmbeddingResponse()
hidden_contiguous = np.ascontiguousarray(embeddings, dtype=np.float32)
hidden_contiguous = np.ascontiguousarray(
embeddings, dtype=np.float32
)
# Serialize embeddings data
resp_proto.embeddings_data = hidden_contiguous.tobytes()
@@ -1,28 +1,27 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: embedding.proto
# ruff: noqa
"""Generated protocol buffer code."""
from google.protobuf.internal import builder as _builder
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x0f\x65mbedding.proto\x12\x0eprotoembedding"(\n\x14NodeEmbeddingRequest\x12\x10\n\x08node_ids\x18\x01 \x03(\r"Y\n\x15NodeEmbeddingResponse\x12\x17\n\x0f\x65mbeddings_data\x18\x01 \x01(\x0c\x12\x12\n\ndimensions\x18\x02 \x03(\x05\x12\x13\n\x0bmissing_ids\x18\x03 \x03(\rb\x06proto3'
)
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x0f\x65mbedding.proto\x12\x0eprotoembedding\"(\n\x14NodeEmbeddingRequest\x12\x10\n\x08node_ids\x18\x01 \x03(\r\"Y\n\x15NodeEmbeddingResponse\x12\x17\n\x0f\x65mbeddings_data\x18\x01 \x01(\x0c\x12\x12\n\ndimensions\x18\x02 \x03(\x05\x12\x13\n\x0bmissing_ids\x18\x03 \x03(\rb\x06proto3')
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "embedding_pb2", globals())
if not _descriptor._USE_C_DESCRIPTORS:
DESCRIPTOR._options = None
_NODEEMBEDDINGREQUEST._serialized_start = 35
_NODEEMBEDDINGREQUEST._serialized_end = 75
_NODEEMBEDDINGRESPONSE._serialized_start = 77
_NODEEMBEDDINGRESPONSE._serialized_end = 166
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'embedding_pb2', globals())
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_NODEEMBEDDINGREQUEST._serialized_start=35
_NODEEMBEDDINGREQUEST._serialized_end=75
_NODEEMBEDDINGRESPONSE._serialized_start=77
_NODEEMBEDDINGRESPONSE._serialized_end=166
# @@protoc_insertion_point(module_scope)
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-diskann"
version = "0.1.14"
dependencies = ["leann-core==0.1.14", "numpy", "protobuf>=3.19.0"]
version = "0.1.8"
dependencies = ["leann-core==0.1.8", "numpy"]
[tool.scikit-build]
# Key: simplified CMake path
@@ -16,4 +16,9 @@ wheel.packages = ["leann_backend_diskann"]
editable.mode = "redirect"
cmake.build-type = "Release"
build.verbose = true
build.tool-args = ["-j8"]
build.tool-args = ["-j8"]
wheel.exclude = ["CMakeLists.txt", "src", "third_party/**", "*.o", "*.so"]
sdist.include = ["CMakeLists.txt", "src", "third_party", "leann_backend_diskann/*.txt"]
[tool.scikit-build.cmake.define]
CMAKE_BUILD_PARALLEL_LEVEL = "8"
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: af2a26481e...81666b7758
@@ -24,6 +24,38 @@ set(MSGPACK_USE_BOOST OFF CACHE BOOL "" FORCE)
add_compile_definitions(MSGPACK_NO_BOOST)
include_directories(third_party/msgpack-c/include)
# Find Python for our own use (not for Faiss)
if(DEFINED SKBUILD)
message(STATUS "Building with scikit-build")
# scikit-build-core provides Python information
endif()
# Find Python - scikit-build-core should provide this
find_package(Python REQUIRED COMPONENTS Interpreter Development.Module NumPy)
# Print Python information for debugging
message(STATUS "Python_FOUND: ${Python_FOUND}")
message(STATUS "Python_VERSION: ${Python_VERSION}")
message(STATUS "Python_EXECUTABLE: ${Python_EXECUTABLE}")
message(STATUS "Python_INCLUDE_DIRS: ${Python_INCLUDE_DIRS}")
message(STATUS "Python_NumPy_INCLUDE_DIRS: ${Python_NumPy_INCLUDE_DIRS}")
# Pass Python information to faiss through cache variables
set(Python_EXECUTABLE ${Python_EXECUTABLE} CACHE FILEPATH "Python executable" FORCE)
set(Python_INCLUDE_DIRS ${Python_INCLUDE_DIRS} CACHE PATH "Python include dirs" FORCE)
set(Python_NumPy_INCLUDE_DIRS ${Python_NumPy_INCLUDE_DIRS} CACHE PATH "NumPy include dirs" FORCE)
set(Python_VERSION ${Python_VERSION} CACHE STRING "Python version" FORCE)
set(Python_FOUND ${Python_FOUND} CACHE BOOL "Python found" FORCE)
# Also set Python3 variables for compatibility
set(Python3_EXECUTABLE ${Python_EXECUTABLE} CACHE FILEPATH "Python3 executable" FORCE)
set(Python3_INCLUDE_DIRS ${Python_INCLUDE_DIRS} CACHE PATH "Python3 include dirs" FORCE)
set(Python3_NumPy_INCLUDE_DIRS ${Python_NumPy_INCLUDE_DIRS} CACHE PATH "NumPy include dirs" FORCE)
set(Python3_VERSION ${Python_VERSION} CACHE STRING "Python3 version" FORCE)
set(Python3_FOUND ${Python_FOUND} CACHE BOOL "Python3 found" FORCE)
set(Python3_Development_FOUND TRUE CACHE BOOL "Python3 development found" FORCE)
set(Python3_NumPy_FOUND TRUE CACHE BOOL "Python3 NumPy found" FORCE)
# Faiss configuration - streamlined build
set(FAISS_ENABLE_PYTHON ON CACHE BOOL "" FORCE)
set(FAISS_ENABLE_GPU OFF CACHE BOOL "" FORCE)
@@ -52,4 +84,8 @@ set(FAISS_BUILD_AVX512 OFF CACHE BOOL "" FORCE)
# IMPORTANT: Disable building AVX versions to speed up compilation
set(FAISS_BUILD_AVX_VERSIONS OFF CACHE BOOL "" FORCE)
# Force Faiss to use our Python settings
set(Python_FIND_VIRTUALENV ONLY CACHE STRING "" FORCE)
set(Python3_FIND_VIRTUALENV ONLY CACHE STRING "" FORCE)
add_subdirectory(third_party/faiss)
@@ -1 +1 @@
from . import hnsw_backend as hnsw_backend
from . import hnsw_backend
@@ -1,111 +1,87 @@
|
||||
import argparse
|
||||
import gc # Import garbage collector interface
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import gc # Import garbage collector interface
|
||||
import time
|
||||
# --- FourCCs (add more if needed) ---
|
||||
INDEX_HNSW_FLAT_FOURCC = int.from_bytes(b"IHNf", "little")
|
||||
INDEX_HNSW_FLAT_FOURCC = int.from_bytes(b'IHNf', 'little')
|
||||
# Add other HNSW fourccs if you expect different storage types inside HNSW
|
||||
# INDEX_HNSW_PQ_FOURCC = int.from_bytes(b'IHNp', 'little')
|
||||
# INDEX_HNSW_SQ_FOURCC = int.from_bytes(b'IHNs', 'little')
|
||||
# INDEX_HNSW_CAGRA_FOURCC = int.from_bytes(b'IHNc', 'little') # Example
|
||||
|
||||
EXPECTED_HNSW_FOURCCS = {INDEX_HNSW_FLAT_FOURCC} # Modify if needed
|
||||
NULL_INDEX_FOURCC = int.from_bytes(b"null", "little")
|
||||
EXPECTED_HNSW_FOURCCS = {INDEX_HNSW_FLAT_FOURCC} # Modify if needed
|
||||
NULL_INDEX_FOURCC = int.from_bytes(b'null', 'little')
|
||||
|
||||
# --- Helper functions for reading/writing binary data ---
|
||||
|
||||
|
||||
def read_struct(f, fmt):
|
||||
"""Reads data according to the struct format."""
|
||||
size = struct.calcsize(fmt)
|
||||
data = f.read(size)
|
||||
if len(data) != size:
|
||||
raise EOFError(
|
||||
f"File ended unexpectedly reading struct fmt '{fmt}'. Expected {size} bytes, got {len(data)}."
|
||||
)
|
||||
raise EOFError(f"File ended unexpectedly reading struct fmt '{fmt}'. Expected {size} bytes, got {len(data)}.")
|
||||
return struct.unpack(fmt, data)[0]
|
||||
|
||||
|
||||
def read_vector_raw(f, element_fmt_char):
|
||||
"""Reads a vector (size followed by data), returns count and raw bytes."""
|
||||
count = -1 # Initialize count
|
||||
total_bytes = -1 # Initialize total_bytes
|
||||
count = -1 # Initialize count
|
||||
total_bytes = -1 # Initialize total_bytes
|
||||
try:
|
||||
count = read_struct(f, "<Q") # size_t usually 64-bit unsigned
|
||||
count = read_struct(f, '<Q') # size_t usually 64-bit unsigned
|
||||
element_size = struct.calcsize(element_fmt_char)
|
||||
# --- FIX for MemoryError: Check for unreasonably large count ---
|
||||
max_reasonable_count = 10 * (10**9) # ~10 billion elements limit
|
||||
max_reasonable_count = 10 * (10**9) # ~10 billion elements limit
|
||||
if count > max_reasonable_count or count < 0:
|
||||
raise MemoryError(
|
||||
f"Vector count {count} seems unreasonably large, possibly due to file corruption or incorrect format read."
|
||||
)
|
||||
raise MemoryError(f"Vector count {count} seems unreasonably large, possibly due to file corruption or incorrect format read.")
|
||||
|
||||
total_bytes = count * element_size
|
||||
# --- FIX for MemoryError: Check for huge byte size before allocation ---
|
||||
max_reasonable_bytes = 50 * (1024**3) # ~50 GB limit
|
||||
if total_bytes > max_reasonable_bytes or total_bytes < 0: # Check for overflow
|
||||
raise MemoryError(
|
||||
f"Attempting to read {total_bytes} bytes ({count} elements * {element_size} bytes/element), which exceeds the safety limit. File might be corrupted or format mismatch."
|
||||
)
|
||||
max_reasonable_bytes = 50 * (1024**3) # ~50 GB limit
|
||||
if total_bytes > max_reasonable_bytes or total_bytes < 0: # Check for overflow
|
||||
raise MemoryError(f"Attempting to read {total_bytes} bytes ({count} elements * {element_size} bytes/element), which exceeds the safety limit. File might be corrupted or format mismatch.")
|
||||
|
||||
data_bytes = f.read(total_bytes)
|
||||
|
||||
if len(data_bytes) != total_bytes:
|
||||
raise EOFError(
|
||||
f"File ended unexpectedly reading vector data. Expected {total_bytes} bytes, got {len(data_bytes)}."
|
||||
)
|
||||
raise EOFError(f"File ended unexpectedly reading vector data. Expected {total_bytes} bytes, got {len(data_bytes)}.")
|
||||
return count, data_bytes
|
||||
except (MemoryError, OverflowError) as e:
|
||||
# Add context to the error message
|
||||
print(
|
||||
f"\nError during raw vector read (element_fmt='{element_fmt_char}', count={count}, total_bytes={total_bytes}): {e}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
raise e # Re-raise the original error type
|
||||
|
||||
# Add context to the error message
|
||||
print(f"\nError during raw vector read (element_fmt='{element_fmt_char}', count={count}, total_bytes={total_bytes}): {e}", file=sys.stderr)
|
||||
raise e # Re-raise the original error type
|
||||
|
||||
def read_numpy_vector(f, np_dtype, struct_fmt_char):
|
||||
"""Reads a vector into a NumPy array."""
|
||||
count = -1 # Initialize count for robust error handling
|
||||
print(f" Reading vector (dtype={np_dtype}, fmt='{struct_fmt_char}')... ", end="", flush=True)
|
||||
count = -1 # Initialize count for robust error handling
|
||||
print(f" Reading vector (dtype={np_dtype}, fmt='{struct_fmt_char}')... ", end='', flush=True)
|
||||
try:
|
||||
count, data_bytes = read_vector_raw(f, struct_fmt_char)
|
||||
print(f"Count={count}, Bytes={len(data_bytes)}")
|
||||
if count > 0 and len(data_bytes) > 0:
|
||||
arr = np.frombuffer(data_bytes, dtype=np_dtype)
|
||||
if arr.size != count:
|
||||
raise ValueError(
|
||||
f"Inconsistent array size after reading. Expected {count}, got {arr.size}"
|
||||
)
|
||||
raise ValueError(f"Inconsistent array size after reading. Expected {count}, got {arr.size}")
|
||||
return arr
|
||||
elif count == 0:
|
||||
return np.array([], dtype=np_dtype)
|
||||
return np.array([], dtype=np_dtype)
|
||||
else:
|
||||
raise ValueError("Read zero bytes but count > 0.")
|
||||
raise ValueError("Read zero bytes but count > 0.")
|
||||
except MemoryError as e:
|
||||
# Now count should be defined (or -1 if error was in read_struct)
|
||||
print(
|
||||
f"\nMemoryError creating NumPy array (dtype={np_dtype}, count={count}). {e}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
print(f"\nMemoryError creating NumPy array (dtype={np_dtype}, count={count}). {e}", file=sys.stderr)
|
||||
raise e
|
||||
except Exception as e: # Catch other potential errors like ValueError
|
||||
print(
|
||||
f"\nError reading numpy vector (dtype={np_dtype}, fmt='{struct_fmt_char}', count={count}): {e}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
except Exception as e: # Catch other potential errors like ValueError
|
||||
print(f"\nError reading numpy vector (dtype={np_dtype}, fmt='{struct_fmt_char}', count={count}): {e}", file=sys.stderr)
|
||||
raise e
|
||||
|
||||
|
||||
def write_numpy_vector(f, arr, struct_fmt_char):
|
||||
"""Writes a NumPy array as a vector (size followed by data)."""
|
||||
count = arr.size
|
||||
f.write(struct.pack("<Q", count))
|
||||
f.write(struct.pack('<Q', count))
|
||||
try:
|
||||
expected_dtype = np.dtype(struct_fmt_char)
|
||||
if arr.dtype != expected_dtype:
|
||||
@@ -113,30 +89,23 @@ def write_numpy_vector(f, arr, struct_fmt_char):
|
||||
else:
|
||||
data_to_write = arr.tobytes()
|
||||
f.write(data_to_write)
|
||||
del data_to_write # Hint GC
|
||||
del data_to_write # Hint GC
|
||||
except MemoryError as e:
|
||||
print(
|
||||
f"\nMemoryError converting NumPy array to bytes for writing (size={count}, dtype={arr.dtype}). {e}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
raise e
|
||||
|
||||
print(f"\nMemoryError converting NumPy array to bytes for writing (size={count}, dtype={arr.dtype}). {e}", file=sys.stderr)
|
||||
raise e
|
||||
|
||||
def write_list_vector(f, lst, struct_fmt_char):
|
||||
"""Writes a Python list as a vector iteratively."""
|
||||
count = len(lst)
|
||||
f.write(struct.pack("<Q", count))
|
||||
fmt = "<" + struct_fmt_char
|
||||
f.write(struct.pack('<Q', count))
|
||||
fmt = '<' + struct_fmt_char
|
||||
chunk_size = 1024 * 1024
|
||||
element_size = struct.calcsize(fmt)
|
||||
# Allocate buffer outside the loop if possible, or handle MemoryError during allocation
|
||||
try:
|
||||
buffer = bytearray(chunk_size * element_size)
|
||||
except MemoryError:
|
||||
print(
|
||||
f"MemoryError: Cannot allocate buffer for writing list vector chunk (size {chunk_size * element_size} bytes).",
|
||||
file=sys.stderr,
|
||||
)
|
||||
print(f"MemoryError: Cannot allocate buffer for writing list vector chunk (size {chunk_size * element_size} bytes).", file=sys.stderr)
|
||||
raise
|
||||
buffer_count = 0
|
||||
|
||||
@@ -147,80 +116,66 @@ def write_list_vector(f, lst, struct_fmt_char):
|
||||
buffer_count += 1
|
||||
|
||||
if buffer_count == chunk_size or i == count - 1:
|
||||
f.write(buffer[: buffer_count * element_size])
|
||||
f.write(buffer[:buffer_count * element_size])
|
||||
buffer_count = 0
|
||||
|
||||
except struct.error as e:
|
||||
print(
|
||||
f"\nStruct packing error for item {item} at index {i} with format '{fmt}'. {e}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
print(f"\nStruct packing error for item {item} at index {i} with format '{fmt}'. {e}", file=sys.stderr)
|
||||
raise e
|
||||
|
||||
|
||||
def get_cum_neighbors(cum_nneighbor_per_level_np, level):
|
||||
"""Helper to get cumulative neighbors count, matching C++ logic."""
|
||||
if level < 0:
|
||||
return 0
|
||||
if level < 0: return 0
|
||||
if level < len(cum_nneighbor_per_level_np):
|
||||
return cum_nneighbor_per_level_np[level]
|
||||
else:
|
||||
return cum_nneighbor_per_level_np[-1] if len(cum_nneighbor_per_level_np) > 0 else 0
|
||||
|
||||
|
||||
def write_compact_format(
|
||||
f_out,
|
||||
original_hnsw_data,
|
||||
assign_probas_np,
|
||||
cum_nneighbor_per_level_np,
|
||||
levels_np,
|
||||
compact_level_ptr,
|
||||
compact_node_offsets_np,
|
||||
compact_neighbors_data,
|
||||
storage_fourcc,
|
||||
storage_data,
|
||||
):
|
||||
def write_compact_format(f_out, original_hnsw_data, assign_probas_np, cum_nneighbor_per_level_np,
|
||||
levels_np, compact_level_ptr, compact_node_offsets_np,
|
||||
compact_neighbors_data, storage_fourcc, storage_data):
|
||||
"""Write HNSW data in compact format following C++ read order exactly."""
|
||||
# Write IndexHNSW Header
|
||||
f_out.write(struct.pack("<I", original_hnsw_data["index_fourcc"]))
|
||||
f_out.write(struct.pack("<i", original_hnsw_data["d"]))
|
||||
f_out.write(struct.pack("<q", original_hnsw_data["ntotal"]))
|
||||
f_out.write(struct.pack("<q", original_hnsw_data["dummy1"]))
|
||||
f_out.write(struct.pack("<q", original_hnsw_data["dummy2"]))
|
||||
f_out.write(struct.pack("<?", original_hnsw_data["is_trained"]))
|
||||
f_out.write(struct.pack("<i", original_hnsw_data["metric_type"]))
|
||||
if original_hnsw_data["metric_type"] > 1:
|
||||
f_out.write(struct.pack("<f", original_hnsw_data["metric_arg"]))
|
||||
f_out.write(struct.pack('<I', original_hnsw_data['index_fourcc']))
|
||||
f_out.write(struct.pack('<i', original_hnsw_data['d']))
|
||||
f_out.write(struct.pack('<q', original_hnsw_data['ntotal']))
|
||||
f_out.write(struct.pack('<q', original_hnsw_data['dummy1']))
|
||||
f_out.write(struct.pack('<q', original_hnsw_data['dummy2']))
|
||||
f_out.write(struct.pack('<?', original_hnsw_data['is_trained']))
|
||||
f_out.write(struct.pack('<i', original_hnsw_data['metric_type']))
|
||||
if original_hnsw_data['metric_type'] > 1:
|
||||
f_out.write(struct.pack('<f', original_hnsw_data['metric_arg']))
|
||||
|
||||
# Write HNSW struct parts (standard order)
|
||||
write_numpy_vector(f_out, assign_probas_np, "d")
|
||||
write_numpy_vector(f_out, cum_nneighbor_per_level_np, "i")
|
||||
write_numpy_vector(f_out, levels_np, "i")
|
||||
write_numpy_vector(f_out, assign_probas_np, 'd')
|
||||
write_numpy_vector(f_out, cum_nneighbor_per_level_np, 'i')
|
||||
write_numpy_vector(f_out, levels_np, 'i')
|
||||
|
||||
# Write compact format flag
|
||||
f_out.write(struct.pack("<?", True)) # storage_is_compact = True
|
||||
f_out.write(struct.pack('<?', True)) # storage_is_compact = True
|
||||
|
||||
# Write compact data in CORRECT C++ read order: level_ptr, node_offsets FIRST
|
||||
if isinstance(compact_level_ptr, np.ndarray):
|
||||
write_numpy_vector(f_out, compact_level_ptr, "Q")
|
||||
write_numpy_vector(f_out, compact_level_ptr, 'Q')
|
||||
else:
|
||||
write_list_vector(f_out, compact_level_ptr, "Q")
|
||||
|
||||
write_numpy_vector(f_out, compact_node_offsets_np, "Q")
|
||||
write_list_vector(f_out, compact_level_ptr, 'Q')
|
||||
|
||||
write_numpy_vector(f_out, compact_node_offsets_np, 'Q')
|
||||
|
||||
# Write HNSW scalar parameters
|
||||
f_out.write(struct.pack("<i", original_hnsw_data["entry_point"]))
|
||||
f_out.write(struct.pack("<i", original_hnsw_data["max_level"]))
|
||||
f_out.write(struct.pack("<i", original_hnsw_data["efConstruction"]))
|
||||
f_out.write(struct.pack("<i", original_hnsw_data["efSearch"]))
|
||||
f_out.write(struct.pack("<i", original_hnsw_data["dummy_upper_beam"]))
|
||||
f_out.write(struct.pack('<i', original_hnsw_data['entry_point']))
|
||||
f_out.write(struct.pack('<i', original_hnsw_data['max_level']))
|
||||
f_out.write(struct.pack('<i', original_hnsw_data['efConstruction']))
|
||||
f_out.write(struct.pack('<i', original_hnsw_data['efSearch']))
|
||||
f_out.write(struct.pack('<i', original_hnsw_data['dummy_upper_beam']))
|
||||
|
||||
# Write storage fourcc (this determines how to read what follows)
|
||||
f_out.write(struct.pack("<I", storage_fourcc))
|
||||
|
||||
f_out.write(struct.pack('<I', storage_fourcc))
|
||||
|
||||
# Write compact neighbors data AFTER storage fourcc
|
||||
write_list_vector(f_out, compact_neighbors_data, "i")
|
||||
|
||||
write_list_vector(f_out, compact_neighbors_data, 'i')
|
||||
|
||||
# Write storage data if not NULL (only after neighbors)
|
||||
if storage_fourcc != NULL_INDEX_FOURCC and storage_data:
|
||||
f_out.write(storage_data)
|
||||
@@ -228,12 +183,11 @@ def write_compact_format(
|
||||
|
||||
# --- Main Conversion Logic ---
|
||||
|
||||
|
||||
def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=True):
|
||||
"""
|
||||
Converts an HNSW graph file to the CSR format.
|
||||
Supports both original and already-compact formats (backward compatibility).
|
||||
|
||||
|
||||
Args:
|
||||
input_filename: Input HNSW index file
|
||||
output_filename: Output CSR index file
|
||||
@@ -242,228 +196,172 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=
|
||||
print(f"Starting conversion: {input_filename} -> {output_filename}")
|
||||
start_time = time.time()
|
||||
original_hnsw_data = {}
|
||||
neighbors_np = None # Initialize to allow check in finally block
|
||||
neighbors_np = None # Initialize to allow check in finally block
|
||||
try:
|
||||
with open(input_filename, "rb") as f_in, open(output_filename, "wb") as f_out:
|
||||
with open(input_filename, 'rb') as f_in, open(output_filename, 'wb') as f_out:
|
||||
|
||||
# --- Read IndexHNSW FourCC and Header ---
|
||||
print(f"[{time.time() - start_time:.2f}s] Reading Index HNSW header...")
|
||||
# ... (Keep the header reading logic as before) ...
|
||||
hnsw_index_fourcc = read_struct(f_in, "<I")
|
||||
hnsw_index_fourcc = read_struct(f_in, '<I')
|
||||
if hnsw_index_fourcc not in EXPECTED_HNSW_FOURCCS:
|
||||
print(
|
||||
f"Error: Expected HNSW Index FourCC ({list(EXPECTED_HNSW_FOURCCS)}), got {hnsw_index_fourcc:08x}.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return False
|
||||
original_hnsw_data["index_fourcc"] = hnsw_index_fourcc
|
||||
original_hnsw_data["d"] = read_struct(f_in, "<i")
|
||||
original_hnsw_data["ntotal"] = read_struct(f_in, "<q")
|
||||
original_hnsw_data["dummy1"] = read_struct(f_in, "<q")
|
||||
original_hnsw_data["dummy2"] = read_struct(f_in, "<q")
|
||||
original_hnsw_data["is_trained"] = read_struct(f_in, "?")
|
||||
original_hnsw_data["metric_type"] = read_struct(f_in, "<i")
|
||||
original_hnsw_data["metric_arg"] = 0.0
|
||||
if original_hnsw_data["metric_type"] > 1:
|
||||
original_hnsw_data["metric_arg"] = read_struct(f_in, "<f")
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Header read: d={original_hnsw_data['d']}, ntotal={original_hnsw_data['ntotal']}"
|
||||
)
|
||||
print(f"Error: Expected HNSW Index FourCC ({list(EXPECTED_HNSW_FOURCCS)}), got {hnsw_index_fourcc:08x}.", file=sys.stderr)
|
||||
return False
|
||||
original_hnsw_data['index_fourcc'] = hnsw_index_fourcc
|
||||
original_hnsw_data['d'] = read_struct(f_in, '<i')
|
||||
original_hnsw_data['ntotal'] = read_struct(f_in, '<q')
|
||||
original_hnsw_data['dummy1'] = read_struct(f_in, '<q')
|
||||
original_hnsw_data['dummy2'] = read_struct(f_in, '<q')
|
||||
original_hnsw_data['is_trained'] = read_struct(f_in, '?')
|
||||
original_hnsw_data['metric_type'] = read_struct(f_in, '<i')
|
||||
original_hnsw_data['metric_arg'] = 0.0
|
||||
if original_hnsw_data['metric_type'] > 1:
|
||||
original_hnsw_data['metric_arg'] = read_struct(f_in, '<f')
|
||||
print(f"[{time.time() - start_time:.2f}s] Header read: d={original_hnsw_data['d']}, ntotal={original_hnsw_data['ntotal']}")
|
||||
|
||||
|
||||
# --- Read original HNSW struct data ---
|
||||
print(f"[{time.time() - start_time:.2f}s] Reading HNSW struct vectors...")
|
||||
assign_probas_np = read_numpy_vector(f_in, np.float64, "d")
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Read assign_probas ({assign_probas_np.size})"
|
||||
)
|
||||
assign_probas_np = read_numpy_vector(f_in, np.float64, 'd')
|
||||
print(f"[{time.time() - start_time:.2f}s] Read assign_probas ({assign_probas_np.size})")
|
||||
gc.collect()
|
||||
|
||||
cum_nneighbor_per_level_np = read_numpy_vector(f_in, np.int32, "i")
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Read cum_nneighbor_per_level ({cum_nneighbor_per_level_np.size})"
|
||||
)
|
||||
cum_nneighbor_per_level_np = read_numpy_vector(f_in, np.int32, 'i')
|
||||
print(f"[{time.time() - start_time:.2f}s] Read cum_nneighbor_per_level ({cum_nneighbor_per_level_np.size})")
|
||||
gc.collect()
|
||||
|
||||
levels_np = read_numpy_vector(f_in, np.int32, "i")
|
||||
levels_np = read_numpy_vector(f_in, np.int32, 'i')
|
||||
print(f"[{time.time() - start_time:.2f}s] Read levels ({levels_np.size})")
|
||||
gc.collect()
|
||||
|
||||
ntotal = len(levels_np)
|
||||
if ntotal != original_hnsw_data["ntotal"]:
|
||||
print(
|
||||
f"Warning: ntotal mismatch! Header says {original_hnsw_data['ntotal']}, levels vector size is {ntotal}. Using levels vector size.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
original_hnsw_data["ntotal"] = ntotal
|
||||
if ntotal != original_hnsw_data['ntotal']:
|
||||
print(f"Warning: ntotal mismatch! Header says {original_hnsw_data['ntotal']}, levels vector size is {ntotal}. Using levels vector size.", file=sys.stderr)
|
||||
original_hnsw_data['ntotal'] = ntotal
|
||||
|
||||
# --- Check for compact format flag ---
|
||||
print(f"[{time.time() - start_time:.2f}s] Probing for compact storage flag...")
|
||||
pos_before_compact = f_in.tell()
|
||||
try:
|
||||
is_compact_flag = read_struct(f_in, "<?")
|
||||
is_compact_flag = read_struct(f_in, '<?')
|
||||
print(f"[{time.time() - start_time:.2f}s] Found compact flag: {is_compact_flag}")
|
||||
|
||||
|
||||
if is_compact_flag:
|
||||
# Input is already in compact format - read compact data
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Input is already in compact format, reading compact data..."
|
||||
)
|
||||
|
||||
compact_level_ptr = read_numpy_vector(f_in, np.uint64, "Q")
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Read compact_level_ptr ({compact_level_ptr.size})"
|
||||
)
|
||||
|
||||
compact_node_offsets_np = read_numpy_vector(f_in, np.uint64, "Q")
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Read compact_node_offsets ({compact_node_offsets_np.size})"
|
||||
)
|
||||
|
||||
print(f"[{time.time() - start_time:.2f}s] Input is already in compact format, reading compact data...")
|
||||
|
||||
compact_level_ptr = read_numpy_vector(f_in, np.uint64, 'Q')
|
||||
print(f"[{time.time() - start_time:.2f}s] Read compact_level_ptr ({compact_level_ptr.size})")
|
||||
|
||||
compact_node_offsets_np = read_numpy_vector(f_in, np.uint64, 'Q')
|
||||
print(f"[{time.time() - start_time:.2f}s] Read compact_node_offsets ({compact_node_offsets_np.size})")
|
||||
|
||||
# Read scalar parameters
|
||||
original_hnsw_data["entry_point"] = read_struct(f_in, "<i")
|
||||
original_hnsw_data["max_level"] = read_struct(f_in, "<i")
|
||||
original_hnsw_data["efConstruction"] = read_struct(f_in, "<i")
|
||||
original_hnsw_data["efSearch"] = read_struct(f_in, "<i")
|
||||
original_hnsw_data["dummy_upper_beam"] = read_struct(f_in, "<i")
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Read scalar params (ep={original_hnsw_data['entry_point']}, max_lvl={original_hnsw_data['max_level']})"
|
||||
)
|
||||
original_hnsw_data['entry_point'] = read_struct(f_in, '<i')
|
||||
original_hnsw_data['max_level'] = read_struct(f_in, '<i')
|
||||
original_hnsw_data['efConstruction'] = read_struct(f_in, '<i')
|
||||
original_hnsw_data['efSearch'] = read_struct(f_in, '<i')
|
||||
original_hnsw_data['dummy_upper_beam'] = read_struct(f_in, '<i')
|
||||
print(f"[{time.time() - start_time:.2f}s] Read scalar params (ep={original_hnsw_data['entry_point']}, max_lvl={original_hnsw_data['max_level']})")
|
||||
|
||||
# Read storage fourcc
|
||||
storage_fourcc = read_struct(f_in, "<I")
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Found storage fourcc: {storage_fourcc:08x}"
|
||||
)
|
||||
|
||||
storage_fourcc = read_struct(f_in, '<I')
|
||||
print(f"[{time.time() - start_time:.2f}s] Found storage fourcc: {storage_fourcc:08x}")
|
||||
|
||||
if prune_embeddings and storage_fourcc != NULL_INDEX_FOURCC:
|
||||
# Read compact neighbors data
|
||||
compact_neighbors_data_np = read_numpy_vector(f_in, np.int32, "i")
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Read compact neighbors data ({compact_neighbors_data_np.size})"
|
||||
)
|
||||
compact_neighbors_data_np = read_numpy_vector(f_in, np.int32, 'i')
|
||||
print(f"[{time.time() - start_time:.2f}s] Read compact neighbors data ({compact_neighbors_data_np.size})")
|
||||
compact_neighbors_data = compact_neighbors_data_np.tolist()
|
||||
del compact_neighbors_data_np
|
||||
|
||||
|
||||
# Skip storage data and write with NULL marker
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Pruning embeddings: Writing NULL storage marker."
|
||||
)
|
||||
print(f"[{time.time() - start_time:.2f}s] Pruning embeddings: Writing NULL storage marker.")
|
||||
storage_fourcc = NULL_INDEX_FOURCC
|
||||
elif not prune_embeddings:
|
||||
# Read and preserve compact neighbors and storage
|
||||
compact_neighbors_data_np = read_numpy_vector(f_in, np.int32, "i")
|
||||
compact_neighbors_data_np = read_numpy_vector(f_in, np.int32, 'i')
|
||||
compact_neighbors_data = compact_neighbors_data_np.tolist()
|
||||
del compact_neighbors_data_np
|
||||
|
||||
|
||||
# Read remaining storage data
|
||||
storage_data = f_in.read()
|
||||
else:
|
||||
# Already pruned (NULL storage)
|
||||
compact_neighbors_data_np = read_numpy_vector(f_in, np.int32, "i")
|
||||
compact_neighbors_data_np = read_numpy_vector(f_in, np.int32, 'i')
|
||||
compact_neighbors_data = compact_neighbors_data_np.tolist()
|
||||
del compact_neighbors_data_np
|
||||
storage_data = b""
|
||||
|
||||
storage_data = b''
|
||||
|
||||
# Write the updated compact format
|
||||
print(f"[{time.time() - start_time:.2f}s] Writing updated compact format...")
|
||||
write_compact_format(
|
||||
f_out,
|
||||
original_hnsw_data,
|
||||
assign_probas_np,
|
||||
cum_nneighbor_per_level_np,
|
||||
levels_np,
|
||||
compact_level_ptr,
|
||||
compact_node_offsets_np,
|
||||
compact_neighbors_data,
|
||||
storage_fourcc,
|
||||
storage_data if not prune_embeddings else b"",
|
||||
)
|
||||
|
||||
write_compact_format(f_out, original_hnsw_data, assign_probas_np, cum_nneighbor_per_level_np,
|
||||
levels_np, compact_level_ptr, compact_node_offsets_np,
|
||||
compact_neighbors_data, storage_fourcc, storage_data if not prune_embeddings else b'')
|
||||
|
||||
print(f"[{time.time() - start_time:.2f}s] Conversion complete.")
|
||||
return True
|
||||
|
||||
|
||||
else:
|
||||
# is_compact=False, rewind and read original format
|
||||
f_in.seek(pos_before_compact)
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Compact flag is False, reading original format..."
|
||||
)
|
||||
|
||||
print(f"[{time.time() - start_time:.2f}s] Compact flag is False, reading original format...")
|
||||
|
||||
except EOFError:
|
||||
# No compact flag found, assume original format
|
||||
f_in.seek(pos_before_compact)
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] No compact flag found, assuming original format..."
|
||||
)
|
||||
print(f"[{time.time() - start_time:.2f}s] No compact flag found, assuming original format...")
|
||||
|
||||
# --- Handle potential extra byte in original format (like C++ code) ---
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Probing for potential extra byte before non-compact offsets..."
|
||||
)
|
||||
print(f"[{time.time() - start_time:.2f}s] Probing for potential extra byte before non-compact offsets...")
|
||||
pos_before_probe = f_in.tell()
|
||||
try:
|
||||
suspected_flag = read_struct(f_in, "<B") # Read 1 byte
|
||||
suspected_flag = read_struct(f_in, '<B') # Read 1 byte
|
||||
if suspected_flag == 0x00:
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Found and consumed an unexpected 0x00 byte."
|
||||
)
|
||||
print(f"[{time.time() - start_time:.2f}s] Found and consumed an unexpected 0x00 byte.")
|
||||
elif suspected_flag == 0x01:
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] ERROR: Found 0x01 but is_compact should be False"
|
||||
)
|
||||
print(f"[{time.time() - start_time:.2f}s] ERROR: Found 0x01 but is_compact should be False")
|
||||
raise ValueError("Inconsistent compact flag state")
|
||||
else:
|
||||
# Rewind - this byte is part of offsets data
|
||||
f_in.seek(pos_before_probe)
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Rewound to original position (byte was 0x{suspected_flag:02x})"
|
||||
)
|
||||
print(f"[{time.time() - start_time:.2f}s] Rewound to original position (byte was 0x{suspected_flag:02x})")
|
||||
except EOFError:
|
||||
f_in.seek(pos_before_probe)
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] No extra byte found (EOF), proceeding with offsets read"
|
||||
)
|
||||
print(f"[{time.time() - start_time:.2f}s] No extra byte found (EOF), proceeding with offsets read")
|
||||
|
||||
# --- Read original format data ---
|
||||
offsets_np = read_numpy_vector(f_in, np.uint64, "Q")
|
||||
offsets_np = read_numpy_vector(f_in, np.uint64, 'Q')
|
||||
print(f"[{time.time() - start_time:.2f}s] Read offsets ({offsets_np.size})")
|
||||
if len(offsets_np) != ntotal + 1:
|
||||
raise ValueError(
|
||||
f"Inconsistent offsets size: len(levels)={ntotal} but len(offsets)={len(offsets_np)}"
|
||||
)
|
||||
raise ValueError(f"Inconsistent offsets size: len(levels)={ntotal} but len(offsets)={len(offsets_np)}")
|
||||
gc.collect()
|
||||
|
||||
print(f"[{time.time() - start_time:.2f}s] Attempting to read neighbors vector...")
|
||||
neighbors_np = read_numpy_vector(f_in, np.int32, "i")
|
||||
neighbors_np = read_numpy_vector(f_in, np.int32, 'i')
|
||||
print(f"[{time.time() - start_time:.2f}s] Read neighbors ({neighbors_np.size})")
|
||||
expected_neighbors_size = offsets_np[-1] if ntotal > 0 else 0
|
||||
if neighbors_np.size != expected_neighbors_size:
|
||||
print(
|
||||
f"Warning: neighbors vector size mismatch. Expected {expected_neighbors_size} based on offsets, got {neighbors_np.size}."
|
||||
)
|
||||
print(f"Warning: neighbors vector size mismatch. Expected {expected_neighbors_size} based on offsets, got {neighbors_np.size}.")
|
||||
gc.collect()
|
||||
|
||||
original_hnsw_data["entry_point"] = read_struct(f_in, "<i")
|
||||
original_hnsw_data["max_level"] = read_struct(f_in, "<i")
|
||||
original_hnsw_data["efConstruction"] = read_struct(f_in, "<i")
|
||||
original_hnsw_data["efSearch"] = read_struct(f_in, "<i")
|
||||
original_hnsw_data["dummy_upper_beam"] = read_struct(f_in, "<i")
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Read scalar params (ep={original_hnsw_data['entry_point']}, max_lvl={original_hnsw_data['max_level']})"
|
||||
)
|
||||
original_hnsw_data['entry_point'] = read_struct(f_in, '<i')
|
||||
original_hnsw_data['max_level'] = read_struct(f_in, '<i')
|
||||
original_hnsw_data['efConstruction'] = read_struct(f_in, '<i')
|
||||
original_hnsw_data['efSearch'] = read_struct(f_in, '<i')
|
||||
original_hnsw_data['dummy_upper_beam'] = read_struct(f_in, '<i')
|
||||
print(f"[{time.time() - start_time:.2f}s] Read scalar params (ep={original_hnsw_data['entry_point']}, max_lvl={original_hnsw_data['max_level']})")
|
||||
|
||||
print(f"[{time.time() - start_time:.2f}s] Checking for storage data...")
|
||||
storage_fourcc = None
|
||||
try:
|
||||
storage_fourcc = read_struct(f_in, "<I")
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Found storage fourcc: {storage_fourcc:08x}."
|
||||
)
|
||||
storage_fourcc = read_struct(f_in, '<I')
|
||||
print(f"[{time.time() - start_time:.2f}s] Found storage fourcc: {storage_fourcc:08x}.")
|
||||
except EOFError:
|
||||
print(f"[{time.time() - start_time:.2f}s] No storage data found (EOF).")
|
||||
print(f"[{time.time() - start_time:.2f}s] No storage data found (EOF).")
|
||||
except Exception as e:
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Error reading potential storage data: {e}"
|
||||
)
|
||||
print(f"[{time.time() - start_time:.2f}s] Error reading potential storage data: {e}")
|
||||
|
||||
|
||||
# --- Perform Conversion ---
|
||||
print(f"[{time.time() - start_time:.2f}s] Converting to CSR format...")
|
||||
@@ -475,21 +373,17 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=
|
||||
|
||||
current_level_ptr_idx = 0
|
||||
current_data_idx = 0
|
||||
total_valid_neighbors_counted = 0 # For validation
|
||||
total_valid_neighbors_counted = 0 # For validation
|
||||
|
||||
# Optimize calculation by getting slices once per node if possible
|
||||
for i in range(ntotal):
|
||||
if i > 0 and i % (ntotal // 100 or 1) == 0: # Log progress roughly every 1%
|
||||
if i > 0 and i % (ntotal // 100 or 1) == 0: # Log progress roughly every 1%
|
||||
progress = (i / ntotal) * 100
|
||||
elapsed = time.time() - start_time
|
||||
print(
|
||||
f"\r[{elapsed:.2f}s] Converting node {i}/{ntotal} ({progress:.1f}%)...",
|
||||
end="",
|
||||
)
|
||||
print(f"\r[{elapsed:.2f}s] Converting node {i}/{ntotal} ({progress:.1f}%)...", end="")
|
||||
|
||||
node_max_level = levels_np[i] - 1
|
||||
if node_max_level < -1:
|
||||
node_max_level = -1
|
||||
if node_max_level < -1: node_max_level = -1
|
||||
|
||||
node_ptr_start_index = current_level_ptr_idx
|
||||
compact_node_offsets_np[i] = node_ptr_start_index
|
||||
@@ -500,17 +394,13 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=
|
||||
for level in range(node_max_level + 1):
|
||||
compact_level_ptr.append(current_data_idx)
|
||||
|
||||
begin_orig_np = original_offset_start + get_cum_neighbors(
|
||||
cum_nneighbor_per_level_np, level
|
||||
)
|
||||
end_orig_np = original_offset_start + get_cum_neighbors(
|
||||
cum_nneighbor_per_level_np, level + 1
|
||||
)
|
||||
begin_orig_np = original_offset_start + get_cum_neighbors(cum_nneighbor_per_level_np, level)
|
||||
end_orig_np = original_offset_start + get_cum_neighbors(cum_nneighbor_per_level_np, level + 1)
|
||||
|
||||
begin_orig = int(begin_orig_np)
|
||||
end_orig = int(end_orig_np)
|
||||
|
||||
neighbors_len = len(neighbors_np) # Cache length
|
||||
neighbors_len = len(neighbors_np) # Cache length
|
||||
begin_orig = min(max(0, begin_orig), neighbors_len)
|
||||
end_orig = min(max(begin_orig, end_orig), neighbors_len)
|
||||
|
||||
@@ -523,117 +413,83 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=
|
||||
|
||||
if num_valid > 0:
|
||||
# Append valid neighbors
|
||||
compact_neighbors_data.extend(
|
||||
level_neighbors_slice[valid_neighbors_mask]
|
||||
)
|
||||
compact_neighbors_data.extend(level_neighbors_slice[valid_neighbors_mask])
|
||||
current_data_idx += num_valid
|
||||
total_valid_neighbors_counted += num_valid
|
||||
|
||||
|
||||
compact_level_ptr.append(current_data_idx)
|
||||
current_level_ptr_idx += num_pointers_expected
|
||||
|
||||
compact_node_offsets_np[ntotal] = current_level_ptr_idx
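# --- Editor's illustrative sketch (not part of the original diff): the loop above compacts a
# fixed-width (padded) neighbor table into CSR form by keeping only entries >= 0 and recording
# where each row starts. The core step, shown with made-up toy data rather than the real HNSW
# per-level layout:
import numpy as np

padded = np.array([[3, 7, -1, -1],
                   [5, -1, -1, -1],
                   [2, 4, 6, -1]], dtype=np.int32)  # -1 marks unused neighbor slots
mask = padded >= 0
data = padded[mask]  # CSR data: [3 7 5 2 4 6]
indptr = np.concatenate(([0], np.cumsum(mask.sum(axis=1))))  # CSR row pointers: [0 2 3 6]
assert indptr[-1] == data.size  # the same size invariant the script validates later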
|
||||
print(
|
||||
f"\r[{time.time() - start_time:.2f}s] Conversion loop finished. "
|
||||
) # Clear progress line
|
||||
print(f"\r[{time.time() - start_time:.2f}s] Conversion loop finished. ") # Clear progress line
|
||||
|
||||
# --- Validation Checks ---
|
||||
print(f"[{time.time() - start_time:.2f}s] Running validation checks...")
|
||||
valid_check_passed = True
|
||||
# Check 1: Total valid neighbors count
|
||||
print(" Checking total valid neighbor count...")
|
||||
print(f" Checking total valid neighbor count...")
|
||||
expected_valid_count = np.sum(neighbors_np >= 0)
|
||||
if total_valid_neighbors_counted != len(compact_neighbors_data):
|
||||
print(
|
||||
f"Error: Mismatch between counted valid neighbors ({total_valid_neighbors_counted}) and final compact_data size ({len(compact_neighbors_data)})!",
|
||||
file=sys.stderr,
|
||||
)
|
||||
valid_check_passed = False
|
||||
print(f"Error: Mismatch between counted valid neighbors ({total_valid_neighbors_counted}) and final compact_data size ({len(compact_neighbors_data)})!", file=sys.stderr)
|
||||
valid_check_passed = False
|
||||
if expected_valid_count != len(compact_neighbors_data):
|
||||
print(
|
||||
f"Error: Mismatch between NumPy count of valid neighbors ({expected_valid_count}) and final compact_data size ({len(compact_neighbors_data)})!",
|
||||
file=sys.stderr,
|
||||
)
|
||||
valid_check_passed = False
|
||||
print(f"Error: Mismatch between NumPy count of valid neighbors ({expected_valid_count}) and final compact_data size ({len(compact_neighbors_data)})!", file=sys.stderr)
|
||||
valid_check_passed = False
|
||||
else:
|
||||
print(f" OK: Total valid neighbors = {len(compact_neighbors_data)}")
|
||||
print(f" OK: Total valid neighbors = {len(compact_neighbors_data)}")
|
||||
|
||||
# Check 2: Final pointer indices consistency
|
||||
print(" Checking final pointer indices...")
|
||||
print(f" Checking final pointer indices...")
|
||||
if compact_node_offsets_np[ntotal] != len(compact_level_ptr):
|
||||
print(
|
||||
f"Error: Final node offset ({compact_node_offsets_np[ntotal]}) doesn't match level_ptr size ({len(compact_level_ptr)})!",
|
||||
file=sys.stderr,
|
||||
)
|
||||
valid_check_passed = False
|
||||
if (
|
||||
len(compact_level_ptr) > 0 and compact_level_ptr[-1] != len(compact_neighbors_data)
|
||||
) or (len(compact_level_ptr) == 0 and len(compact_neighbors_data) != 0):
|
||||
last_ptr = compact_level_ptr[-1] if len(compact_level_ptr) > 0 else -1
|
||||
print(
|
||||
f"Error: Last level pointer ({last_ptr}) doesn't match compact_data size ({len(compact_neighbors_data)})!",
|
||||
file=sys.stderr,
|
||||
)
|
||||
valid_check_passed = False
|
||||
print(f"Error: Final node offset ({compact_node_offsets_np[ntotal]}) doesn't match level_ptr size ({len(compact_level_ptr)})!", file=sys.stderr)
|
||||
valid_check_passed = False
|
||||
if (len(compact_level_ptr) > 0 and compact_level_ptr[-1] != len(compact_neighbors_data)) or \
|
||||
(len(compact_level_ptr) == 0 and len(compact_neighbors_data) != 0):
|
||||
last_ptr = compact_level_ptr[-1] if len(compact_level_ptr) > 0 else -1
|
||||
print(f"Error: Last level pointer ({last_ptr}) doesn't match compact_data size ({len(compact_neighbors_data)})!", file=sys.stderr)
|
||||
valid_check_passed = False
|
||||
else:
|
||||
print(" OK: Final pointers match data size.")
|
||||
print(f" OK: Final pointers match data size.")
|
||||
|
||||
if not valid_check_passed:
|
||||
print(
|
||||
"Error: Validation checks failed. Output file might be incorrect.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
print("Error: Validation checks failed. Output file might be incorrect.", file=sys.stderr)
|
||||
# Optional: Exit here if validation fails
|
||||
# return False
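# --- Editor's illustrative sketch (not part of the original diff): the checks above are the
# usual CSR consistency conditions, collected here as a hypothetical helper for clarity
# (argument names are illustrative, not the script's variables):
def csr_invariants_hold(node_offsets, level_ptr, data) -> bool:
    last_offset_ok = node_offsets[-1] == len(level_ptr)  # node offsets end at the pointer-array length
    last_ptr_ok = (len(level_ptr) == 0 and len(data) == 0) or level_ptr[-1] == len(data)
    return bool(last_offset_ok and last_ptr_ok)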
|
||||
|
||||
# --- Explicitly delete large intermediate arrays ---
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Deleting original neighbors and offsets arrays..."
|
||||
)
|
||||
print(f"[{time.time() - start_time:.2f}s] Deleting original neighbors and offsets arrays...")
|
||||
del neighbors_np
|
||||
del offsets_np
|
||||
gc.collect()
|
||||
|
||||
print(
|
||||
f" CSR Stats: |data|={len(compact_neighbors_data)}, |level_ptr|={len(compact_level_ptr)}"
|
||||
)
|
||||
print(f" CSR Stats: |data|={len(compact_neighbors_data)}, |level_ptr|={len(compact_level_ptr)}")
|
||||
|
||||
# --- Write CSR HNSW graph data using unified function ---
|
||||
print(
|
||||
f"[{time.time() - start_time:.2f}s] Writing CSR HNSW graph data in FAISS-compatible order..."
|
||||
)
|
||||
|
||||
print(f"[{time.time() - start_time:.2f}s] Writing CSR HNSW graph data in FAISS-compatible order...")
|
||||
|
||||
# Determine storage fourcc and data based on prune_embeddings
|
||||
if prune_embeddings:
|
||||
print(" Pruning embeddings: Writing NULL storage marker.")
|
||||
print(f" Pruning embeddings: Writing NULL storage marker.")
|
||||
output_storage_fourcc = NULL_INDEX_FOURCC
|
||||
storage_data = b""
|
||||
storage_data = b''
|
||||
else:
|
||||
# Keep embeddings - read and preserve original storage data
|
||||
if storage_fourcc and storage_fourcc != NULL_INDEX_FOURCC:
|
||||
print(" Preserving embeddings: Reading original storage data...")
|
||||
print(f" Preserving embeddings: Reading original storage data...")
|
||||
storage_data = f_in.read() # Read remaining storage data
|
||||
output_storage_fourcc = storage_fourcc
|
||||
print(f" Read {len(storage_data)} bytes of storage data")
|
||||
else:
|
||||
print(" No embeddings found in original file (NULL storage)")
|
||||
print(f" No embeddings found in original file (NULL storage)")
|
||||
output_storage_fourcc = NULL_INDEX_FOURCC
|
||||
storage_data = b""
|
||||
|
||||
storage_data = b''
|
||||
|
||||
# Use the unified write function
|
||||
write_compact_format(
|
||||
f_out,
|
||||
original_hnsw_data,
|
||||
assign_probas_np,
|
||||
cum_nneighbor_per_level_np,
|
||||
levels_np,
|
||||
compact_level_ptr,
|
||||
compact_node_offsets_np,
|
||||
compact_neighbors_data,
|
||||
output_storage_fourcc,
|
||||
storage_data,
|
||||
)
|
||||
|
||||
write_compact_format(f_out, original_hnsw_data, assign_probas_np, cum_nneighbor_per_level_np,
|
||||
levels_np, compact_level_ptr, compact_node_offsets_np,
|
||||
compact_neighbors_data, output_storage_fourcc, storage_data)
|
||||
|
||||
# Clean up memory
|
||||
del assign_probas_np, cum_nneighbor_per_level_np, levels_np
|
||||
del compact_neighbors_data, compact_level_ptr, compact_node_offsets_np
|
||||
@@ -647,63 +503,40 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=
print(f"Error: Input file not found: {input_filename}", file=sys.stderr)
return False
except MemoryError as e:
print(f"\nFatal MemoryError during conversion: {e}. Insufficient RAM.", file=sys.stderr)
# Clean up potentially partially written output file?
try:
os.remove(output_filename)
except OSError:
pass
return False
print(f"\nFatal MemoryError during conversion: {e}. Insufficient RAM.", file=sys.stderr)
# Clean up potentially partially written output file?
try: os.remove(output_filename)
except OSError: pass
return False
except EOFError as e:
print(
f"Error: Reached end of file unexpectedly reading {input_filename}. {e}",
file=sys.stderr,
)
try:
os.remove(output_filename)
except OSError:
pass
print(f"Error: Reached end of file unexpectedly reading {input_filename}. {e}", file=sys.stderr)
try: os.remove(output_filename)
except OSError: pass
return False
except Exception as e:
print(f"An unexpected error occurred during conversion: {e}", file=sys.stderr)
import traceback

traceback.print_exc()
try:
os.remove(output_filename)
except OSError:
pass
except OSError: pass
return False
# Ensure neighbors_np is deleted even if an error occurs after its allocation
finally:
try:
if "neighbors_np" in locals() and neighbors_np is not None:
del neighbors_np
gc.collect()
except NameError:
pass
if 'neighbors_np' in locals() and neighbors_np is not None:
del neighbors_np
gc.collect()


# --- Script Execution ---
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Convert a Faiss IndexHNSWFlat file to a CSR-based HNSW graph file."
)
parser = argparse.ArgumentParser(description="Convert a Faiss IndexHNSWFlat file to a CSR-based HNSW graph file.")
parser.add_argument("input_index_file", help="Path to the input IndexHNSWFlat file")
parser.add_argument(
"output_csr_graph_file", help="Path to write the output CSR HNSW graph file"
)
parser.add_argument(
"--prune-embeddings",
action="store_true",
default=True,
help="Prune embedding storage (write NULL storage marker)",
)
parser.add_argument(
"--keep-embeddings",
action="store_true",
help="Keep embedding storage (overrides --prune-embeddings)",
)
parser.add_argument("output_csr_graph_file", help="Path to write the output CSR HNSW graph file")
parser.add_argument("--prune-embeddings", action="store_true", default=True,
help="Prune embedding storage (write NULL storage marker)")
parser.add_argument("--keep-embeddings", action="store_true",
help="Keep embedding storage (overrides --prune-embeddings)")

args = parser.parse_args()

@@ -712,12 +545,10 @@ if __name__ == "__main__":
sys.exit(1)

if os.path.abspath(args.input_index_file) == os.path.abspath(args.output_csr_graph_file):
print("Error: Input and output filenames cannot be the same.", file=sys.stderr)
sys.exit(1)
print(f"Error: Input and output filenames cannot be the same.", file=sys.stderr)
sys.exit(1)

prune_embeddings = args.prune_embeddings and not args.keep_embeddings
success = convert_hnsw_graph_to_csr(
args.input_index_file, args.output_csr_graph_file, prune_embeddings
)
success = convert_hnsw_graph_to_csr(args.input_index_file, args.output_csr_graph_file, prune_embeddings)
if not success:
sys.exit(1)
sys.exit(1)
@@ -1,19 +1,19 @@
import logging
import os
import shutil
from pathlib import Path
from typing import Any, Literal

import numpy as np
import os
from pathlib import Path
from typing import Dict, Any, List, Literal, Optional
import shutil
import logging

from leann.searcher_base import BaseSearcher
from .convert_to_csr import convert_hnsw_graph_to_csr

from leann.registry import register_backend
from leann.interface import (
LeannBackendBuilderInterface,
LeannBackendFactoryInterface,
LeannBackendBuilderInterface,
LeannBackendSearcherInterface,
)
from leann.registry import register_backend
from leann.searcher_base import BaseSearcher

from .convert_to_csr import convert_hnsw_graph_to_csr

logger = logging.getLogger(__name__)

@@ -48,14 +48,8 @@ class HNSWBuilder(LeannBackendBuilderInterface):
self.efConstruction = self.build_params.setdefault("efConstruction", 200)
self.distance_metric = self.build_params.setdefault("distance_metric", "mips")
self.dimensions = self.build_params.get("dimensions")
if not self.is_recompute:
if self.is_compact:
# TODO: support this case @andy
raise ValueError(
"is_recompute is False, but is_compact is True. This combination is not supported yet; set is_compact to False to use the original HNSW index."
)

def build(self, data: np.ndarray, ids: list[str], index_path: str, **kwargs):
def build(self, data: np.ndarray, ids: List[str], index_path: str, **kwargs):
from . import faiss # type: ignore

path = Path(index_path)
@@ -101,12 +95,16 @@ class HNSWBuilder(LeannBackendBuilderInterface):
|
||||
# index_file_old = index_file.with_suffix(".old")
|
||||
# shutil.move(str(index_file), str(index_file_old))
|
||||
shutil.move(str(csr_temp_file), str(index_file))
|
||||
logger.info(f"INFO: Replaced original index with {mode_str} version at '{index_file}'")
|
||||
logger.info(
|
||||
f"INFO: Replaced original index with {mode_str} version at '{index_file}'"
|
||||
)
|
||||
else:
|
||||
# Clean up and fail fast
|
||||
if csr_temp_file.exists():
|
||||
os.remove(csr_temp_file)
|
||||
raise RuntimeError("CSR conversion failed - cannot proceed with compact format")
|
||||
raise RuntimeError(
|
||||
"CSR conversion failed - cannot proceed with compact format"
|
||||
)
|
||||
|
||||
|
||||
class HNSWSearcher(BaseSearcher):
|
||||
@@ -144,7 +142,7 @@ class HNSWSearcher(BaseSearcher):
|
||||
self,
|
||||
query: np.ndarray,
|
||||
top_k: int,
|
||||
zmq_port: int | None = None,
|
||||
zmq_port: Optional[int] = None,
|
||||
complexity: int = 64,
|
||||
beam_width: int = 1,
|
||||
prune_ratio: float = 0.0,
|
||||
@@ -152,7 +150,7 @@ class HNSWSearcher(BaseSearcher):
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
batch_size: int = 0,
|
||||
**kwargs,
|
||||
) -> dict[str, Any]:
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Search for nearest neighbors using HNSW index.
|
||||
|
||||
@@ -181,7 +179,9 @@ class HNSWSearcher(BaseSearcher):
|
||||
raise RuntimeError("Recompute is required for pruned index.")
|
||||
if recompute_embeddings:
|
||||
if zmq_port is None:
|
||||
raise ValueError("zmq_port must be provided if recompute_embeddings is True")
|
||||
raise ValueError(
|
||||
"zmq_port must be provided if recompute_embeddings is True"
|
||||
)
|
||||
|
||||
if query.dtype != np.float32:
|
||||
query = query.astype(np.float32)
|
||||
@@ -190,7 +190,9 @@ class HNSWSearcher(BaseSearcher):
|
||||
|
||||
params = faiss.SearchParametersHNSW()
|
||||
if zmq_port is not None:
|
||||
params.zmq_port = zmq_port # C++ code won't use this if recompute_embeddings is False
|
||||
params.zmq_port = (
|
||||
zmq_port # C++ code won't use this if recompute_embeddings is False
|
||||
)
|
||||
params.efSearch = complexity
|
||||
params.beam_size = beam_width
|
||||
|
||||
@@ -203,7 +205,9 @@ class HNSWSearcher(BaseSearcher):
|
||||
params.send_neigh_times_ratio = 0.0
|
||||
elif pruning_strategy == "proportional":
|
||||
params.local_prune = False
|
||||
params.send_neigh_times_ratio = 1.0 # Any value > 1e-6 triggers proportional mode
|
||||
params.send_neigh_times_ratio = (
|
||||
1.0 # Any value > 1e-6 triggers proportional mode
|
||||
)
|
||||
else: # "global"
|
||||
params.local_prune = False
|
||||
params.send_neigh_times_ratio = 0.0
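# --- Editor's illustrative sketch (not part of the original diff): the branch above maps the
# user-facing pruning_strategy string onto the two low-level knobs on the patched Faiss search
# params. Summarized as a hypothetical lookup table (the "local" entry is inferred from the
# branch not shown in this hunk and is an assumption):
_PRUNING_PARAMS = {
    "local": {"local_prune": True, "send_neigh_times_ratio": 0.0},
    "proportional": {"local_prune": False, "send_neigh_times_ratio": 1.0},  # any value > 1e-6
    "global": {"local_prune": False, "send_neigh_times_ratio": 0.0},
}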
|
||||
@@ -224,6 +228,8 @@ class HNSWSearcher(BaseSearcher):
|
||||
params,
|
||||
)
|
||||
|
||||
string_labels = [[str(int_label) for int_label in batch_labels] for batch_labels in labels]
|
||||
string_labels = [
|
||||
[str(int_label) for int_label in batch_labels] for batch_labels in labels
|
||||
]
|
||||
|
||||
return {"labels": string_labels, "distances": distances}
|
||||
|
||||
@@ -3,17 +3,17 @@ HNSW-specific embedding server
"""

import argparse
import json
import logging
import os
import sys
import threading
import time
from pathlib import Path

import msgpack
import numpy as np
import os
import zmq
import numpy as np
import msgpack
import json
from pathlib import Path
from typing import Optional
import sys
import logging

# Set up logging based on environment variable
LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
@@ -33,7 +33,7 @@ if not logger.handlers:


def create_hnsw_embedding_server(
|
||||
passages_file: str | None = None,
|
||||
passages_file: Optional[str] = None,
|
||||
zmq_port: int = 5555,
|
||||
model_name: str = "sentence-transformers/all-mpnet-base-v2",
|
||||
distance_metric: str = "mips",
|
||||
@@ -52,8 +52,8 @@ def create_hnsw_embedding_server(
|
||||
sys.path.insert(0, str(leann_core_path))
|
||||
|
||||
try:
|
||||
from leann.api import PassageManager
|
||||
from leann.embedding_compute import compute_embeddings
|
||||
from leann.api import PassageManager
|
||||
|
||||
logger.info("Successfully imported unified embedding computation module")
|
||||
except ImportError as e:
|
||||
@@ -78,22 +78,10 @@ def create_hnsw_embedding_server(
|
||||
raise ValueError("Only metadata files (.meta.json) are supported")
|
||||
|
||||
# Load metadata to get passage sources
|
||||
with open(passages_file) as f:
|
||||
with open(passages_file, "r") as f:
|
||||
meta = json.load(f)
|
||||
|
||||
# Convert relative paths to absolute paths based on metadata file location
|
||||
metadata_dir = Path(passages_file).parent.parent # Go up one level from the metadata file
|
||||
passage_sources = []
|
||||
for source in meta["passage_sources"]:
|
||||
source_copy = source.copy()
|
||||
# Convert relative paths to absolute paths
|
||||
if not Path(source_copy["path"]).is_absolute():
|
||||
source_copy["path"] = str(metadata_dir / source_copy["path"])
|
||||
if not Path(source_copy["index_path"]).is_absolute():
|
||||
source_copy["index_path"] = str(metadata_dir / source_copy["index_path"])
|
||||
passage_sources.append(source_copy)
|
||||
|
||||
passages = PassageManager(passage_sources)
|
||||
passages = PassageManager(meta["passage_sources"])
|
||||
logger.info(
|
||||
f"Loaded PassageManager with {len(passages.global_offset_map)} passages from metadata"
|
||||
)
|
||||
@@ -132,7 +120,9 @@ def create_hnsw_embedding_server(
|
||||
response = embeddings.tolist()
|
||||
socket.send(msgpack.packb(response))
|
||||
e2e_end = time.time()
|
||||
logger.info(f"⏱️ Text embedding E2E time: {e2e_end - e2e_start:.6f}s")
|
||||
logger.info(
|
||||
f"⏱️ Text embedding E2E time: {e2e_end - e2e_start:.6f}s"
|
||||
)
|
||||
continue
|
||||
|
||||
# Handle distance calculation requests
|
||||
@@ -158,13 +148,17 @@ def create_hnsw_embedding_server(
|
||||
texts.append(txt)
|
||||
except KeyError:
|
||||
logger.error(f"Passage ID {nid} not found")
|
||||
raise RuntimeError(f"FATAL: Passage with ID {nid} not found")
|
||||
raise RuntimeError(
|
||||
f"FATAL: Passage with ID {nid} not found"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Exception looking up passage ID {nid}: {e}")
|
||||
raise
|
||||
|
||||
# Process embeddings
|
||||
embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
|
||||
embeddings = compute_embeddings(
|
||||
texts, model_name, mode=embedding_mode
|
||||
)
|
||||
logger.info(
|
||||
f"Computed embeddings for {len(texts)} texts, shape: {embeddings.shape}"
|
||||
)
|
||||
@@ -178,12 +172,18 @@ def create_hnsw_embedding_server(
|
||||
distances = -np.dot(embeddings, query_vector)
|
||||
|
||||
response_payload = distances.flatten().tolist()
|
||||
response_bytes = msgpack.packb([response_payload], use_single_float=True)
|
||||
logger.debug(f"Sending distance response with {len(distances)} distances")
|
||||
response_bytes = msgpack.packb(
|
||||
[response_payload], use_single_float=True
|
||||
)
|
||||
logger.debug(
|
||||
f"Sending distance response with {len(distances)} distances"
|
||||
)
|
||||
|
||||
socket.send(response_bytes)
|
||||
e2e_end = time.time()
|
||||
logger.info(f"⏱️ Distance calculation E2E time: {e2e_end - e2e_start:.6f}s")
|
||||
logger.info(
|
||||
f"⏱️ Distance calculation E2E time: {e2e_end - e2e_start:.6f}s"
|
||||
)
|
||||
continue
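# --- Editor's illustrative sketch (not part of the original diff): for the "mips" metric the
# server negates inner products so that smaller values mean better matches, letting callers
# treat the result uniformly as distances. Toy example:
import numpy as np

q = np.array([1.0, 0.0], dtype=np.float32)
embs = np.array([[0.9, 0.1], [0.2, 0.8]], dtype=np.float32)
dists = -embs @ q  # array([-0.9, -0.2]); the first row is the best match (most negative)
assert dists.argmin() == 0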
|
||||
|
||||
# Standard embedding request (passage ID lookup)
|
||||
@@ -208,7 +208,9 @@ def create_hnsw_embedding_server(
|
||||
passage_data = passages.get_passage(str(nid))
|
||||
txt = passage_data["text"]
|
||||
if not txt:
|
||||
raise RuntimeError(f"FATAL: Empty text for passage ID {nid}")
|
||||
raise RuntimeError(
|
||||
f"FATAL: Empty text for passage ID {nid}"
|
||||
)
|
||||
texts.append(txt)
|
||||
except KeyError:
|
||||
raise RuntimeError(f"FATAL: Passage with ID {nid} not found")
|
||||
@@ -227,9 +229,11 @@ def create_hnsw_embedding_server(
|
||||
logger.error(
|
||||
f"NaN or Inf detected in embeddings! Requested IDs: {node_ids[:5]}..."
|
||||
)
|
||||
raise AssertionError()
|
||||
assert False
|
||||
|
||||
hidden_contiguous_f32 = np.ascontiguousarray(embeddings, dtype=np.float32)
|
||||
hidden_contiguous_f32 = np.ascontiguousarray(
|
||||
embeddings, dtype=np.float32
|
||||
)
|
||||
response_payload = [
|
||||
list(hidden_contiguous_f32.shape),
|
||||
hidden_contiguous_f32.flatten().tolist(),
|
||||
@@ -266,15 +270,15 @@ def create_hnsw_embedding_server(
if __name__ == "__main__":
import signal
import sys


def signal_handler(sig, frame):
logger.info(f"Received signal {sig}, shutting down gracefully...")
sys.exit(0)


# Register signal handlers for graceful shutdown
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)


parser = argparse.ArgumentParser(description="HNSW Embedding service")
parser.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
parser.add_argument(

@@ -6,22 +6,24 @@ build-backend = "scikit_build_core.build"

[project]
name = "leann-backend-hnsw"
version = "0.1.14"
version = "0.1.8"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [
"leann-core==0.1.14",
"leann-core==0.1.8",
"numpy",
"pyzmq>=23.0.0",
"msgpack>=1.0.0",
]

[tool.scikit-build]
wheel.packages = ["leann_backend_hnsw"]
editable.mode = "redirect"
cmake.build-type = "Release"
build.verbose = true
build.tool-args = ["-j8"]
wheel.exclude = ["CMakeLists.txt", "src", "third_party"]
sdist.include = ["CMakeLists.txt", "src", "third_party", "leann_backend_hnsw/*.txt"]
cmake.args = ["-DCMAKE_BUILD_TYPE=Release"]
# Ensure CMake can find system libraries
build-dir = "build/{cache_tag}"
minimum-version = "build-system.requires"

# CMake definitions to optimize compilation
[tool.scikit-build.cmake.define]
CMAKE_BUILD_PARALLEL_LEVEL = "8"
CMAKE_BUILD_PARALLEL_LEVEL = "8"
SKBUILD_SOABI = "YES"
Submodule packages/leann-backend-hnsw/third_party/faiss updated: ff22e2c86b...553f937220
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "leann-core"
version = "0.1.14"
version = "0.1.8"
description = "Core API and plugin system for LEANN"
readme = "README.md"
requires-python = ">=3.9"
@@ -15,31 +15,12 @@ dependencies = [
"numpy>=1.20.0",
"tqdm>=4.60.0",
"psutil>=5.8.0",
"pyzmq>=23.0.0",
"pyzmq>=23.0.0,<27", # Cap at 26.x for manylinux2014 compatibility
"msgpack>=1.0.0",
"torch>=2.0.0",
"sentence-transformers>=2.2.0",
"llama-index-core>=0.12.0",
"llama-index-readers-file>=0.4.0", # Essential for document reading
"llama-index-embeddings-huggingface>=0.5.5", # For embeddings
"python-dotenv>=1.0.0",
"openai>=1.0.0",
"huggingface-hub>=0.20.0",
"transformers>=4.30.0",
"requests>=2.25.0",
"accelerate>=0.20.0",
"PyPDF2>=3.0.0",
"pymupdf>=1.23.0",
"pdfplumber>=0.10.0",
"mlx>=0.26.3; sys_platform == 'darwin'",
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
]

[project.optional-dependencies]
colab = [
"torch>=2.0.0,<3.0.0", # Limit torch version to avoid conflicts
"transformers>=4.30.0,<5.0.0", # Limit transformers version
"accelerate>=0.20.0,<1.0.0", # Limit accelerate version
]

[project.scripts]

@@ -14,4 +14,4 @@ from .registry import BACKEND_REGISTRY, autodiscover_backends

autodiscover_backends()

__all__ = ["BACKEND_REGISTRY", "LeannBuilder", "LeannChat", "LeannSearcher"]
__all__ = ["LeannBuilder", "LeannSearcher", "LeannChat", "BACKEND_REGISTRY"]
@@ -4,30 +4,27 @@ with the correct, original embedding logic from the user's reference code.
"""

import json
import logging
import pickle
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal

import numpy as np

from leann.interface import LeannBackendSearcherInterface

from .chat import get_llm
from .interface import LeannBackendFactoryInterface
import numpy as np
import time
from pathlib import Path
from typing import List, Dict, Any, Optional, Literal
from dataclasses import dataclass, field
from .registry import BACKEND_REGISTRY
from .interface import LeannBackendFactoryInterface
from .chat import get_llm
import logging

logger = logging.getLogger(__name__)


def compute_embeddings(
|
||||
chunks: list[str],
|
||||
chunks: List[str],
|
||||
model_name: str,
|
||||
mode: str = "sentence-transformers",
|
||||
use_server: bool = True,
|
||||
port: int | None = None,
|
||||
port: Optional[int] = None,
|
||||
is_build=False,
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
@@ -64,7 +61,9 @@ def compute_embeddings(
|
||||
)
|
||||
|
||||
|
||||
def compute_embeddings_via_server(chunks: list[str], model_name: str, port: int) -> np.ndarray:
|
||||
def compute_embeddings_via_server(
|
||||
chunks: List[str], model_name: str, port: int
|
||||
) -> np.ndarray:
|
||||
"""Computes embeddings using sentence-transformers.
|
||||
|
||||
Args:
|
||||
@@ -74,9 +73,9 @@ def compute_embeddings_via_server(chunks: list[str], model_name: str, port: int)
|
||||
logger.info(
|
||||
f"Computing embeddings for {len(chunks)} chunks using SentenceTransformer model '{model_name}' (via embedding server)..."
|
||||
)
|
||||
import zmq
|
||||
import msgpack
|
||||
import numpy as np
|
||||
import zmq
|
||||
|
||||
# Connect to embedding server
|
||||
context = zmq.Context()
|
||||
@@ -105,11 +104,11 @@ class SearchResult:
|
||||
id: str
|
||||
score: float
|
||||
text: str
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
class PassageManager:
|
||||
def __init__(self, passage_sources: list[dict[str, Any]]):
|
||||
def __init__(self, passage_sources: List[Dict[str, Any]]):
|
||||
self.offset_maps = {}
|
||||
self.passage_files = {}
|
||||
self.global_offset_map = {} # Combined map for fast lookup
|
||||
@@ -118,15 +117,8 @@ class PassageManager:
|
||||
assert source["type"] == "jsonl", "only jsonl is supported"
|
||||
passage_file = source["path"]
|
||||
index_file = source["index_path"] # .idx file
|
||||
|
||||
# Fix path resolution for Colab and other environments
|
||||
if not Path(index_file).is_absolute():
|
||||
# If relative path, try to resolve it properly
|
||||
index_file = str(Path(index_file).resolve())
|
||||
|
||||
if not Path(index_file).exists():
|
||||
raise FileNotFoundError(f"Passage index file not found: {index_file}")
|
||||
|
||||
with open(index_file, "rb") as f:
|
||||
offset_map = pickle.load(f)
|
||||
self.offset_maps[passage_file] = offset_map
|
||||
@@ -136,11 +128,11 @@ class PassageManager:
|
||||
for passage_id, offset in offset_map.items():
|
||||
self.global_offset_map[passage_id] = (passage_file, offset)
|
||||
|
||||
def get_passage(self, passage_id: str) -> dict[str, Any]:
|
||||
def get_passage(self, passage_id: str) -> Dict[str, Any]:
|
||||
if passage_id in self.global_offset_map:
|
||||
passage_file, offset = self.global_offset_map[passage_id]
|
||||
# Lazy file opening - only open when needed
|
||||
with open(passage_file, encoding="utf-8") as f:
|
||||
with open(passage_file, "r", encoding="utf-8") as f:
|
||||
f.seek(offset)
|
||||
return json.loads(f.readline())
|
||||
raise KeyError(f"Passage ID not found: {passage_id}")
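# --- Editor's illustrative sketch (not part of the original diff): get_passage() works because
# an offset map lets a passage be fetched with one seek instead of scanning the JSONL file. A
# self-contained sketch of building and using such a map (field names are illustrative):
import json

def build_offset_map(jsonl_path: str) -> dict:
    offsets = {}
    with open(jsonl_path, "rb") as f:
        while True:
            pos = f.tell()
            line = f.readline()
            if not line:
                break
            offsets[json.loads(line)["id"]] = pos  # byte offset of each passage's line
    return offsets

def read_passage(jsonl_path: str, offsets: dict, passage_id: str) -> dict:
    with open(jsonl_path, encoding="utf-8") as f:
        f.seek(offsets[passage_id])
        return json.loads(f.readline())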
|
||||
@@ -151,12 +143,14 @@ class LeannBuilder:
|
||||
self,
|
||||
backend_name: str,
|
||||
embedding_model: str = "facebook/contriever",
|
||||
dimensions: int | None = None,
|
||||
dimensions: Optional[int] = None,
|
||||
embedding_mode: str = "sentence-transformers",
|
||||
**backend_kwargs,
|
||||
):
|
||||
self.backend_name = backend_name
|
||||
backend_factory: LeannBackendFactoryInterface | None = BACKEND_REGISTRY.get(backend_name)
|
||||
backend_factory: LeannBackendFactoryInterface | None = BACKEND_REGISTRY.get(
|
||||
backend_name
|
||||
)
|
||||
if backend_factory is None:
|
||||
raise ValueError(f"Backend '{backend_name}' not found or not registered.")
|
||||
self.backend_factory = backend_factory
|
||||
@@ -164,9 +158,9 @@ class LeannBuilder:
|
||||
self.dimensions = dimensions
|
||||
self.embedding_mode = embedding_mode
|
||||
self.backend_kwargs = backend_kwargs
|
||||
self.chunks: list[dict[str, Any]] = []
|
||||
self.chunks: List[Dict[str, Any]] = []
|
||||
|
||||
def add_text(self, text: str, metadata: dict[str, Any] | None = None):
|
||||
def add_text(self, text: str, metadata: Optional[Dict[str, Any]] = None):
|
||||
if metadata is None:
|
||||
metadata = {}
|
||||
passage_id = metadata.get("id", str(len(self.chunks)))
|
||||
@@ -196,7 +190,9 @@ class LeannBuilder:
|
||||
try:
|
||||
from tqdm import tqdm
|
||||
|
||||
chunk_iterator = tqdm(self.chunks, desc="Writing passages", unit="chunk")
|
||||
chunk_iterator = tqdm(
|
||||
self.chunks, desc="Writing passages", unit="chunk"
|
||||
)
|
||||
except ImportError:
|
||||
chunk_iterator = self.chunks
|
||||
|
||||
@@ -226,7 +222,9 @@ class LeannBuilder:
|
||||
string_ids = [chunk["id"] for chunk in self.chunks]
|
||||
current_backend_kwargs = {**self.backend_kwargs, "dimensions": self.dimensions}
|
||||
builder_instance = self.backend_factory.builder(**current_backend_kwargs)
|
||||
builder_instance.build(embeddings, string_ids, index_path, **current_backend_kwargs)
|
||||
builder_instance.build(
|
||||
embeddings, string_ids, index_path, **current_backend_kwargs
|
||||
)
|
||||
leann_meta_path = index_dir / f"{index_name}.meta.json"
|
||||
meta_data = {
|
||||
"version": "1.0",
|
||||
@@ -275,7 +273,9 @@ class LeannBuilder:
|
||||
ids, embeddings = data
|
||||
|
||||
if not isinstance(embeddings, np.ndarray):
|
||||
raise ValueError(f"Expected embeddings to be numpy array, got {type(embeddings)}")
|
||||
raise ValueError(
|
||||
f"Expected embeddings to be numpy array, got {type(embeddings)}"
|
||||
)
|
||||
|
||||
if len(ids) != embeddings.shape[0]:
|
||||
raise ValueError(
|
||||
@@ -287,7 +287,9 @@ class LeannBuilder:
|
||||
if self.dimensions is None:
|
||||
self.dimensions = embedding_dim
|
||||
elif self.dimensions != embedding_dim:
|
||||
raise ValueError(f"Dimension mismatch: expected {self.dimensions}, got {embedding_dim}")
|
||||
raise ValueError(
|
||||
f"Dimension mismatch: expected {self.dimensions}, got {embedding_dim}"
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Building index from precomputed embeddings: {len(ids)} items, {embedding_dim} dimensions"
|
||||
@@ -372,24 +374,26 @@ class LeannBuilder:
|
||||
with open(leann_meta_path, "w", encoding="utf-8") as f:
|
||||
json.dump(meta_data, f, indent=2)
|
||||
|
||||
logger.info(f"Index built successfully from precomputed embeddings: {index_path}")
|
||||
logger.info(
|
||||
f"Index built successfully from precomputed embeddings: {index_path}"
|
||||
)
|
||||
|
||||
|
||||
class LeannSearcher:
|
||||
def __init__(self, index_path: str, enable_warmup: bool = False, **backend_kwargs):
|
||||
# Fix path resolution for Colab and other environments
|
||||
if not Path(index_path).is_absolute():
|
||||
index_path = str(Path(index_path).resolve())
|
||||
|
||||
self.meta_path_str = f"{index_path}.meta.json"
|
||||
if not Path(self.meta_path_str).exists():
|
||||
raise FileNotFoundError(f"Leann metadata file not found at {self.meta_path_str}")
|
||||
with open(self.meta_path_str, encoding="utf-8") as f:
|
||||
raise FileNotFoundError(
|
||||
f"Leann metadata file not found at {self.meta_path_str}"
|
||||
)
|
||||
with open(self.meta_path_str, "r", encoding="utf-8") as f:
|
||||
self.meta_data = json.load(f)
|
||||
backend_name = self.meta_data["backend_name"]
|
||||
self.embedding_model = self.meta_data["embedding_model"]
|
||||
# Support both old and new format
|
||||
self.embedding_mode = self.meta_data.get("embedding_mode", "sentence-transformers")
|
||||
self.embedding_mode = self.meta_data.get(
|
||||
"embedding_mode", "sentence-transformers"
|
||||
)
|
||||
self.passage_manager = PassageManager(self.meta_data.get("passage_sources", []))
|
||||
backend_factory = BACKEND_REGISTRY.get(backend_name)
|
||||
if backend_factory is None:
|
||||
@@ -411,7 +415,7 @@ class LeannSearcher:
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
expected_zmq_port: int = 5557,
|
||||
**kwargs,
|
||||
) -> list[SearchResult]:
|
||||
) -> List[SearchResult]:
|
||||
logger.info("🔍 LeannSearcher.search() called:")
|
||||
logger.info(f" Query: '{query}'")
|
||||
logger.info(f" Top_k: {top_k}")
|
||||
@@ -438,7 +442,7 @@ class LeannSearcher:
|
||||
zmq_port=zmq_port,
|
||||
)
|
||||
# logger.info(f" Generated embedding shape: {query_embedding.shape}")
|
||||
time.time() - start_time
|
||||
embedding_time = time.time() - start_time
|
||||
# logger.info(f" Embedding time: {embedding_time} seconds")
|
||||
|
||||
start_time = time.time()
|
||||
@@ -453,15 +457,17 @@ class LeannSearcher:
|
||||
zmq_port=zmq_port,
|
||||
**kwargs,
|
||||
)
|
||||
time.time() - start_time
|
||||
search_time = time.time() - start_time
|
||||
# logger.info(f" Search time: {search_time} seconds")
|
||||
logger.info(f" Backend returned: labels={len(results.get('labels', [[]])[0])} results")
|
||||
logger.info(
|
||||
f" Backend returned: labels={len(results.get('labels', [[]])[0])} results"
|
||||
)
|
||||
|
||||
enriched_results = []
|
||||
if "labels" in results and "distances" in results:
|
||||
logger.info(f" Processing {len(results['labels'][0])} passage IDs:")
|
||||
for i, (string_id, dist) in enumerate(
|
||||
zip(results["labels"][0], results["distances"][0], strict=False)
|
||||
zip(results["labels"][0], results["distances"][0])
|
||||
):
|
||||
try:
|
||||
passage_data = self.passage_manager.get_passage(string_id)
|
||||
@@ -473,15 +479,15 @@ class LeannSearcher:
|
||||
metadata=passage_data.get("metadata", {}),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# Color codes for better logging
|
||||
GREEN = "\033[92m"
|
||||
BLUE = "\033[94m"
|
||||
YELLOW = "\033[93m"
|
||||
RESET = "\033[0m"
|
||||
|
||||
|
||||
# Truncate text for display (first 100 chars)
|
||||
display_text = passage_data["text"]
|
||||
display_text = passage_data['text']
|
||||
logger.info(
|
||||
f" {GREEN}✓{RESET} {BLUE}[{i + 1:2d}]{RESET} {YELLOW}ID:{RESET} '{string_id}' {YELLOW}Score:{RESET} {dist:.4f} {YELLOW}Text:{RESET} {display_text}"
|
||||
)
|
||||
@@ -499,7 +505,7 @@ class LeannChat:
|
||||
def __init__(
|
||||
self,
|
||||
index_path: str,
|
||||
llm_config: dict[str, Any] | None = None,
|
||||
llm_config: Optional[Dict[str, Any]] = None,
|
||||
enable_warmup: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
@@ -515,7 +521,7 @@ class LeannChat:
|
||||
prune_ratio: float = 0.0,
|
||||
recompute_embeddings: bool = True,
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
llm_kwargs: dict[str, Any] | None = None,
|
||||
llm_kwargs: Optional[Dict[str, Any]] = None,
|
||||
expected_zmq_port: int = 5557,
|
||||
**search_kwargs,
|
||||
):
|
||||
|
||||
@@ -4,12 +4,11 @@ This file contains the chat generation logic for the LEANN project,
supporting different backends like Ollama, Hugging Face Transformers, and a simulation mode.
"""

import difflib
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, List
import logging
import os
from abc import ABC, abstractmethod
from typing import Any

import difflib
import torch

# Configure logging
@@ -17,11 +16,10 @@ logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def check_ollama_models() -> list[str]:
|
||||
def check_ollama_models() -> List[str]:
|
||||
"""Check available Ollama models and return a list"""
|
||||
try:
|
||||
import requests
|
||||
|
||||
response = requests.get("http://localhost:11434/api/tags", timeout=5)
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
@@ -33,52 +31,51 @@ def check_ollama_models() -> list[str]:
|
||||
|
||||
def check_ollama_model_exists_remotely(model_name: str) -> tuple[bool, list[str]]:
|
||||
"""Check if a model exists in Ollama's remote library and return available tags
|
||||
|
||||
|
||||
Returns:
|
||||
(model_exists, available_tags): bool and list of matching tags
|
||||
"""
|
||||
try:
|
||||
import re
|
||||
|
||||
import requests
|
||||
|
||||
import re
|
||||
|
||||
# Split model name and tag
|
||||
if ":" in model_name:
|
||||
base_model, requested_tag = model_name.split(":", 1)
|
||||
if ':' in model_name:
|
||||
base_model, requested_tag = model_name.split(':', 1)
|
||||
else:
|
||||
base_model, requested_tag = model_name, None
|
||||
|
||||
|
||||
# First check if base model exists in library
|
||||
library_response = requests.get("https://ollama.com/library", timeout=8)
|
||||
if library_response.status_code != 200:
|
||||
return True, [] # Assume exists if can't check
|
||||
|
||||
|
||||
# Extract model names from library page
|
||||
models_in_library = re.findall(r'href="/library/([^"]+)"', library_response.text)
|
||||
|
||||
|
||||
if base_model not in models_in_library:
|
||||
return False, [] # Base model doesn't exist
|
||||
|
||||
|
||||
# If base model exists, get available tags
|
||||
tags_response = requests.get(f"https://ollama.com/library/{base_model}/tags", timeout=8)
|
||||
if tags_response.status_code != 200:
|
||||
return True, [] # Base model exists but can't get tags
|
||||
|
||||
|
||||
# Extract tags for this model - be more specific to avoid HTML artifacts
|
||||
tag_pattern = rf"{re.escape(base_model)}:[a-zA-Z0-9\.\-_]+"
|
||||
tag_pattern = rf'{re.escape(base_model)}:[a-zA-Z0-9\.\-_]+'
|
||||
raw_tags = re.findall(tag_pattern, tags_response.text)
|
||||
|
||||
|
||||
# Clean up tags - remove HTML artifacts and duplicates
|
||||
available_tags = []
|
||||
seen = set()
|
||||
for tag in raw_tags:
|
||||
# Skip if it looks like HTML (contains < or >)
|
||||
if "<" in tag or ">" in tag:
|
||||
if '<' in tag or '>' in tag:
|
||||
continue
|
||||
if tag not in seen:
|
||||
seen.add(tag)
|
||||
available_tags.append(tag)
|
||||
|
||||
|
||||
# Check if exact model exists
|
||||
if requested_tag is None:
|
||||
# User just requested base model, suggest tags
|
||||
@@ -86,80 +83,76 @@ def check_ollama_model_exists_remotely(model_name: str) -> tuple[bool, list[str]
|
||||
else:
|
||||
exact_match = model_name in available_tags
|
||||
return exact_match, available_tags[:10]
|
||||
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# If scraping fails, assume model might exist (don't block user)
|
||||
return True, []
|
||||
|
||||
|
||||
def search_ollama_models_fuzzy(query: str, available_models: list[str]) -> list[str]:
|
||||
def search_ollama_models_fuzzy(query: str, available_models: List[str]) -> List[str]:
|
||||
"""Use intelligent fuzzy search for Ollama models"""
|
||||
if not available_models:
|
||||
return []
|
||||
|
||||
|
||||
query_lower = query.lower()
|
||||
suggestions = []
|
||||
|
||||
|
||||
# 1. Exact matches first
|
||||
exact_matches = [m for m in available_models if query_lower == m.lower()]
|
||||
suggestions.extend(exact_matches)
|
||||
|
||||
|
||||
# 2. Starts with query
|
||||
starts_with = [
|
||||
m for m in available_models if m.lower().startswith(query_lower) and m not in suggestions
|
||||
]
|
||||
starts_with = [m for m in available_models if m.lower().startswith(query_lower) and m not in suggestions]
|
||||
suggestions.extend(starts_with)
|
||||
|
||||
|
||||
# 3. Contains query
|
||||
contains = [m for m in available_models if query_lower in m.lower() and m not in suggestions]
|
||||
suggestions.extend(contains)
|
||||
|
||||
|
||||
# 4. Base model name matching (remove version numbers)
|
||||
def get_base_name(model_name: str) -> str:
|
||||
"""Extract base name without version (e.g., 'llama3:8b' -> 'llama3')"""
|
||||
return model_name.split(":")[0].split("-")[0]
|
||||
|
||||
return model_name.split(':')[0].split('-')[0]
|
||||
|
||||
query_base = get_base_name(query_lower)
|
||||
base_matches = [
|
||||
m
|
||||
for m in available_models
|
||||
m for m in available_models
|
||||
if get_base_name(m.lower()) == query_base and m not in suggestions
|
||||
]
|
||||
suggestions.extend(base_matches)
|
||||
|
||||
|
||||
# 5. Family/variant matching
|
||||
model_families = {
|
||||
"llama": ["llama2", "llama3", "alpaca", "vicuna", "codellama"],
|
||||
"qwen": ["qwen", "qwen2", "qwen3"],
|
||||
"gemma": ["gemma", "gemma2"],
|
||||
"phi": ["phi", "phi2", "phi3"],
|
||||
"mistral": ["mistral", "mixtral", "openhermes"],
|
||||
"dolphin": ["dolphin", "openchat"],
|
||||
"deepseek": ["deepseek", "deepseek-coder"],
|
||||
'llama': ['llama2', 'llama3', 'alpaca', 'vicuna', 'codellama'],
|
||||
'qwen': ['qwen', 'qwen2', 'qwen3'],
|
||||
'gemma': ['gemma', 'gemma2'],
|
||||
'phi': ['phi', 'phi2', 'phi3'],
|
||||
'mistral': ['mistral', 'mixtral', 'openhermes'],
|
||||
'dolphin': ['dolphin', 'openchat'],
|
||||
'deepseek': ['deepseek', 'deepseek-coder']
|
||||
}
|
||||
|
||||
|
||||
query_family = None
|
||||
for family, variants in model_families.items():
|
||||
if any(variant in query_lower for variant in variants):
|
||||
query_family = family
|
||||
break
|
||||
|
||||
|
||||
if query_family:
|
||||
family_variants = model_families[query_family]
|
||||
family_matches = [
|
||||
m
|
||||
for m in available_models
|
||||
m for m in available_models
|
||||
if any(variant in m.lower() for variant in family_variants) and m not in suggestions
|
||||
]
|
||||
suggestions.extend(family_matches)
|
||||
|
||||
|
||||
# 6. Use difflib for remaining fuzzy matches
|
||||
remaining_models = [m for m in available_models if m not in suggestions]
|
||||
difflib_matches = difflib.get_close_matches(query_lower, remaining_models, n=3, cutoff=0.4)
|
||||
suggestions.extend(difflib_matches)
|
||||
|
||||
|
||||
return suggestions[:8] # Return top 8 suggestions
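# --- Editor's illustrative sketch (not part of the original diff): step 6 above falls back to
# difflib.get_close_matches, which ranks candidates by SequenceMatcher similarity. Toy usage:
import difflib

candidates = ["llama3:8b", "qwen2:7b", "mistral:7b"]
print(difflib.get_close_matches("lama3", candidates, n=2, cutoff=0.4))  # -> ['llama3:8b']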
|
||||
|
||||
|
||||
@@ -169,13 +162,15 @@ def search_ollama_models_fuzzy(query: str, available_models: list[str]) -> list[
|
||||
# Remove this too - no need for fallback
|
||||
|
||||
|
||||
def suggest_similar_models(invalid_model: str, available_models: list[str]) -> list[str]:
|
||||
def suggest_similar_models(invalid_model: str, available_models: List[str]) -> List[str]:
|
||||
"""Use difflib to find similar model names"""
|
||||
if not available_models:
|
||||
return []
|
||||
|
||||
|
||||
# Get close matches using fuzzy matching
|
||||
suggestions = difflib.get_close_matches(invalid_model, available_models, n=3, cutoff=0.3)
|
||||
suggestions = difflib.get_close_matches(
|
||||
invalid_model, available_models, n=3, cutoff=0.3
|
||||
)
|
||||
return suggestions
|
||||
|
||||
|
||||
@@ -183,50 +178,49 @@ def check_hf_model_exists(model_name: str) -> bool:
|
||||
"""Quick check if HuggingFace model exists without downloading"""
|
||||
try:
|
||||
from huggingface_hub import model_info
|
||||
|
||||
model_info(model_name)
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def get_popular_hf_models() -> list[str]:
|
||||
def get_popular_hf_models() -> List[str]:
|
||||
"""Return a list of popular HuggingFace models for suggestions"""
|
||||
try:
|
||||
from huggingface_hub import list_models
|
||||
|
||||
|
||||
# Get popular text-generation models, sorted by downloads
|
||||
models = list_models(
|
||||
filter="text-generation",
|
||||
sort="downloads",
|
||||
direction=-1,
|
||||
limit=20, # Get top 20 most downloaded
|
||||
limit=20 # Get top 20 most downloaded
|
||||
)
|
||||
|
||||
|
||||
# Extract model names and filter for chat/conversation models
|
||||
model_names = []
|
||||
chat_keywords = ["chat", "instruct", "dialog", "conversation", "assistant"]
|
||||
|
||||
chat_keywords = ['chat', 'instruct', 'dialog', 'conversation', 'assistant']
|
||||
|
||||
for model in models:
|
||||
model_name = model.id if hasattr(model, "id") else str(model)
|
||||
model_name = model.id if hasattr(model, 'id') else str(model)
|
||||
# Prioritize models with chat-related keywords
|
||||
if any(keyword in model_name.lower() for keyword in chat_keywords):
|
||||
model_names.append(model_name)
|
||||
elif len(model_names) < 10: # Fill up with other popular models
|
||||
model_names.append(model_name)
|
||||
|
||||
|
||||
return model_names[:10] if model_names else _get_fallback_hf_models()
|
||||
|
||||
|
||||
except Exception:
|
||||
# Fallback to static list if API call fails
|
||||
return _get_fallback_hf_models()
|
||||
|
||||
|
||||
def _get_fallback_hf_models() -> list[str]:
|
||||
def _get_fallback_hf_models() -> List[str]:
|
||||
"""Fallback list of popular HuggingFace models"""
|
||||
return [
|
||||
"microsoft/DialoGPT-medium",
|
||||
"microsoft/DialoGPT-large",
|
||||
"microsoft/DialoGPT-large",
|
||||
"facebook/blenderbot-400M-distill",
|
||||
"microsoft/phi-2",
|
||||
"deepseek-ai/deepseek-llm-7b-chat",
|
||||
@@ -234,40 +228,44 @@ def _get_fallback_hf_models() -> list[str]:
|
||||
"facebook/blenderbot_small-90M",
|
||||
"microsoft/phi-1_5",
|
||||
"facebook/opt-350m",
|
||||
"EleutherAI/gpt-neo-1.3B",
|
||||
"EleutherAI/gpt-neo-1.3B"
|
||||
]
|
||||
|
||||
|
||||
def search_hf_models_fuzzy(query: str, limit: int = 10) -> list[str]:
|
||||
def search_hf_models_fuzzy(query: str, limit: int = 10) -> List[str]:
|
||||
"""Use HuggingFace Hub's native fuzzy search for model suggestions"""
|
||||
try:
|
||||
from huggingface_hub import list_models
|
||||
|
||||
|
||||
# HF Hub's search is already fuzzy! It handles typos and partial matches
|
||||
models = list_models(
|
||||
search=query, filter="text-generation", sort="downloads", direction=-1, limit=limit
|
||||
search=query,
|
||||
filter="text-generation",
|
||||
sort="downloads",
|
||||
direction=-1,
|
||||
limit=limit
|
||||
)
|
||||
|
||||
model_names = [model.id if hasattr(model, "id") else str(model) for model in models]
|
||||
|
||||
|
||||
model_names = [model.id if hasattr(model, 'id') else str(model) for model in models]
|
||||
|
||||
# If direct search doesn't return enough results, try some variations
|
||||
if len(model_names) < 3:
|
||||
# Try searching for partial matches or common variations
|
||||
variations = []
|
||||
|
||||
|
||||
# Extract base name (e.g., "gpt3" from "gpt-3.5")
|
||||
base_query = query.lower().replace("-", "").replace(".", "").replace("_", "")
|
||||
base_query = query.lower().replace('-', '').replace('.', '').replace('_', '')
|
||||
if base_query != query.lower():
|
||||
variations.append(base_query)
|
||||
|
||||
|
||||
# Try common model name patterns
|
||||
if "gpt" in query.lower():
|
||||
variations.extend(["gpt2", "gpt-neo", "gpt-j", "dialoGPT"])
|
||||
elif "llama" in query.lower():
|
||||
variations.extend(["llama2", "alpaca", "vicuna"])
|
||||
elif "bert" in query.lower():
|
||||
variations.extend(["roberta", "distilbert", "albert"])
|
||||
|
||||
if 'gpt' in query.lower():
|
||||
variations.extend(['gpt2', 'gpt-neo', 'gpt-j', 'dialoGPT'])
|
||||
elif 'llama' in query.lower():
|
||||
variations.extend(['llama2', 'alpaca', 'vicuna'])
|
||||
elif 'bert' in query.lower():
|
||||
variations.extend(['roberta', 'distilbert', 'albert'])
|
||||
|
||||
# Search with variations
|
||||
for var in variations[:2]: # Limit to 2 variations to avoid too many API calls
|
||||
try:
|
||||
@@ -276,15 +274,13 @@ def search_hf_models_fuzzy(query: str, limit: int = 10) -> list[str]:
|
||||
filter="text-generation",
|
||||
sort="downloads",
|
||||
direction=-1,
|
||||
limit=3,
|
||||
limit=3
|
||||
)
|
||||
var_names = [
|
||||
model.id if hasattr(model, "id") else str(model) for model in var_models
|
||||
]
|
||||
var_names = [model.id if hasattr(model, 'id') else str(model) for model in var_models]
|
||||
model_names.extend(var_names)
|
||||
except Exception:
|
||||
except:
|
||||
continue
|
||||
|
||||
|
||||
# Remove duplicates while preserving order
|
||||
seen = set()
|
||||
unique_models = []
|
||||
@@ -292,67 +288,65 @@ def search_hf_models_fuzzy(query: str, limit: int = 10) -> list[str]:
|
||||
if model not in seen:
|
||||
seen.add(model)
|
||||
unique_models.append(model)
|
||||
|
||||
|
||||
return unique_models[:limit]
|
||||
|
||||
|
||||
except Exception:
# If search fails, return empty list
return []

def search_hf_models(query: str, limit: int = 10) -> list[str]:
def search_hf_models(query: str, limit: int = 10) -> List[str]:
"""Simple search for HuggingFace models based on query (kept for backward compatibility)"""
return search_hf_models_fuzzy(query, limit)

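# --- Illustrative sketch, not part of the diff above ---
# A standalone way to exercise the same HuggingFace Hub fuzzy lookup that
# search_hf_models_fuzzy wraps. It assumes only the huggingface_hub package
# already imported in the function; the example query is made up.
from huggingface_hub import list_models

def preview_fuzzy_matches(query: str, limit: int = 5) -> list[str]:
    """Return the most-downloaded text-generation models matching a fuzzy query."""
    models = list_models(
        search=query,
        filter="text-generation",
        sort="downloads",
        direction=-1,
        limit=limit,
    )
    return [model.id for model in models]

# Example: preview_fuzzy_matches("dialogpt") should surface microsoft/DialoGPT-* variants.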
def validate_model_and_suggest(model_name: str, llm_type: str) -> str | None:
|
||||
def validate_model_and_suggest(model_name: str, llm_type: str) -> Optional[str]:
|
||||
"""Validate model name and provide suggestions if invalid"""
|
||||
if llm_type == "ollama":
|
||||
available_models = check_ollama_models()
|
||||
if available_models and model_name not in available_models:
|
||||
error_msg = f"Model '{model_name}' not found in your local Ollama installation."
|
||||
|
||||
|
||||
# Check if the model exists remotely and get available tags
|
||||
model_exists_remotely, available_tags = check_ollama_model_exists_remotely(model_name)
|
||||
|
||||
|
||||
if model_exists_remotely and model_name in available_tags:
|
||||
# Exact model exists remotely - suggest pulling it
|
||||
error_msg += "\n\nTo install the requested model:\n"
|
||||
error_msg += f"\n\nTo install the requested model:\n"
|
||||
error_msg += f" ollama pull {model_name}\n"
|
||||
|
||||
|
||||
# Show local alternatives
|
||||
suggestions = search_ollama_models_fuzzy(model_name, available_models)
|
||||
if suggestions:
|
||||
error_msg += "\nOr use one of these similar installed models:\n"
|
||||
for i, suggestion in enumerate(suggestions, 1):
|
||||
error_msg += f" {i}. {suggestion}\n"
|
||||
|
||||
|
||||
elif model_exists_remotely and available_tags:
|
||||
# Base model exists but requested tag doesn't - suggest correct tags
|
||||
base_model = model_name.split(":")[0]
|
||||
requested_tag = model_name.split(":", 1)[1] if ":" in model_name else None
|
||||
|
||||
error_msg += (
|
||||
f"\n\nModel '{base_model}' exists, but tag '{requested_tag}' is not available."
|
||||
)
|
||||
base_model = model_name.split(':')[0]
|
||||
requested_tag = model_name.split(':', 1)[1] if ':' in model_name else None
|
||||
|
||||
error_msg += f"\n\nModel '{base_model}' exists, but tag '{requested_tag}' is not available."
|
||||
error_msg += f"\n\nAvailable {base_model} models you can install:\n"
|
||||
for i, tag in enumerate(available_tags[:8], 1):
|
||||
error_msg += f" {i}. ollama pull {tag}\n"
|
||||
if len(available_tags) > 8:
|
||||
error_msg += f" ... and {len(available_tags) - 8} more variants\n"
|
||||
|
||||
|
||||
# Also show local alternatives
|
||||
suggestions = search_ollama_models_fuzzy(model_name, available_models)
|
||||
if suggestions:
|
||||
error_msg += "\nOr use one of these similar installed models:\n"
|
||||
for i, suggestion in enumerate(suggestions, 1):
|
||||
error_msg += f" {i}. {suggestion}\n"
|
||||
|
||||
|
||||
else:
|
||||
# Model doesn't exist remotely - show fuzzy suggestions
|
||||
suggestions = search_ollama_models_fuzzy(model_name, available_models)
|
||||
error_msg += f"\n\nModel '{model_name}' was not found in Ollama's library."
|
||||
|
||||
|
||||
if suggestions:
|
||||
error_msg += "\n\nDid you mean one of these installed models?\n"
|
||||
for i, suggestion in enumerate(suggestions, 1):
|
||||
@@ -363,25 +357,23 @@ def validate_model_and_suggest(model_name: str, llm_type: str) -> str | None:
|
||||
error_msg += f" {i}. {model}\n"
|
||||
if len(available_models) > 8:
|
||||
error_msg += f" ... and {len(available_models) - 8} more\n"
|
||||
|
||||
|
||||
error_msg += "\n\nCommands:"
|
||||
error_msg += "\n ollama list # List installed models"
|
||||
if model_exists_remotely and available_tags:
|
||||
if model_name in available_tags:
|
||||
error_msg += f"\n ollama pull {model_name} # Install requested model"
|
||||
else:
|
||||
error_msg += (
|
||||
f"\n ollama pull {available_tags[0]} # Install recommended variant"
|
||||
)
|
||||
error_msg += f"\n ollama pull {available_tags[0]} # Install recommended variant"
|
||||
error_msg += "\n https://ollama.com/library # Browse available models"
|
||||
return error_msg
|
||||
|
||||
|
||||
elif llm_type == "hf":
|
||||
# For HF models, we can do a quick existence check
|
||||
if not check_hf_model_exists(model_name):
|
||||
# Use HF Hub's native fuzzy search directly
|
||||
search_suggestions = search_hf_models_fuzzy(model_name, limit=8)
|
||||
|
||||
|
||||
error_msg = f"Model '{model_name}' not found on HuggingFace Hub."
|
||||
if search_suggestions:
|
||||
error_msg += "\n\nDid you mean one of these?\n"
|
||||
@@ -393,10 +385,10 @@ def validate_model_and_suggest(model_name: str, llm_type: str) -> str | None:
|
||||
error_msg += "\n\nPopular chat models:\n"
|
||||
for i, model in enumerate(popular_models[:5], 1):
|
||||
error_msg += f" {i}. {model}\n"
|
||||
|
||||
|
||||
error_msg += f"\nSearch more: https://huggingface.co/models?search={model_name}&pipeline_tag=text-generation"
|
||||
return error_msg
|
||||
|
||||
|
||||
return None # Model is valid or we can't check
|
||||
|
||||
|
||||
@@ -459,26 +451,27 @@ class OllamaChat(LLMInterface):
|
||||
# Check if the Ollama server is responsive
|
||||
if host:
|
||||
requests.get(host)
|
||||
|
||||
|
||||
# Pre-check model availability with helpful suggestions
|
||||
model_error = validate_model_and_suggest(model, "ollama")
|
||||
if model_error:
|
||||
raise ValueError(model_error)
|
||||
|
||||
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The 'requests' library is required for Ollama. Please install it with 'pip install requests'."
|
||||
)
|
||||
except requests.exceptions.ConnectionError:
|
||||
logger.error(f"Could not connect to Ollama at {host}. Please ensure Ollama is running.")
|
||||
logger.error(
|
||||
f"Could not connect to Ollama at {host}. Please ensure Ollama is running."
|
||||
)
|
||||
raise ConnectionError(
|
||||
f"Could not connect to Ollama at {host}. Please ensure Ollama is running."
|
||||
)
|
||||
|
||||
def ask(self, prompt: str, **kwargs) -> str:
|
||||
import json
|
||||
|
||||
import requests
|
||||
import json
|
||||
|
||||
full_url = f"{self.host}/api/generate"
|
||||
payload = {
|
||||
@@ -489,7 +482,7 @@ class OllamaChat(LLMInterface):
|
||||
}
|
||||
logger.debug(f"Sending request to Ollama: {payload}")
|
||||
try:
|
||||
logger.info("Sending request to Ollama and waiting for response...")
|
||||
logger.info(f"Sending request to Ollama and waiting for response...")
|
||||
response = requests.post(full_url, data=json.dumps(payload))
|
||||
response.raise_for_status()
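# --- Illustrative sketch, not part of the diff above ---
# A minimal standalone call against the same Ollama /api/generate endpoint that
# OllamaChat.ask posts to. The host, model name, and prompt defaults here are
# placeholder assumptions.
import json
import requests

def ollama_generate(prompt: str, model: str = "llama3.1:8b",
                    host: str = "http://localhost:11434") -> str:
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,  # ask for a single JSON reply instead of a token stream
    }
    response = requests.post(f"{host}/api/generate", data=json.dumps(payload))
    response.raise_for_status()
    return response.json().get("response", "")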
@@ -513,15 +506,15 @@ class HFChat(LLMInterface):
|
||||
|
||||
def __init__(self, model_name: str = "deepseek-ai/deepseek-llm-7b-chat"):
|
||||
logger.info(f"Initializing HFChat with model='{model_name}'")
|
||||
|
||||
|
||||
# Pre-check model availability with helpful suggestions
|
||||
model_error = validate_model_and_suggest(model_name, "hf")
|
||||
if model_error:
|
||||
raise ValueError(model_error)
|
||||
|
||||
|
||||
try:
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The 'transformers' and 'torch' libraries are required for Hugging Face models. Please install them with 'pip install transformers torch'."
|
||||
@@ -544,34 +537,36 @@ class HFChat(LLMInterface):
|
||||
model_name,
|
||||
torch_dtype=torch.float16 if self.device != "cpu" else torch.float32,
|
||||
device_map="auto" if self.device != "cpu" else None,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=True
|
||||
)
|
||||
|
||||
|
||||
# Move model to device if not using device_map
|
||||
if self.device != "cpu" and "device_map" not in str(self.model):
|
||||
self.model = self.model.to(self.device)
|
||||
|
||||
|
||||
# Set pad token if not present
|
||||
if self.tokenizer.pad_token is None:
|
||||
self.tokenizer.pad_token = self.tokenizer.eos_token
|
||||
|
||||
def ask(self, prompt: str, **kwargs) -> str:
|
||||
print("kwargs in HF: ", kwargs)
|
||||
print('kwargs in HF: ', kwargs)
|
||||
# Check if this is a Qwen model and add /no_think by default
|
||||
is_qwen_model = "qwen" in self.model.config._name_or_path.lower()
|
||||
|
||||
|
||||
# For Qwen models, automatically add /no_think to the prompt
|
||||
if is_qwen_model and "/no_think" not in prompt and "/think" not in prompt:
|
||||
prompt = prompt + " /no_think"
|
||||
|
||||
|
||||
# Prepare chat template
|
||||
messages = [{"role": "user", "content": prompt}]
|
||||
|
||||
|
||||
# Apply chat template if available
|
||||
if hasattr(self.tokenizer, "apply_chat_template"):
|
||||
try:
|
||||
formatted_prompt = self.tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Chat template failed, using raw prompt: {e}")
|
||||
@@ -582,9 +577,13 @@ class HFChat(LLMInterface):
|
||||
|
||||
# Tokenize input
|
||||
inputs = self.tokenizer(
|
||||
formatted_prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048
|
||||
formatted_prompt,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=2048
|
||||
)
|
||||
|
||||
|
||||
# Move inputs to device
|
||||
if self.device != "cpu":
|
||||
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
||||
@@ -598,29 +597,32 @@ class HFChat(LLMInterface):
|
||||
"pad_token_id": self.tokenizer.eos_token_id,
|
||||
"eos_token_id": self.tokenizer.eos_token_id,
|
||||
}
|
||||
|
||||
|
||||
# Handle temperature=0 for greedy decoding
|
||||
if generation_config["temperature"] == 0.0:
|
||||
generation_config["do_sample"] = False
|
||||
generation_config.pop("temperature")
|
||||
|
||||
logger.info(f"Generating with HuggingFace model, config: {generation_config}")
|
||||
|
||||
|
||||
# Generate
|
||||
with torch.no_grad():
|
||||
outputs = self.model.generate(**inputs, **generation_config)
|
||||
outputs = self.model.generate(
|
||||
**inputs,
|
||||
**generation_config
|
||||
)
|
||||
|
||||
# Decode response
|
||||
generated_tokens = outputs[0][inputs["input_ids"].shape[1] :]
|
||||
generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
|
||||
response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
|
||||
|
||||
|
||||
return response.strip()
|
||||
|
||||
|
||||
class OpenAIChat(LLMInterface):
|
||||
"""LLM interface for OpenAI models."""
|
||||
|
||||
def __init__(self, model: str = "gpt-4o", api_key: str | None = None):
|
||||
def __init__(self, model: str = "gpt-4o", api_key: Optional[str] = None):
|
||||
self.model = model
|
||||
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
|
||||
@@ -647,7 +649,11 @@ class OpenAIChat(LLMInterface):
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"max_tokens": kwargs.get("max_tokens", 1000),
|
||||
"temperature": kwargs.get("temperature", 0.7),
|
||||
**{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]},
|
||||
**{
|
||||
k: v
|
||||
for k, v in kwargs.items()
|
||||
if k not in ["max_tokens", "temperature"]
|
||||
},
|
||||
}
|
||||
|
||||
logger.info(f"Sending request to OpenAI with model {self.model}")
|
||||
@@ -669,7 +675,7 @@ class SimulatedChat(LLMInterface):
|
||||
return "This is a simulated answer from the LLM based on the retrieved context."
|
||||
|
||||
|
||||
def get_llm(llm_config: dict[str, Any] | None = None) -> LLMInterface:
|
||||
def get_llm(llm_config: Optional[Dict[str, Any]] = None) -> LLMInterface:
|
||||
"""
|
||||
Factory function to get an LLM interface based on configuration.
|
||||
|
||||
|
||||
@@ -5,38 +5,7 @@ from pathlib import Path
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
from .api import LeannBuilder, LeannChat, LeannSearcher
|
||||
|
||||
|
||||
def extract_pdf_text_with_pymupdf(file_path: str) -> str:
|
||||
"""Extract text from PDF using PyMuPDF for better quality."""
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
|
||||
doc = fitz.open(file_path)
|
||||
text = ""
|
||||
for page in doc:
|
||||
text += page.get_text()
|
||||
doc.close()
|
||||
return text
|
||||
except ImportError:
|
||||
# Fallback to default reader
|
||||
return None
|
||||
|
||||
|
||||
def extract_pdf_text_with_pdfplumber(file_path: str) -> str:
|
||||
"""Extract text from PDF using pdfplumber for better quality."""
|
||||
try:
|
||||
import pdfplumber
|
||||
|
||||
text = ""
|
||||
with pdfplumber.open(file_path) as pdf:
|
||||
for page in pdf.pages:
|
||||
text += page.extract_text() or ""
|
||||
return text
|
||||
except ImportError:
|
||||
# Fallback to default reader
|
||||
return None
|
||||
from .api import LeannBuilder, LeannSearcher, LeannChat
|
||||
|
||||
|
||||
class LeannCLI:
|
||||
@@ -76,12 +45,18 @@ Examples:
|
||||
# Build command
|
||||
build_parser = subparsers.add_parser("build", help="Build document index")
|
||||
build_parser.add_argument("index_name", help="Index name")
|
||||
build_parser.add_argument("--docs", type=str, required=True, help="Documents directory")
|
||||
build_parser.add_argument(
|
||||
"--docs", type=str, required=True, help="Documents directory"
|
||||
)
|
||||
build_parser.add_argument(
|
||||
"--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
|
||||
)
|
||||
build_parser.add_argument("--embedding-model", type=str, default="facebook/contriever")
|
||||
build_parser.add_argument("--force", "-f", action="store_true", help="Force rebuild")
|
||||
build_parser.add_argument(
|
||||
"--embedding-model", type=str, default="facebook/contriever"
|
||||
)
|
||||
build_parser.add_argument(
|
||||
"--force", "-f", action="store_true", help="Force rebuild"
|
||||
)
|
||||
build_parser.add_argument("--graph-degree", type=int, default=32)
|
||||
build_parser.add_argument("--complexity", type=int, default=64)
|
||||
build_parser.add_argument("--num-threads", type=int, default=1)
|
||||
@@ -127,7 +102,7 @@ Examples:
|
||||
)
|
||||
|
||||
# List command
|
||||
subparsers.add_parser("list", help="List all indexes")
|
||||
list_parser = subparsers.add_parser("list", help="List all indexes")
|
||||
|
||||
return parser
|
||||
|
||||
@@ -135,13 +110,17 @@ Examples:
|
||||
print("Stored LEANN indexes:")
|
||||
|
||||
if not self.indexes_dir.exists():
|
||||
print("No indexes found. Use 'leann build <name> --docs <dir>' to create one.")
|
||||
print(
|
||||
"No indexes found. Use 'leann build <name> --docs <dir>' to create one."
|
||||
)
|
||||
return
|
||||
|
||||
index_dirs = [d for d in self.indexes_dir.iterdir() if d.is_dir()]
|
||||
|
||||
if not index_dirs:
|
||||
print("No indexes found. Use 'leann build <name> --docs <dir>' to create one.")
|
||||
print(
|
||||
"No indexes found. Use 'leann build <name> --docs <dir>' to create one."
|
||||
)
|
||||
return
|
||||
|
||||
print(f"Found {len(index_dirs)} indexes:")
|
||||
@@ -151,58 +130,27 @@ Examples:
|
||||
|
||||
print(f" {i}. {index_name} [{status}]")
|
||||
if self.index_exists(index_name):
|
||||
index_dir / "documents.leann.meta.json"
|
||||
size_mb = sum(f.stat().st_size for f in index_dir.iterdir() if f.is_file()) / (
|
||||
1024 * 1024
|
||||
)
|
||||
meta_file = index_dir / "documents.leann.meta.json"
|
||||
size_mb = sum(
|
||||
f.stat().st_size for f in index_dir.iterdir() if f.is_file()
|
||||
) / (1024 * 1024)
|
||||
print(f" Size: {size_mb:.1f} MB")
|
||||
|
||||
if index_dirs:
|
||||
example_name = index_dirs[0].name
|
||||
print("\nUsage:")
|
||||
print(f"\nUsage:")
|
||||
print(f' leann search {example_name} "your query"')
|
||||
print(f" leann ask {example_name} --interactive")
|
||||
|
||||
def load_documents(self, docs_dir: str):
|
||||
print(f"Loading documents from {docs_dir}...")
|
||||
|
||||
# Try to use better PDF parsers first
|
||||
documents = []
|
||||
docs_path = Path(docs_dir)
|
||||
|
||||
for file_path in docs_path.rglob("*.pdf"):
|
||||
print(f"Processing PDF: {file_path}")
|
||||
|
||||
# Try PyMuPDF first (best quality)
|
||||
text = extract_pdf_text_with_pymupdf(str(file_path))
|
||||
if text is None:
|
||||
# Try pdfplumber
|
||||
text = extract_pdf_text_with_pdfplumber(str(file_path))
|
||||
|
||||
if text:
|
||||
# Create a simple document structure
|
||||
from llama_index.core import Document
|
||||
|
||||
doc = Document(text=text, metadata={"source": str(file_path)})
|
||||
documents.append(doc)
|
||||
else:
|
||||
# Fallback to default reader
|
||||
print(f"Using default reader for {file_path}")
|
||||
default_docs = SimpleDirectoryReader(
|
||||
str(file_path.parent),
|
||||
filename_as_id=True,
|
||||
required_exts=[file_path.suffix],
|
||||
).load_data()
|
||||
documents.extend(default_docs)
|
||||
|
||||
# Load other file types with default reader
|
||||
other_docs = SimpleDirectoryReader(
|
||||
documents = SimpleDirectoryReader(
|
||||
docs_dir,
|
||||
recursive=True,
|
||||
encoding="utf-8",
|
||||
required_exts=[".txt", ".md", ".docx"],
|
||||
required_exts=[".pdf", ".txt", ".md", ".docx"],
|
||||
).load_data(show_progress=True)
|
||||
documents.extend(other_docs)
|
||||
|
||||
all_texts = []
|
||||
for doc in documents:
|
||||
|
||||
@@ -4,12 +4,11 @@ Consolidates all embedding computation logic using SentenceTransformer
|
||||
Preserves all optimization parameters to ensure performance
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from typing import List, Dict, Any
|
||||
import logging
|
||||
import os
|
||||
|
||||
# Set up logger with proper level
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -18,11 +17,11 @@ log_level = getattr(logging, LOG_LEVEL, logging.WARNING)
|
||||
logger.setLevel(log_level)
|
||||
|
||||
# Global model cache to avoid repeated loading
|
||||
_model_cache: dict[str, Any] = {}
|
||||
_model_cache: Dict[str, Any] = {}
|
||||
|
||||
|
||||
def compute_embeddings(
|
||||
texts: list[str],
|
||||
texts: List[str],
|
||||
model_name: str,
|
||||
mode: str = "sentence-transformers",
|
||||
is_build: bool = False,
|
||||
@@ -60,7 +59,7 @@ def compute_embeddings(
|
||||
|
||||
|
||||
def compute_embeddings_sentence_transformers(
|
||||
texts: list[str],
|
||||
texts: List[str],
|
||||
model_name: str,
|
||||
use_fp16: bool = True,
|
||||
device: str = "auto",
|
||||
@@ -115,7 +114,9 @@ def compute_embeddings_sentence_transformers(
|
||||
logger.info(f"Using cached optimized model: {model_name}")
|
||||
model = _model_cache[cache_key]
|
||||
else:
|
||||
logger.info(f"Loading and caching optimized SentenceTransformer model: {model_name}")
|
||||
logger.info(
|
||||
f"Loading and caching optimized SentenceTransformer model: {model_name}"
|
||||
)
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
logger.info(f"Using device: {device}")
|
||||
@@ -133,7 +134,9 @@ def compute_embeddings_sentence_transformers(
|
||||
if hasattr(torch.mps, "set_per_process_memory_fraction"):
|
||||
torch.mps.set_per_process_memory_fraction(0.9)
|
||||
except AttributeError:
|
||||
logger.warning("Some MPS optimizations not available in this PyTorch version")
|
||||
logger.warning(
|
||||
"Some MPS optimizations not available in this PyTorch version"
|
||||
)
|
||||
elif device == "cpu":
|
||||
# TODO: Haven't tested this yet
|
||||
torch.set_num_threads(min(8, os.cpu_count() or 4))
|
||||
@@ -223,22 +226,25 @@ def compute_embeddings_sentence_transformers(
|
||||
device=device,
|
||||
)
|
||||
|
||||
logger.info(f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}")
|
||||
logger.info(
|
||||
f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}"
|
||||
)
|
||||
|
||||
# Validate results
|
||||
if np.isnan(embeddings).any() or np.isinf(embeddings).any():
|
||||
raise RuntimeError(f"Detected NaN or Inf values in embeddings, model: {model_name}")
|
||||
raise RuntimeError(
|
||||
f"Detected NaN or Inf values in embeddings, model: {model_name}"
|
||||
)
|
||||
|
||||
return embeddings
|
||||
|
||||
|
||||
def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray:
|
||||
def compute_embeddings_openai(texts: List[str], model_name: str) -> np.ndarray:
|
||||
# TODO: @yichuan-w add progress bar only in build mode
|
||||
"""Compute embeddings using OpenAI API"""
|
||||
try:
|
||||
import os
|
||||
|
||||
import openai
|
||||
import os
|
||||
except ImportError as e:
|
||||
raise ImportError(f"OpenAI package not installed: {e}")
|
||||
|
||||
@@ -258,10 +264,9 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray:
logger.info(
f"Computing embeddings for {len(texts)} texts using OpenAI API, model: '{model_name}'"
)
print(f"len of texts: {len(texts)}")

# OpenAI has limits on batch size and input length
max_batch_size = 1000 # Conservative batch size
max_batch_size = 100 # Conservative batch size
all_embeddings = []

try:
@@ -288,12 +293,15 @@ def compute_embeddings_openai(texts: list[str], model_name: str) -> np.ndarray:
raise

embeddings = np.array(all_embeddings, dtype=np.float32)
logger.info(f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}")
print(f"len of embeddings: {len(embeddings)}")
logger.info(
f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}"
)
return embeddings

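# --- Illustrative sketch, not part of the diff above ---
# The chunked-batching pattern implied by the max_batch_size constant, written
# with the current OpenAI Python client. The model name and batch size are
# placeholder assumptions; only the batching idea comes from the hunk itself.
import numpy as np
from openai import OpenAI

def embed_in_batches(texts: list[str], model_name: str = "text-embedding-3-small",
                     max_batch_size: int = 100) -> np.ndarray:
    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    all_embeddings: list[list[float]] = []
    for start in range(0, len(texts), max_batch_size):
        batch = texts[start:start + max_batch_size]
        response = client.embeddings.create(model=model_name, input=batch)
        all_embeddings.extend(item.embedding for item in response.data)
    return np.array(all_embeddings, dtype=np.float32)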
def compute_embeddings_mlx(chunks: list[str], model_name: str, batch_size: int = 16) -> np.ndarray:
|
||||
def compute_embeddings_mlx(
|
||||
chunks: List[str], model_name: str, batch_size: int = 16
|
||||
) -> np.ndarray:
|
||||
# TODO: @yichuan-w add progress bar only in build mode
|
||||
"""Computes embeddings using an MLX model."""
|
||||
try:
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
import time
|
||||
import atexit
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import os
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from typing import Optional
|
||||
import psutil
|
||||
|
||||
# Set up logging based on environment variable
|
||||
@@ -18,24 +18,6 @@ logging.basicConfig(
logger = logging.getLogger(__name__)

def _is_colab_environment() -> bool:
"""Check if we're running in Google Colab environment."""
return "COLAB_GPU" in os.environ or "COLAB_TPU" in os.environ

def _get_available_port(start_port: int = 5557) -> int:
"""Get an available port starting from start_port."""
port = start_port
while port < start_port + 100: # Try up to 100 ports
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("localhost", port))
return port
except OSError:
port += 1
raise RuntimeError(f"No available ports found in range {start_port}-{start_port + 100}")

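# --- Illustrative sketch, not part of the diff above ---
# How the port-probing helper above can be used when bringing up the embedding
# server: reuse the preferred port if it is free, otherwise scan forward. The
# default port mirrors _get_available_port; everything else is an assumption
# made for this example.
import socket

def pick_server_port(preferred: int = 5557) -> int:
    for candidate in range(preferred, preferred + 100):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(("localhost", candidate))
                return candidate  # the port is free to use
            except OSError:
                continue  # already taken, try the next one
    raise RuntimeError(f"No available ports found in range {preferred}-{preferred + 100}")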
def _check_port(port: int) -> bool:
|
||||
"""Check if a port is in use"""
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
@@ -182,8 +164,8 @@ class EmbeddingServerManager:
|
||||
e.g., "leann_backend_diskann.embedding_server"
|
||||
"""
|
||||
self.backend_module_name = backend_module_name
|
||||
self.server_process: subprocess.Popen | None = None
|
||||
self.server_port: int | None = None
|
||||
self.server_process: Optional[subprocess.Popen] = None
|
||||
self.server_port: Optional[int] = None
|
||||
self._atexit_registered = False
|
||||
|
||||
def start_server(
|
||||
@@ -193,69 +175,68 @@ class EmbeddingServerManager:
|
||||
embedding_mode: str = "sentence-transformers",
|
||||
**kwargs,
|
||||
) -> tuple[bool, int]:
|
||||
"""Start the embedding server."""
|
||||
"""
|
||||
Starts the embedding server process.
|
||||
|
||||
Args:
|
||||
port (int): The preferred ZMQ port for the server.
|
||||
model_name (str): The name of the embedding model to use.
|
||||
**kwargs: Additional arguments for the server.
|
||||
|
||||
Returns:
|
||||
tuple[bool, int]: (success, actual_port_used)
|
||||
"""
|
||||
passages_file = kwargs.get("passages_file")
|
||||
assert isinstance(passages_file, str), "passages_file must be a string"
|
||||
|
||||
# Check if we have a compatible server already running
|
||||
# Check if we have a compatible running server
|
||||
if self._has_compatible_running_server(model_name, passages_file):
|
||||
logger.info("Found compatible running server!")
|
||||
return True, port
|
||||
assert self.server_port is not None, (
|
||||
"a compatible running server should set server_port"
|
||||
)
|
||||
return True, self.server_port
|
||||
|
||||
# For Colab environment, use a different strategy
|
||||
if _is_colab_environment():
|
||||
logger.info("Detected Colab environment, using alternative startup strategy")
|
||||
return self._start_server_colab(port, model_name, embedding_mode, **kwargs)
|
||||
|
||||
# Find a compatible port or next available
|
||||
actual_port, is_compatible = _find_compatible_port_or_next_available(
|
||||
port, model_name, passages_file
|
||||
)
|
||||
|
||||
if is_compatible:
|
||||
logger.info(f"Found compatible server on port {actual_port}")
|
||||
return True, actual_port
|
||||
|
||||
# Start a new server
|
||||
return self._start_new_server(actual_port, model_name, embedding_mode, **kwargs)
|
||||
|
||||
def _start_server_colab(
|
||||
self,
|
||||
port: int,
|
||||
model_name: str,
|
||||
embedding_mode: str = "sentence-transformers",
|
||||
**kwargs,
|
||||
) -> tuple[bool, int]:
|
||||
"""Start server with Colab-specific configuration."""
|
||||
# Try to find an available port
|
||||
# Find available port (compatible or free)
|
||||
try:
|
||||
actual_port = _get_available_port(port)
|
||||
except RuntimeError:
|
||||
logger.error("No available ports found")
|
||||
actual_port, is_compatible = _find_compatible_port_or_next_available(
|
||||
port, model_name, passages_file
|
||||
)
|
||||
except RuntimeError as e:
|
||||
logger.error(str(e))
|
||||
return False, port
|
||||
|
||||
logger.info(f"Starting server on port {actual_port} for Colab environment")
|
||||
if is_compatible:
|
||||
logger.info(f"Using existing compatible server on port {actual_port}")
|
||||
self.server_port = actual_port
|
||||
self.server_process = None # We don't own this process
|
||||
return True, actual_port
|
||||
|
||||
# Use a simpler startup strategy for Colab
|
||||
command = self._build_server_command(actual_port, model_name, embedding_mode, **kwargs)
|
||||
if actual_port != port:
|
||||
logger.info(f"Using port {actual_port} instead of {port}")
|
||||
|
||||
try:
|
||||
# In Colab, we'll use a more direct approach
|
||||
self._launch_server_process_colab(command, actual_port)
|
||||
return self._wait_for_server_ready_colab(actual_port)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to start embedding server in Colab: {e}")
|
||||
return False, actual_port
|
||||
# Start new server
|
||||
return self._start_new_server(actual_port, model_name, embedding_mode, **kwargs)
|
||||
|
||||
def _has_compatible_running_server(self, model_name: str, passages_file: str) -> bool:
|
||||
def _has_compatible_running_server(
|
||||
self, model_name: str, passages_file: str
|
||||
) -> bool:
|
||||
"""Check if we have a compatible running server."""
|
||||
if not (self.server_process and self.server_process.poll() is None and self.server_port):
|
||||
if not (
|
||||
self.server_process
|
||||
and self.server_process.poll() is None
|
||||
and self.server_port
|
||||
):
|
||||
return False
|
||||
|
||||
if _check_process_matches_config(self.server_port, model_name, passages_file):
|
||||
logger.info(f"Existing server process (PID {self.server_process.pid}) is compatible")
|
||||
logger.info(
|
||||
f"Existing server process (PID {self.server_process.pid}) is compatible"
|
||||
)
|
||||
return True
|
||||
|
||||
logger.info("Existing server process is incompatible. Should start a new server.")
|
||||
logger.info(
|
||||
"Existing server process is incompatible. Should start a new server."
|
||||
)
|
||||
return False
|
||||
|
||||
def _start_new_server(
|
||||
@@ -288,9 +269,7 @@ class EmbeddingServerManager:
|
||||
]
|
||||
|
||||
if kwargs.get("passages_file"):
|
||||
# Convert to absolute path to ensure subprocess can find the file
|
||||
passages_file = Path(kwargs["passages_file"]).resolve()
|
||||
command.extend(["--passages-file", str(passages_file)])
|
||||
command.extend(["--passages-file", str(kwargs["passages_file"])])
|
||||
if embedding_mode != "sentence-transformers":
|
||||
command.extend(["--embedding-mode", embedding_mode])
|
||||
|
||||
@@ -367,45 +346,3 @@ class EmbeddingServerManager:
|
||||
pass
|
||||
|
||||
self.server_process = None
|
||||
|
||||
def _launch_server_process_colab(self, command: list, port: int) -> None:
|
||||
"""Launch the server process with Colab-specific settings."""
|
||||
logger.info(f"Colab Command: {' '.join(command)}")
|
||||
|
||||
# In Colab, we need to be more careful about process management
|
||||
self.server_process = subprocess.Popen(
|
||||
command,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
)
|
||||
self.server_port = port
|
||||
logger.info(f"Colab server process started with PID: {self.server_process.pid}")
|
||||
|
||||
# Register atexit callback
|
||||
if not self._atexit_registered:
|
||||
atexit.register(lambda: self.stop_server() if self.server_process else None)
|
||||
self._atexit_registered = True
|
||||
|
||||
def _wait_for_server_ready_colab(self, port: int) -> tuple[bool, int]:
|
||||
"""Wait for the server to be ready with Colab-specific timeout."""
|
||||
max_wait, wait_interval = 30, 0.5 # Shorter timeout for Colab
|
||||
|
||||
for _ in range(int(max_wait / wait_interval)):
|
||||
if _check_port(port):
|
||||
logger.info("Colab embedding server is ready!")
|
||||
return True, port
|
||||
|
||||
if self.server_process and self.server_process.poll() is not None:
|
||||
# Check for error output
|
||||
stdout, stderr = self.server_process.communicate()
|
||||
logger.error("Colab server terminated during startup.")
|
||||
logger.error(f"stdout: {stdout}")
|
||||
logger.error(f"stderr: {stderr}")
|
||||
return False, port
|
||||
|
||||
time.sleep(wait_interval)
|
||||
|
||||
logger.error(f"Colab server failed to start within {max_wait} seconds.")
|
||||
self.stop_server()
|
||||
return False, port
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Literal
|
||||
|
||||
import numpy as np
|
||||
from typing import Dict, Any, List, Literal, Optional
|
||||
|
||||
|
||||
class LeannBackendBuilderInterface(ABC):
|
||||
"""Backend interface for building indexes"""
|
||||
|
||||
@abstractmethod
|
||||
def build(self, data: np.ndarray, ids: list[str], index_path: str, **kwargs) -> None:
|
||||
def build(
|
||||
self, data: np.ndarray, ids: List[str], index_path: str, **kwargs
|
||||
) -> None:
|
||||
"""Build index
|
||||
|
||||
Args:
|
||||
@@ -34,7 +35,9 @@ class LeannBackendSearcherInterface(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def _ensure_server_running(self, passages_source_file: str, port: int | None, **kwargs) -> int:
|
||||
def _ensure_server_running(
|
||||
self, passages_source_file: str, port: Optional[int], **kwargs
|
||||
) -> int:
|
||||
"""Ensure server is running"""
|
||||
pass
|
||||
|
||||
@@ -48,9 +51,9 @@ class LeannBackendSearcherInterface(ABC):
|
||||
prune_ratio: float = 0.0,
|
||||
recompute_embeddings: bool = False,
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
zmq_port: int | None = None,
|
||||
zmq_port: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> dict[str, Any]:
|
||||
) -> Dict[str, Any]:
|
||||
"""Search for nearest neighbors
|
||||
|
||||
Args:
|
||||
@@ -74,7 +77,7 @@ class LeannBackendSearcherInterface(ABC):
|
||||
self,
|
||||
query: str,
|
||||
use_server_if_available: bool = True,
|
||||
zmq_port: int | None = None,
|
||||
zmq_port: Optional[int] = None,
|
||||
) -> np.ndarray:
|
||||
"""Compute embedding for a query string
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
# packages/leann-core/src/leann/registry.py
|
||||
|
||||
from typing import Dict, TYPE_CHECKING
|
||||
import importlib
|
||||
import importlib.metadata
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from leann.interface import LeannBackendFactoryInterface
|
||||
|
||||
BACKEND_REGISTRY: dict[str, "LeannBackendFactoryInterface"] = {}
|
||||
BACKEND_REGISTRY: Dict[str, "LeannBackendFactoryInterface"] = {}
|
||||
|
||||
|
||||
def register_backend(name: str):
|
||||
@@ -31,11 +31,13 @@ def autodiscover_backends():
|
||||
backend_module_name = dist_name.replace("-", "_")
|
||||
discovered_backends.append(backend_module_name)
|
||||
|
||||
for backend_module_name in sorted(discovered_backends): # sort for deterministic loading
|
||||
for backend_module_name in sorted(
|
||||
discovered_backends
|
||||
): # sort for deterministic loading
|
||||
try:
|
||||
importlib.import_module(backend_module_name)
|
||||
# Registration message is printed by the decorator
|
||||
except ImportError:
|
||||
except ImportError as e:
|
||||
# print(f"WARN: Could not import backend module '{backend_module_name}': {e}")
|
||||
pass
|
||||
# print("INFO: Backend auto-discovery finished.")
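# --- Illustrative sketch, not part of the diff above ---
# The backend auto-discovery loop written out end to end. The "leann-backend-"
# distribution prefix is an assumption made for this example; only the
# dash-to-underscore conversion, the sorted deterministic loading, and the
# import-with-fallback appear in the hunk itself.
import importlib
import importlib.metadata

def discover_backend_modules(prefix: str = "leann-backend-") -> list[str]:
    discovered = [
        dist.metadata["Name"].replace("-", "_")
        for dist in importlib.metadata.distributions()
        if (dist.metadata["Name"] or "").startswith(prefix)
    ]
    loaded = []
    for module_name in sorted(discovered):  # sort for deterministic loading
        try:
            importlib.import_module(module_name)
            loaded.append(module_name)
        except ImportError:
            continue  # installed but not importable; skip it
    return loaded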
@@ -1,7 +1,7 @@
|
||||
import json
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
from typing import Dict, Any, Literal, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -38,7 +38,9 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
|
||||
self.embedding_model = self.meta.get("embedding_model")
|
||||
if not self.embedding_model:
|
||||
print("WARNING: embedding_model not found in meta.json. Recompute will fail.")
|
||||
print(
|
||||
"WARNING: embedding_model not found in meta.json. Recompute will fail."
|
||||
)
|
||||
|
||||
self.embedding_mode = self.meta.get("embedding_mode", "sentence-transformers")
|
||||
|
||||
@@ -46,22 +48,26 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
backend_module_name=backend_module_name,
|
||||
)
|
||||
|
||||
def _load_meta(self) -> dict[str, Any]:
|
||||
def _load_meta(self) -> Dict[str, Any]:
|
||||
"""Loads the metadata file associated with the index."""
|
||||
# This is the corrected logic for finding the meta file.
|
||||
meta_path = self.index_dir / f"{self.index_path.name}.meta.json"
|
||||
if not meta_path.exists():
|
||||
raise FileNotFoundError(f"Leann metadata file not found at {meta_path}")
|
||||
with open(meta_path, encoding="utf-8") as f:
|
||||
with open(meta_path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
def _ensure_server_running(self, passages_source_file: str, port: int, **kwargs) -> int:
|
||||
def _ensure_server_running(
|
||||
self, passages_source_file: str, port: int, **kwargs
|
||||
) -> int:
|
||||
"""
|
||||
Ensures the embedding server is running if recompute is needed.
|
||||
This is a helper for subclasses.
|
||||
"""
|
||||
if not self.embedding_model:
|
||||
raise ValueError("Cannot use recompute mode without 'embedding_model' in meta.json.")
|
||||
raise ValueError(
|
||||
"Cannot use recompute mode without 'embedding_model' in meta.json."
|
||||
)
|
||||
|
||||
server_started, actual_port = self.embedding_server_manager.start_server(
|
||||
port=port,
|
||||
@@ -72,7 +78,9 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
enable_warmup=kwargs.get("enable_warmup", False),
|
||||
)
|
||||
if not server_started:
|
||||
raise RuntimeError(f"Failed to start embedding server on port {actual_port}")
|
||||
raise RuntimeError(
|
||||
f"Failed to start embedding server on port {actual_port}"
|
||||
)
|
||||
|
||||
return actual_port
|
||||
|
||||
@@ -101,7 +109,9 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
# on that port?
|
||||
|
||||
# Ensure we have a server with passages_file for compatibility
|
||||
passages_source_file = self.index_dir / f"{self.index_path.name}.meta.json"
|
||||
passages_source_file = (
|
||||
self.index_dir / f"{self.index_path.name}.meta.json"
|
||||
)
|
||||
# Convert to absolute path to ensure server can find it
|
||||
zmq_port = self._ensure_server_running(
|
||||
str(passages_source_file.resolve()), zmq_port
|
||||
@@ -122,8 +132,8 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
|
||||
def _compute_embedding_via_server(self, chunks: list, zmq_port: int) -> np.ndarray:
|
||||
"""Compute embeddings using the ZMQ embedding server."""
|
||||
import msgpack
|
||||
import zmq
|
||||
import msgpack
|
||||
|
||||
try:
|
||||
context = zmq.Context()
|
||||
@@ -162,9 +172,9 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
prune_ratio: float = 0.0,
|
||||
recompute_embeddings: bool = False,
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
zmq_port: int | None = None,
|
||||
zmq_port: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> dict[str, Any]:
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Search for the top_k nearest neighbors of the query vector.
|
||||
|
||||
|
||||
@@ -7,6 +7,6 @@ A revolutionary vector database that democratizes personal AI.
|
||||
__version__ = "0.1.0"
|
||||
|
||||
# Re-export main API from leann-core
|
||||
from leann_core import LeannBuilder, LeannChat, LeannSearcher
|
||||
from leann_core import LeannBuilder, LeannSearcher, LeannChat
|
||||
|
||||
__all__ = ["LeannBuilder", "LeannChat", "LeannSearcher"]
|
||||
__all__ = ["LeannBuilder", "LeannSearcher", "LeannChat"]
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "leann"
|
||||
version = "0.1.14"
|
||||
version = "0.1.8"
|
||||
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.9"
|
||||
|
||||
@@ -1,23 +1,22 @@
|
||||
import json
|
||||
import sqlite3
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
from typing import Annotated
|
||||
|
||||
import requests
|
||||
import typer
|
||||
from pathlib import Path
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing_extensions import Annotated
|
||||
import sqlite3
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
def get_safe_path(s: str) -> str:
|
||||
"""
|
||||
Remove invalid characters to sanitize a path.
|
||||
:param s: str to sanitize
|
||||
:returns: sanitized str
|
||||
"""
|
||||
ban_chars = "\\ / : * ? \" ' < > | $ \r \n".replace(" ", "")
|
||||
ban_chars = "\\ / : * ? \" ' < > | $ \r \n".replace(
|
||||
' ', '')
|
||||
for i in ban_chars:
|
||||
s = s.replace(i, "")
|
||||
return s
|
||||
@@ -27,38 +26,35 @@ def process_history(history: str):
|
||||
if history.startswith("<?xml") or history.startswith("<msg>"):
|
||||
try:
|
||||
root = ET.fromstring(history)
|
||||
title = root.find(".//title").text if root.find(".//title") is not None else None
|
||||
quoted = (
|
||||
root.find(".//refermsg/content").text
|
||||
if root.find(".//refermsg/content") is not None
|
||||
else None
|
||||
)
|
||||
title = root.find('.//title').text if root.find('.//title') is not None else None
|
||||
quoted = root.find('.//refermsg/content').text if root.find('.//refermsg/content') is not None else None
|
||||
if title and quoted:
|
||||
return {"title": title, "quoted": process_history(quoted)}
|
||||
return {
|
||||
"title": title,
|
||||
"quoted": process_history(quoted)
|
||||
}
|
||||
if title:
|
||||
return title
|
||||
except Exception:
|
||||
return history
|
||||
return history
|
||||
|
||||
|
||||
def get_message(history: dict | str):
|
||||
if isinstance(history, dict):
|
||||
if "title" in history:
|
||||
return history["title"]
|
||||
if 'title' in history:
|
||||
return history['title']
|
||||
else:
|
||||
return history
|
||||
|
||||
|
||||
def export_chathistory(user_id: str):
|
||||
res = requests.get(
|
||||
"http://localhost:48065/wechat/chatlog", params={"userId": user_id, "count": 100000}
|
||||
).json()
|
||||
for i in range(len(res["chatLogs"])):
|
||||
res["chatLogs"][i]["content"] = process_history(res["chatLogs"][i]["content"])
|
||||
res["chatLogs"][i]["message"] = get_message(res["chatLogs"][i]["content"])
|
||||
return res["chatLogs"]
|
||||
|
||||
res = requests.get("http://localhost:48065/wechat/chatlog", params={
|
||||
"userId": user_id,
|
||||
"count": 100000
|
||||
}).json()
|
||||
for i in range(len(res['chatLogs'])):
|
||||
res['chatLogs'][i]['content'] = process_history(res['chatLogs'][i]['content'])
|
||||
res['chatLogs'][i]['message'] = get_message(res['chatLogs'][i]['content'])
|
||||
return res['chatLogs']
|
||||
|
||||
@app.command()
|
||||
def export_all(dest: Annotated[Path, typer.Argument(help="Destination path to export to.")]):
|
||||
@@ -68,7 +64,7 @@ def export_all(dest: Annotated[Path, typer.Argument(help="Destination path to ex
|
||||
if not dest.is_dir():
|
||||
if not dest.exists():
|
||||
inp = typer.prompt("Destination path does not exist, create it? (y/n)")
|
||||
if inp.lower() == "y":
|
||||
if inp.lower() == 'y':
|
||||
dest.mkdir(parents=True)
|
||||
else:
|
||||
typer.echo("Aborted.", err=True)
|
||||
@@ -81,12 +77,12 @@ def export_all(dest: Annotated[Path, typer.Argument(help="Destination path to ex
|
||||
exported_count = 0
|
||||
for user in tqdm(all_users):
|
||||
try:
|
||||
usr_chatlog = export_chathistory(user["arg"])
|
||||
|
||||
usr_chatlog = export_chathistory(user['arg'])
|
||||
|
||||
# Only write file if there are messages
|
||||
if len(usr_chatlog) > 0:
|
||||
out_path = dest / get_safe_path((user["title"] or "") + "-" + user["arg"] + ".json")
|
||||
with open(out_path, "w", encoding="utf-8") as f:
|
||||
out_path = dest/get_safe_path((user['title'] or "")+"-"+user['arg']+'.json')
|
||||
with open(out_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(usr_chatlog, f, ensure_ascii=False, indent=2)
|
||||
exported_count += 1
|
||||
except Exception as e:
|
||||
@@ -95,42 +91,23 @@ def export_all(dest: Annotated[Path, typer.Argument(help="Destination path to ex
|
||||
|
||||
print(f"Exported {exported_count} users' chat history to {dest} in json.")
|
||||
|
||||
|
||||
@app.command()
|
||||
def export_sqlite(
|
||||
dest: Annotated[Path, typer.Argument(help="Destination path to export to.")] = Path(
|
||||
"chatlog.db"
|
||||
),
|
||||
):
|
||||
def export_sqlite(dest: Annotated[Path, typer.Argument(help="Destination path to export to.")] = Path("chatlog.db")):
|
||||
"""
|
||||
Export all users' chat history to a sqlite database.
|
||||
"""
|
||||
connection = sqlite3.connect(dest)
|
||||
cursor = connection.cursor()
|
||||
cursor.execute(
|
||||
"CREATE TABLE IF NOT EXISTS chatlog (id INTEGER PRIMARY KEY AUTOINCREMENT, with_id TEXT, from_user TEXT, to_user TEXT, message TEXT, timest DATETIME, auxiliary TEXT)"
|
||||
)
|
||||
cursor.execute("CREATE TABLE IF NOT EXISTS chatlog (id INTEGER PRIMARY KEY AUTOINCREMENT, with_id TEXT, from_user TEXT, to_user TEXT, message TEXT, timest DATETIME, auxiliary TEXT)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS chatlog_with_id_index ON chatlog (with_id)")
|
||||
cursor.execute("CREATE TABLE iF NOT EXISTS users (id TEXT PRIMARY KEY, name TEXT)")
|
||||
|
||||
all_users = requests.get("http://localhost:48065/wechat/allcontacts").json()
|
||||
for user in tqdm(all_users):
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO users (id, name) VALUES (?, ?)", (user["arg"], user["title"])
|
||||
)
|
||||
usr_chatlog = export_chathistory(user["arg"])
|
||||
cursor.execute("INSERT OR IGNORE INTO users (id, name) VALUES (?, ?)", (user['arg'], user['title']))
|
||||
usr_chatlog = export_chathistory(user['arg'])
|
||||
for msg in usr_chatlog:
|
||||
cursor.execute(
|
||||
"INSERT INTO chatlog (with_id, from_user, to_user, message, timest, auxiliary) VALUES (?, ?, ?, ?, ?, ?)",
|
||||
(
|
||||
user["arg"],
|
||||
msg["fromUser"],
|
||||
msg["toUser"],
|
||||
msg["message"],
|
||||
msg["createTime"],
|
||||
str(msg["content"]),
|
||||
),
|
||||
)
|
||||
cursor.execute("INSERT INTO chatlog (with_id, from_user, to_user, message, timest, auxiliary) VALUES (?, ?, ?, ?, ?, ?)", (user['arg'], msg['fromUser'], msg['toUser'], msg['message'], msg['createTime'], str(msg['content'])))
|
||||
connection.commit()
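# --- Illustrative sketch, not part of the diff above ---
# Reading back the chatlog.db produced by export_sqlite. The database path and
# example contact name are placeholder assumptions; the table and column names
# come from the CREATE TABLE statements in the command itself.
import sqlite3

def recent_messages_with(contact_name: str, db_path: str = "chatlog.db", limit: int = 20):
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    rows = cursor.execute(
        """
        SELECT c.from_user, c.to_user, c.message, c.timest
        FROM chatlog AS c
        JOIN users AS u ON u.id = c.with_id
        WHERE u.name = ?
        ORDER BY c.timest DESC
        LIMIT ?
        """,
        (contact_name, limit),
    ).fetchall()
    connection.close()
    return rows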
@@ -25,21 +25,14 @@ dependencies = [
|
||||
"requests>=2.25.0",
|
||||
"sentence-transformers>=2.2.0",
|
||||
"openai>=1.0.0",
|
||||
# PDF parsing dependencies - essential for document processing
|
||||
"PyPDF2>=3.0.0",
|
||||
"pdfplumber>=0.11.0",
|
||||
"pymupdf>=1.26.0",
|
||||
"pypdfium2>=4.30.0",
|
||||
# LlamaIndex core and readers - updated versions
|
||||
"llama-index>=0.12.44",
|
||||
"llama-index-readers-file>=0.4.0", # Essential for PDF parsing
|
||||
"llama-index-readers-docling",
|
||||
"llama-index-node-parser-docling",
|
||||
"llama-index-vector-stores-faiss>=0.4.0",
|
||||
"llama-index-embeddings-huggingface>=0.5.5",
|
||||
# Other dependencies
|
||||
"ipykernel==6.29.5",
|
||||
"msgpack>=1.1.1",
|
||||
"llama-index-vector-stores-faiss>=0.4.0",
|
||||
"llama-index-embeddings-huggingface>=0.5.5",
|
||||
"mlx>=0.26.3; sys_platform == 'darwin'",
|
||||
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
|
||||
"psutil>=5.8.0",
|
||||
@@ -59,14 +52,6 @@ diskann = [
|
||||
"leann-backend-diskann",
|
||||
]
|
||||
|
||||
# Add a new optional dependency group for document processing
|
||||
documents = [
|
||||
"beautifulsoup4>=4.13.0", # For HTML parsing
|
||||
"python-docx>=0.8.11", # For Word documents
|
||||
"openpyxl>=3.1.0", # For Excel files
|
||||
"pandas>=2.2.0", # For data processing
|
||||
]
|
||||
|
||||
[tool.setuptools]
|
||||
py-modules = []
|
||||
|
||||
@@ -76,49 +61,28 @@ leann-core = { path = "packages/leann-core", editable = true }
|
||||
leann-backend-diskann = { path = "packages/leann-backend-diskann", editable = true }
|
||||
leann-backend-hnsw = { path = "packages/leann-backend-hnsw", editable = true }
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py310"
|
||||
line-length = 100
|
||||
extend-exclude = [
|
||||
"third_party",
|
||||
"*.egg-info",
|
||||
"__pycache__",
|
||||
".git",
|
||||
".venv",
|
||||
]
|
||||
[tool.cibuildwheel]
|
||||
# Skip 32-bit and PyPy builds
|
||||
skip = "*-win32 *-manylinux_i686 pp* *musllinux*"
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
"E", # pycodestyle errors
|
||||
"W", # pycodestyle warnings
|
||||
"F", # pyflakes
|
||||
"I", # isort
|
||||
"B", # flake8-bugbear
|
||||
"C4", # flake8-comprehensions
|
||||
"UP", # pyupgrade
|
||||
"N", # pep8-naming
|
||||
"RUF", # ruff-specific rules
|
||||
]
|
||||
ignore = [
|
||||
"E501", # line too long (handled by formatter)
|
||||
"B008", # do not perform function calls in argument defaults
|
||||
"B904", # raise without from
|
||||
"N812", # lowercase imported as non-lowercase
|
||||
"N806", # variable in function should be lowercase
|
||||
"RUF012", # mutable class attributes should be annotated with typing.ClassVar
|
||||
]
|
||||
# Use manylinux_2_35 for Colab compatibility while keeping modern features
|
||||
manylinux-x86_64-image = "manylinux_2_35"
|
||||
manylinux-aarch64-image = "manylinux_2_35"
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"test/**/*.py" = ["E402"] # module level import not at top of file (common in tests)
|
||||
"examples/**/*.py" = ["E402"] # module level import not at top of file (common in examples)
|
||||
# Linux system dependencies
|
||||
[tool.cibuildwheel.linux]
|
||||
before-all = """
|
||||
yum install -y epel-release
|
||||
yum install -y gcc-c++ boost-devel zeromq-devel openblas-devel cmake3 python3-devel
|
||||
ln -sf /usr/bin/cmake3 /usr/bin/cmake
|
||||
"""
|
||||
|
||||
[tool.ruff.format]
|
||||
quote-style = "double"
|
||||
indent-style = "space"
|
||||
skip-magic-trailing-comma = false
|
||||
line-ending = "auto"
|
||||
# macOS system dependencies
|
||||
[tool.cibuildwheel.macos]
|
||||
before-all = "brew install boost zeromq openblas cmake libomp"
|
||||
# Set minimum macOS version
|
||||
environment = { MACOSX_DEPLOYMENT_TARGET = "11.0", CMAKE_OSX_DEPLOYMENT_TARGET = "11.0" }
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"ruff>=0.12.4",
|
||||
]
|
||||
# Environment variables configuration
|
||||
[tool.cibuildwheel.environment]
|
||||
CMAKE_BUILD_PARALLEL_LEVEL = "8"
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import os
|
||||
|
||||
from leann.api import LeannBuilder, LeannChat
|
||||
from leann.api import LeannBuilder, LeannSearcher, LeannChat
|
||||
|
||||
# Define the path for our new MLX-based index
|
||||
INDEX_PATH = "./mlx_diskann_index/leann"
|
||||
@@ -39,5 +38,7 @@ chat = LeannChat(index_path=INDEX_PATH)
|
||||
# add query
|
||||
query = "MLX is an array framework for machine learning on Apple silicon."
|
||||
print(f"Query: {query}")
|
||||
response = chat.ask(query, top_k=3, recompute_beighbor_embeddings=True, complexity=3, beam_width=1)
|
||||
response = chat.ask(
|
||||
query, top_k=3, recompute_beighbor_embeddings=True, complexity=3, beam_width=1
|
||||
)
|
||||
print(f"Response: {response}")
|
||||
|
||||
@@ -1,84 +1,76 @@
|
||||
import email
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from llama_index.core import Document, VectorStoreIndex
|
||||
import email
|
||||
from pathlib import Path
|
||||
from typing import List, Any
|
||||
from llama_index.core import VectorStoreIndex, Document
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
|
||||
|
||||
class EmlxReader(BaseReader):
|
||||
"""
|
||||
Apple Mail .emlx file reader.
|
||||
|
||||
|
||||
Reads individual .emlx files from Apple Mail's storage format.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize."""
|
||||
pass
|
||||
|
||||
def load_data(self, input_dir: str, **load_kwargs: Any) -> list[Document]:
|
||||
|
||||
def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]:
|
||||
"""
|
||||
Load data from the input directory containing .emlx files.
|
||||
|
||||
|
||||
Args:
|
||||
input_dir: Directory containing .emlx files
|
||||
**load_kwargs:
|
||||
max_count (int): Maximum amount of messages to read.
|
||||
"""
|
||||
docs: list[Document] = []
|
||||
max_count = load_kwargs.get("max_count", 1000)
|
||||
docs: List[Document] = []
|
||||
max_count = load_kwargs.get('max_count', 1000)
|
||||
count = 0
|
||||
|
||||
|
||||
# Walk through the directory recursively
|
||||
for dirpath, dirnames, filenames in os.walk(input_dir):
|
||||
# Skip hidden directories
|
||||
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
|
||||
|
||||
|
||||
for filename in filenames:
|
||||
if count >= max_count:
|
||||
break
|
||||
|
||||
|
||||
if filename.endswith(".emlx"):
|
||||
filepath = os.path.join(dirpath, filename)
|
||||
try:
|
||||
# Read the .emlx file
|
||||
with open(filepath, encoding="utf-8", errors="ignore") as f:
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
content = f.read()
|
||||
|
||||
|
||||
# .emlx files have a length prefix followed by the email content
|
||||
# The first line contains the length, followed by the email
|
||||
lines = content.split("\n", 1)
|
||||
lines = content.split('\n', 1)
|
||||
if len(lines) >= 2:
|
||||
email_content = lines[1]
|
||||
|
||||
|
||||
# Parse the email using Python's email module
|
||||
try:
|
||||
msg = email.message_from_string(email_content)
|
||||
|
||||
|
||||
# Extract email metadata
|
||||
subject = msg.get("Subject", "No Subject")
|
||||
from_addr = msg.get("From", "Unknown")
|
||||
to_addr = msg.get("To", "Unknown")
|
||||
date = msg.get("Date", "Unknown")
|
||||
|
||||
subject = msg.get('Subject', 'No Subject')
|
||||
from_addr = msg.get('From', 'Unknown')
|
||||
to_addr = msg.get('To', 'Unknown')
|
||||
date = msg.get('Date', 'Unknown')
|
||||
|
||||
# Extract email body
|
||||
body = ""
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
if (
|
||||
part.get_content_type() == "text/plain"
|
||||
or part.get_content_type() == "text/html"
|
||||
):
|
||||
body += part.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
if part.get_content_type() == "text/plain" or part.get_content_type() == "text/html":
|
||||
body += part.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
# break
|
||||
else:
|
||||
body = msg.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
|
||||
body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
|
||||
# Create document content
|
||||
doc_content = f"""
|
||||
From: {from_addr}
|
||||
@@ -88,38 +80,32 @@ Date: {date}
|
||||
|
||||
{body}
|
||||
"""
|
||||
|
||||
|
||||
# Create metadata
|
||||
metadata = {
|
||||
"file_path": filepath,
|
||||
"subject": subject,
|
||||
"from": from_addr,
|
||||
"to": to_addr,
|
||||
"date": date,
|
||||
"filename": filename,
|
||||
'file_path': filepath,
|
||||
'subject': subject,
|
||||
'from': from_addr,
|
||||
'to': to_addr,
|
||||
'date': date,
|
||||
'filename': filename
|
||||
}
|
||||
if count == 0:
|
||||
print("--------------------------------")
|
||||
print("dir path", dirpath)
|
||||
print('dir path', dirpath)
|
||||
print(metadata)
|
||||
print(doc_content)
|
||||
print("--------------------------------")
|
||||
body = []
|
||||
body=[]
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
print(
|
||||
"-------------------------------- get content type -------------------------------"
|
||||
)
|
||||
print("-------------------------------- get content type -------------------------------")
|
||||
print(part.get_content_type())
|
||||
print(part)
|
||||
# body.append(part.get_payload(decode=True).decode('utf-8', errors='ignore'))
|
||||
print(
|
||||
"-------------------------------- get content type -------------------------------"
|
||||
)
|
||||
print("-------------------------------- get content type -------------------------------")
|
||||
else:
|
||||
body = msg.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
print(body)
|
||||
|
||||
print(body)
|
||||
@@ -127,23 +113,22 @@ Date: {date}
|
||||
doc = Document(text=doc_content, metadata=metadata)
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"!!!!!!! Error parsing email from {filepath}: {e} !!!!!!!!")
|
||||
continue
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"!!!!!!! Error reading file !!!!!!!! {filepath}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
print(f"Loaded {len(docs)} email documents")
|
||||
return docs
|
||||
|
||||
|
||||
# Use the custom EmlxReader instead of MboxReader
|
||||
documents = EmlxReader().load_data(
|
||||
"/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages",
|
||||
max_count=1000,
|
||||
"/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages",
|
||||
max_count=1000
|
||||
) # Returns list of documents
|
||||
|
||||
# Configure the index with larger chunk size to handle long metadata
|
||||
@@ -153,9 +138,10 @@ from llama_index.core.node_parser import SentenceSplitter
|
||||
text_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=200)
|
||||
|
||||
index = VectorStoreIndex.from_documents(
|
||||
documents, transformations=[text_splitter]
|
||||
documents,
|
||||
transformations=[text_splitter]
|
||||
) # Initialize index with documents
|
||||
|
||||
query_engine = index.as_query_engine()
|
||||
res = query_engine.query("Hows Berkeley Graduate Student Instructor")
|
||||
print(res)
|
||||
print(res)
|
||||
@@ -1,82 +1,77 @@
|
||||
import email
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from llama_index.core import Document, StorageContext, VectorStoreIndex
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
import email
|
||||
from pathlib import Path
|
||||
from typing import List, Any
|
||||
from llama_index.core import VectorStoreIndex, Document, StorageContext
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
class EmlxReader(BaseReader):
|
||||
"""
|
||||
Apple Mail .emlx file reader.
|
||||
|
||||
|
||||
Reads individual .emlx files from Apple Mail's storage format.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize."""
|
||||
pass
|
||||
|
||||
def load_data(self, input_dir: str, **load_kwargs: Any) -> list[Document]:
|
||||
|
||||
def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]:
|
||||
"""
|
||||
Load data from the input directory containing .emlx files.
|
||||
|
||||
|
||||
Args:
|
||||
input_dir: Directory containing .emlx files
|
||||
**load_kwargs:
|
||||
max_count (int): Maximum amount of messages to read.
|
||||
"""
|
||||
docs: list[Document] = []
|
||||
max_count = load_kwargs.get("max_count", 1000)
|
||||
docs: List[Document] = []
|
||||
max_count = load_kwargs.get('max_count', 1000)
|
||||
count = 0
|
||||
|
||||
|
||||
# Walk through the directory recursively
|
||||
for dirpath, dirnames, filenames in os.walk(input_dir):
|
||||
# Skip hidden directories
|
||||
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
|
||||
|
||||
|
||||
for filename in filenames:
|
||||
if count >= max_count:
|
||||
break
|
||||
|
||||
|
||||
if filename.endswith(".emlx"):
|
||||
filepath = os.path.join(dirpath, filename)
|
||||
try:
|
||||
# Read the .emlx file
|
||||
with open(filepath, encoding="utf-8", errors="ignore") as f:
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
content = f.read()
|
||||
|
||||
|
||||
# .emlx files have a length prefix followed by the email content
|
||||
# The first line contains the length, followed by the email
|
||||
lines = content.split("\n", 1)
|
||||
lines = content.split('\n', 1)
|
||||
if len(lines) >= 2:
|
||||
email_content = lines[1]
|
||||
|
||||
|
||||
# Parse the email using Python's email module
|
||||
try:
|
||||
msg = email.message_from_string(email_content)
|
||||
|
||||
|
||||
# Extract email metadata
|
||||
subject = msg.get("Subject", "No Subject")
|
||||
from_addr = msg.get("From", "Unknown")
|
||||
to_addr = msg.get("To", "Unknown")
|
||||
date = msg.get("Date", "Unknown")
|
||||
|
||||
subject = msg.get('Subject', 'No Subject')
|
||||
from_addr = msg.get('From', 'Unknown')
|
||||
to_addr = msg.get('To', 'Unknown')
|
||||
date = msg.get('Date', 'Unknown')
|
||||
|
||||
# Extract email body
|
||||
body = ""
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
if part.get_content_type() == "text/plain":
|
||||
body = part.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
body = part.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
break
|
||||
else:
|
||||
body = msg.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
|
||||
body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
|
||||
# Create document content
|
||||
doc_content = f"""
|
||||
From: {from_addr}
|
||||
@@ -86,96 +81,97 @@ Date: {date}
|
||||
|
||||
{body}
|
||||
"""
|
||||
|
||||
|
||||
# Create metadata
|
||||
metadata = {
|
||||
"file_path": filepath,
|
||||
"subject": subject,
|
||||
"from": from_addr,
|
||||
"to": to_addr,
|
||||
"date": date,
|
||||
"filename": filename,
|
||||
'file_path': filepath,
|
||||
'subject': subject,
|
||||
'from': from_addr,
|
||||
'to': to_addr,
|
||||
'date': date,
|
||||
'filename': filename
|
||||
}
|
||||
|
||||
|
||||
doc = Document(text=doc_content, metadata=metadata)
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error parsing email from {filepath}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading file {filepath}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
print(f"Loaded {len(docs)} email documents")
|
||||
return docs
|
||||
|
||||
|
||||
def create_and_save_index(mail_path: str, save_dir: str = "mail_index", max_count: int = 1000):
|
||||
"""
|
||||
Create the index from mail data and save it to disk.
|
||||
|
||||
|
||||
Args:
|
||||
mail_path: Path to the mail directory
|
||||
save_dir: Directory to save the index
|
||||
max_count: Maximum number of emails to process
|
||||
"""
|
||||
print("Creating index from mail data...")
|
||||
|
||||
|
||||
# Load documents
|
||||
documents = EmlxReader().load_data(mail_path, max_count=max_count)
|
||||
|
||||
|
||||
if not documents:
|
||||
print("No documents loaded. Exiting.")
|
||||
return None
|
||||
|
||||
|
||||
# Create text splitter
|
||||
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=0)
|
||||
|
||||
|
||||
# Create index
|
||||
index = VectorStoreIndex.from_documents(documents, transformations=[text_splitter])
|
||||
|
||||
index = VectorStoreIndex.from_documents(
|
||||
documents,
|
||||
transformations=[text_splitter]
|
||||
)
|
||||
|
||||
# Save the index
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
index.storage_context.persist(persist_dir=save_dir)
|
||||
print(f"Index saved to {save_dir}")
|
||||
|
||||
|
||||
return index
|
||||
|
||||
|
||||
def load_index(save_dir: str = "mail_index"):
|
||||
"""
|
||||
Load the saved index from disk.
|
||||
|
||||
|
||||
Args:
|
||||
save_dir: Directory where the index is saved
|
||||
|
||||
|
||||
Returns:
|
||||
Loaded index or None if loading fails
|
||||
"""
|
||||
try:
|
||||
# Load storage context
|
||||
storage_context = StorageContext.from_defaults(persist_dir=save_dir)
|
||||
|
||||
|
||||
# Load index
|
||||
index = VectorStoreIndex.from_vector_store(
|
||||
storage_context.vector_store, storage_context=storage_context
|
||||
storage_context.vector_store,
|
||||
storage_context=storage_context
|
||||
)
|
||||
|
||||
|
||||
print(f"Index loaded from {save_dir}")
|
||||
return index
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error loading index: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def query_index(index, query: str):
|
||||
"""
|
||||
Query the loaded index.
|
||||
|
||||
|
||||
Args:
|
||||
index: The loaded index
|
||||
query: The query string
|
||||
@@ -183,17 +179,16 @@ def query_index(index, query: str):
|
||||
if index is None:
|
||||
print("No index available for querying.")
|
||||
return
|
||||
|
||||
|
||||
query_engine = index.as_query_engine()
|
||||
response = query_engine.query(query)
|
||||
print(f"Query: {query}")
|
||||
print(f"Response: {response}")
|
||||
|
||||
|
||||
def main():
|
||||
mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages"
|
||||
save_dir = "mail_index"
|
||||
|
||||
|
||||
# Check if index already exists
|
||||
if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")):
|
||||
print("Loading existing index...")
|
||||
@@ -201,19 +196,18 @@ def main():
|
||||
else:
|
||||
print("Creating new index...")
|
||||
index = create_and_save_index(mail_path, save_dir, max_count=1000)
|
||||
|
||||
|
||||
if index:
|
||||
# Example queries
|
||||
queries = [
|
||||
"Hows Berkeley Graduate Student Instructor",
|
||||
"What emails mention GSR appointments?",
|
||||
"Find emails about deadlines",
|
||||
"Find emails about deadlines"
|
||||
]
|
||||
|
||||
|
||||
for query in queries:
|
||||
print("\n" + "=" * 50)
|
||||
print("\n" + "="*50)
|
||||
query_index(index, query)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
@@ -1,82 +1,77 @@
|
||||
import email
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from llama_index.core import Document, StorageContext, VectorStoreIndex
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
import email
|
||||
from pathlib import Path
|
||||
from typing import List, Any
|
||||
from llama_index.core import VectorStoreIndex, Document, StorageContext
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
class EmlxReader(BaseReader):
|
||||
"""
|
||||
Apple Mail .emlx file reader with reduced metadata.
|
||||
|
||||
|
||||
Reads individual .emlx files from Apple Mail's storage format.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize."""
|
||||
pass
|
||||
|
||||
def load_data(self, input_dir: str, **load_kwargs: Any) -> list[Document]:
|
||||
|
||||
def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]:
|
||||
"""
|
||||
Load data from the input directory containing .emlx files.
|
||||
|
||||
|
||||
Args:
|
||||
input_dir: Directory containing .emlx files
|
||||
**load_kwargs:
|
||||
max_count (int): Maximum amount of messages to read.
|
||||
"""
|
||||
docs: list[Document] = []
|
||||
max_count = load_kwargs.get("max_count", 1000)
|
||||
docs: List[Document] = []
|
||||
max_count = load_kwargs.get('max_count', 1000)
|
||||
count = 0
|
||||
|
||||
|
||||
# Walk through the directory recursively
|
||||
for dirpath, dirnames, filenames in os.walk(input_dir):
|
||||
# Skip hidden directories
|
||||
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
|
||||
|
||||
|
||||
for filename in filenames:
|
||||
if count >= max_count:
|
||||
break
|
||||
|
||||
|
||||
if filename.endswith(".emlx"):
|
||||
filepath = os.path.join(dirpath, filename)
|
||||
try:
|
||||
# Read the .emlx file
|
||||
with open(filepath, encoding="utf-8", errors="ignore") as f:
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
content = f.read()
|
||||
|
||||
|
||||
# .emlx files have a length prefix followed by the email content
|
||||
# The first line contains the length, followed by the email
|
||||
lines = content.split("\n", 1)
|
||||
lines = content.split('\n', 1)
|
||||
if len(lines) >= 2:
|
||||
email_content = lines[1]
|
||||
|
||||
|
||||
# Parse the email using Python's email module
|
||||
try:
|
||||
msg = email.message_from_string(email_content)
|
||||
|
||||
|
||||
# Extract email metadata
|
||||
subject = msg.get("Subject", "No Subject")
|
||||
from_addr = msg.get("From", "Unknown")
|
||||
to_addr = msg.get("To", "Unknown")
|
||||
date = msg.get("Date", "Unknown")
|
||||
|
||||
subject = msg.get('Subject', 'No Subject')
|
||||
from_addr = msg.get('From', 'Unknown')
|
||||
to_addr = msg.get('To', 'Unknown')
|
||||
date = msg.get('Date', 'Unknown')
|
||||
|
||||
# Extract email body
|
||||
body = ""
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
if part.get_content_type() == "text/plain":
|
||||
body = part.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
body = part.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
break
|
||||
else:
|
||||
body = msg.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
|
||||
body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
|
||||
# Create document content with metadata embedded in text
|
||||
doc_content = f"""
|
||||
From: {from_addr}
|
||||
@@ -86,96 +81,95 @@ Date: {date}
|
||||
|
||||
{body}
|
||||
"""
|
||||
|
||||
|
||||
# Create minimal metadata (only essential info)
|
||||
metadata = {
|
||||
"subject": subject[:50], # Truncate subject
|
||||
"from": from_addr[:30], # Truncate from
|
||||
"date": date[:20], # Truncate date
|
||||
"filename": filename, # Keep filename
|
||||
'subject': subject[:50], # Truncate subject
|
||||
'from': from_addr[:30], # Truncate from
|
||||
'date': date[:20], # Truncate date
|
||||
'filename': filename # Keep filename
|
||||
}
|
||||
|
||||
|
||||
doc = Document(text=doc_content, metadata=metadata)
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error parsing email from {filepath}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading file {filepath}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
print(f"Loaded {len(docs)} email documents")
|
||||
return docs
|
||||
|
||||
|
||||
def create_and_save_index(
|
||||
mail_path: str, save_dir: str = "mail_index_small", max_count: int = 1000
|
||||
):
|
||||
def create_and_save_index(mail_path: str, save_dir: str = "mail_index_small", max_count: int = 1000):
|
||||
"""
|
||||
Create the index from mail data and save it to disk.
|
||||
|
||||
|
||||
Args:
|
||||
mail_path: Path to the mail directory
|
||||
save_dir: Directory to save the index
|
||||
max_count: Maximum number of emails to process
|
||||
"""
|
||||
print("Creating index from mail data with small chunks...")
|
||||
|
||||
|
||||
# Load documents
|
||||
documents = EmlxReader().load_data(mail_path, max_count=max_count)
|
||||
|
||||
|
||||
if not documents:
|
||||
print("No documents loaded. Exiting.")
|
||||
return None
|
||||
|
||||
|
||||
# Create text splitter with small chunk size
|
||||
text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50)
|
||||
|
||||
|
||||
# Create index
|
||||
index = VectorStoreIndex.from_documents(documents, transformations=[text_splitter])
|
||||
|
||||
index = VectorStoreIndex.from_documents(
|
||||
documents,
|
||||
transformations=[text_splitter]
|
||||
)
|
||||
|
||||
# Save the index
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
index.storage_context.persist(persist_dir=save_dir)
|
||||
print(f"Index saved to {save_dir}")
|
||||
|
||||
|
||||
return index
|
||||
|
||||
|
||||
def load_index(save_dir: str = "mail_index_small"):
|
||||
"""
|
||||
Load the saved index from disk.
|
||||
|
||||
|
||||
Args:
|
||||
save_dir: Directory where the index is saved
|
||||
|
||||
|
||||
Returns:
|
||||
Loaded index or None if loading fails
|
||||
"""
|
||||
try:
|
||||
# Load storage context
|
||||
storage_context = StorageContext.from_defaults(persist_dir=save_dir)
|
||||
|
||||
|
||||
# Load index
|
||||
index = VectorStoreIndex.from_vector_store(
|
||||
storage_context.vector_store, storage_context=storage_context
|
||||
storage_context.vector_store,
|
||||
storage_context=storage_context
|
||||
)
|
||||
|
||||
|
||||
print(f"Index loaded from {save_dir}")
|
||||
return index
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error loading index: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def query_index(index, query: str):
|
||||
"""
|
||||
Query the loaded index.
|
||||
|
||||
|
||||
Args:
|
||||
index: The loaded index
|
||||
query: The query string
|
||||
@@ -183,17 +177,16 @@ def query_index(index, query: str):
|
||||
if index is None:
|
||||
print("No index available for querying.")
|
||||
return
|
||||
|
||||
|
||||
query_engine = index.as_query_engine()
|
||||
response = query_engine.query(query)
|
||||
print(f"Query: {query}")
|
||||
print(f"Response: {response}")
|
||||
|
||||
|
||||
def main():
|
||||
mail_path = "/Users/yichuan/Library/Mail/V10/0FCA0879-FD8C-4B7E-83BF-FDDA930791C5/[Gmail].mbox/All Mail.mbox/78BA5BE1-8819-4F9A-9613-EB63772F1DD0/Data/9/Messages"
|
||||
save_dir = "mail_index_small"
|
||||
|
||||
|
||||
# Check if index already exists
|
||||
if os.path.exists(save_dir) and os.path.exists(os.path.join(save_dir, "vector_store.json")):
|
||||
print("Loading existing index...")
|
||||
@@ -201,19 +194,18 @@ def main():
|
||||
else:
|
||||
print("Creating new index...")
|
||||
index = create_and_save_index(mail_path, save_dir, max_count=1000)
|
||||
|
||||
|
||||
if index:
|
||||
# Example queries
|
||||
queries = [
|
||||
"Hows Berkeley Graduate Student Instructor",
|
||||
"What emails mention GSR appointments?",
|
||||
"Find emails about deadlines",
|
||||
"Find emails about deadlines"
|
||||
]
|
||||
|
||||
|
||||
for query in queries:
|
||||
print("\n" + "=" * 50)
|
||||
print("\n" + "="*50)
|
||||
query_index(index, query)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
@@ -1,94 +1,89 @@
|
||||
import email
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
from llama_index.core import Document, VectorStoreIndex
|
||||
import email
|
||||
from pathlib import Path
|
||||
from typing import List, Any
|
||||
from llama_index.core import VectorStoreIndex, Document
|
||||
from llama_index.core.readers.base import BaseReader
|
||||
|
||||
|
||||
class EmlxReader(BaseReader):
|
||||
"""
|
||||
Apple Mail .emlx file reader.
|
||||
|
||||
|
||||
Reads individual .emlx files from Apple Mail's storage format.
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize."""
|
||||
pass
|
||||
|
||||
def load_data(self, input_dir: str, **load_kwargs: Any) -> list[Document]:
|
||||
|
||||
def load_data(self, input_dir: str, **load_kwargs: Any) -> List[Document]:
|
||||
"""
|
||||
Load data from the input directory containing .emlx files.
|
||||
|
||||
|
||||
Args:
|
||||
input_dir: Directory containing .emlx files
|
||||
**load_kwargs:
|
||||
max_count (int): Maximum amount of messages to read.
|
||||
"""
|
||||
docs: list[Document] = []
|
||||
max_count = load_kwargs.get("max_count", 1000)
|
||||
docs: List[Document] = []
|
||||
max_count = load_kwargs.get('max_count', 1000)
|
||||
count = 0
|
||||
|
||||
|
||||
# Check if directory exists and is accessible
|
||||
if not os.path.exists(input_dir):
|
||||
print(f"Error: Directory '{input_dir}' does not exist")
|
||||
return docs
|
||||
|
||||
|
||||
if not os.access(input_dir, os.R_OK):
|
||||
print(f"Error: Directory '{input_dir}' is not accessible (permission denied)")
|
||||
print("This is likely due to macOS security restrictions on Mail app data")
|
||||
return docs
|
||||
|
||||
|
||||
print(f"Scanning directory: {input_dir}")
|
||||
|
||||
|
||||
# Walk through the directory recursively
|
||||
for dirpath, dirnames, filenames in os.walk(input_dir):
|
||||
# Skip hidden directories
|
||||
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
|
||||
|
||||
|
||||
for filename in filenames:
|
||||
if count >= max_count:
|
||||
break
|
||||
|
||||
|
||||
if filename.endswith(".emlx"):
|
||||
filepath = os.path.join(dirpath, filename)
|
||||
print(f"Found .emlx file: {filepath}")
|
||||
try:
|
||||
# Read the .emlx file
|
||||
with open(filepath, encoding="utf-8", errors="ignore") as f:
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
content = f.read()
|
||||
|
||||
|
||||
# .emlx files have a length prefix followed by the email content
|
||||
# The first line contains the length, followed by the email
|
||||
lines = content.split("\n", 1)
|
||||
lines = content.split('\n', 1)
|
||||
if len(lines) >= 2:
|
||||
email_content = lines[1]
|
||||
|
||||
|
||||
# Parse the email using Python's email module
|
||||
try:
|
||||
msg = email.message_from_string(email_content)
|
||||
|
||||
|
||||
# Extract email metadata
|
||||
subject = msg.get("Subject", "No Subject")
|
||||
from_addr = msg.get("From", "Unknown")
|
||||
to_addr = msg.get("To", "Unknown")
|
||||
date = msg.get("Date", "Unknown")
|
||||
|
||||
subject = msg.get('Subject', 'No Subject')
|
||||
from_addr = msg.get('From', 'Unknown')
|
||||
to_addr = msg.get('To', 'Unknown')
|
||||
date = msg.get('Date', 'Unknown')
|
||||
|
||||
# Extract email body
|
||||
body = ""
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
if part.get_content_type() == "text/plain":
|
||||
body = part.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
body = part.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
break
|
||||
else:
|
||||
body = msg.get_payload(decode=True).decode(
|
||||
"utf-8", errors="ignore"
|
||||
)
|
||||
|
||||
body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')
|
||||
|
||||
# Create document content
|
||||
doc_content = f"""
|
||||
From: {from_addr}
|
||||
@@ -98,57 +93,55 @@ Date: {date}
|
||||
|
||||
{body}
|
||||
"""
|
||||
|
||||
|
||||
# Create metadata
|
||||
metadata = {
|
||||
"file_path": filepath,
|
||||
"subject": subject,
|
||||
"from": from_addr,
|
||||
"to": to_addr,
|
||||
"date": date,
|
||||
"filename": filename,
|
||||
'file_path': filepath,
|
||||
'subject': subject,
|
||||
'from': from_addr,
|
||||
'to': to_addr,
|
||||
'date': date,
|
||||
'filename': filename
|
||||
}
|
||||
|
||||
|
||||
doc = Document(text=doc_content, metadata=metadata)
|
||||
docs.append(doc)
|
||||
count += 1
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error parsing email from {filepath}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading file {filepath}: {e}")
|
||||
continue
|
||||
|
||||
|
||||
print(f"Loaded {len(docs)} email documents")
|
||||
return docs
|
||||
|
||||
|
||||
def main():
|
||||
# Use the current directory where the sample.emlx file is located
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
print("Testing EmlxReader with sample .emlx file...")
|
||||
print(f"Scanning directory: {current_dir}")
|
||||
|
||||
|
||||
# Use the custom EmlxReader
|
||||
documents = EmlxReader().load_data(current_dir, max_count=1000)
|
||||
|
||||
|
||||
if not documents:
|
||||
print("No documents loaded. Make sure sample.emlx exists in the examples directory.")
|
||||
return
|
||||
|
||||
|
||||
print(f"\nSuccessfully loaded {len(documents)} document(s)")
|
||||
|
||||
|
||||
# Initialize index with documents
|
||||
index = VectorStoreIndex.from_documents(documents)
|
||||
query_engine = index.as_query_engine()
|
||||
|
||||
|
||||
print("\nTesting query: 'Hows Berkeley Graduate Student Instructor'")
|
||||
res = query_engine.query("Hows Berkeley Graduate Student Instructor")
|
||||
print(f"Response: {res}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
@@ -2,20 +2,20 @@
|
||||
|
||||
import argparse
|
||||
import time
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoModel, BitsAndBytesConfig
|
||||
|
||||
from tqdm import tqdm
|
||||
from contextlib import contextmanager
|
||||
|
||||
@dataclass
|
||||
class BenchmarkConfig:
|
||||
model_path: str
|
||||
batch_sizes: list[int]
|
||||
batch_sizes: List[int]
|
||||
seq_length: int
|
||||
num_runs: int
|
||||
use_fp16: bool = True
|
||||
@@ -28,44 +28,47 @@ class BenchmarkConfig:
|
||||
|
||||
class GraphContainer:
|
||||
"""Container for managing graphs for different batch sizes (CUDA graphs on NVIDIA, regular on others)."""
|
||||
|
||||
|
||||
def __init__(self, model: nn.Module, seq_length: int):
|
||||
self.model = model
|
||||
self.seq_length = seq_length
|
||||
self.graphs: dict[int, GraphWrapper] = {}
|
||||
|
||||
def get_or_create(self, batch_size: int) -> "GraphWrapper":
|
||||
self.graphs: Dict[int, 'GraphWrapper'] = {}
|
||||
|
||||
def get_or_create(self, batch_size: int) -> 'GraphWrapper':
|
||||
if batch_size not in self.graphs:
|
||||
self.graphs[batch_size] = GraphWrapper(self.model, batch_size, self.seq_length)
|
||||
self.graphs[batch_size] = GraphWrapper(
|
||||
self.model, batch_size, self.seq_length
|
||||
)
|
||||
return self.graphs[batch_size]
|
||||
|
||||
|
||||
class GraphWrapper:
|
||||
"""Wrapper for graph capture and replay (CUDA graphs on NVIDIA, regular on others)."""
|
||||
|
||||
|
||||
def __init__(self, model: nn.Module, batch_size: int, seq_length: int):
|
||||
self.model = model
|
||||
self.device = self._get_device()
|
||||
self.static_input = self._create_random_batch(batch_size, seq_length)
|
||||
self.static_attention_mask = torch.ones_like(self.static_input)
|
||||
|
||||
|
||||
# Warm up
|
||||
self._warmup()
|
||||
|
||||
|
||||
# Only use CUDA graphs on NVIDIA GPUs
|
||||
if torch.cuda.is_available() and hasattr(torch.cuda, "CUDAGraph"):
|
||||
if torch.cuda.is_available() and hasattr(torch.cuda, 'CUDAGraph'):
|
||||
# Capture graph
|
||||
self.graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(self.graph):
|
||||
self.static_output = self.model(
|
||||
input_ids=self.static_input, attention_mask=self.static_attention_mask
|
||||
input_ids=self.static_input,
|
||||
attention_mask=self.static_attention_mask
|
||||
)
|
||||
self.use_cuda_graph = True
|
||||
else:
|
||||
# For MPS or CPU, just store the model
|
||||
self.use_cuda_graph = False
|
||||
self.static_output = None
|
||||
|
||||
|
||||
def _get_device(self) -> str:
|
||||
if torch.cuda.is_available():
|
||||
return "cuda"
|
||||
@@ -73,17 +76,22 @@ class GraphWrapper:
|
||||
return "mps"
|
||||
else:
|
||||
return "cpu"
|
||||
|
||||
|
||||
def _create_random_batch(self, batch_size: int, seq_length: int) -> torch.Tensor:
|
||||
return torch.randint(
|
||||
0, 1000, (batch_size, seq_length), device=self.device, dtype=torch.long
|
||||
0, 1000, (batch_size, seq_length),
|
||||
device=self.device,
|
||||
dtype=torch.long
|
||||
)
|
||||
|
||||
|
||||
def _warmup(self, num_warmup: int = 3):
|
||||
with torch.no_grad():
|
||||
for _ in range(num_warmup):
|
||||
self.model(input_ids=self.static_input, attention_mask=self.static_attention_mask)
|
||||
|
||||
self.model(
|
||||
input_ids=self.static_input,
|
||||
attention_mask=self.static_attention_mask
|
||||
)
|
||||
|
||||
def __call__(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
|
||||
if self.use_cuda_graph:
|
||||
self.static_input.copy_(input_ids)
|
||||
@@ -97,14 +105,14 @@ class GraphWrapper:
|
||||
|
||||
class ModelOptimizer:
|
||||
"""Applies various optimizations to the model."""
|
||||
|
||||
|
||||
@staticmethod
|
||||
def optimize(model: nn.Module, config: BenchmarkConfig) -> nn.Module:
|
||||
print("\nApplying model optimizations:")
|
||||
|
||||
|
||||
if model is None:
|
||||
raise ValueError("Cannot optimize None model")
|
||||
|
||||
|
||||
# Move to GPU
|
||||
if torch.cuda.is_available():
|
||||
model = model.cuda()
|
||||
@@ -116,59 +124,53 @@ class ModelOptimizer:
|
||||
model = model.cpu()
|
||||
device = "cpu"
|
||||
print(f"- Model moved to {device}")
|
||||
|
||||
|
||||
# FP16
|
||||
if config.use_fp16 and not config.use_int4:
|
||||
model = model.half()
|
||||
# use torch compile
|
||||
model = torch.compile(model)
|
||||
print("- Using FP16 precision")
|
||||
|
||||
|
||||
# Check if using SDPA (only on CUDA)
|
||||
if (
|
||||
torch.cuda.is_available()
|
||||
and torch.version.cuda
|
||||
and float(torch.version.cuda[:3]) >= 11.6
|
||||
):
|
||||
if hasattr(torch.nn.functional, "scaled_dot_product_attention"):
|
||||
if torch.cuda.is_available() and torch.version.cuda and float(torch.version.cuda[:3]) >= 11.6:
|
||||
if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
|
||||
print("- Using PyTorch SDPA (scaled_dot_product_attention)")
|
||||
else:
|
||||
print("- PyTorch SDPA not available")
|
||||
|
||||
|
||||
# Flash Attention (only on CUDA)
|
||||
if config.use_flash_attention and torch.cuda.is_available():
|
||||
try:
|
||||
from flash_attn.flash_attention import FlashAttention # noqa: F401
|
||||
|
||||
from flash_attn.flash_attention import FlashAttention
|
||||
print("- Flash Attention 2 available")
|
||||
if hasattr(model.config, "attention_mode"):
|
||||
model.config.attention_mode = "flash_attention_2"
|
||||
print(" - Enabled Flash Attention 2 mode")
|
||||
except ImportError:
|
||||
print("- Flash Attention not available")
|
||||
|
||||
|
||||
# Memory efficient attention (only on CUDA)
|
||||
if torch.cuda.is_available():
|
||||
try:
|
||||
from xformers.ops import memory_efficient_attention # noqa: F401
|
||||
|
||||
if hasattr(model, "enable_xformers_memory_efficient_attention"):
|
||||
from xformers.ops import memory_efficient_attention
|
||||
if hasattr(model, 'enable_xformers_memory_efficient_attention'):
|
||||
model.enable_xformers_memory_efficient_attention()
|
||||
print("- Enabled xformers memory efficient attention")
|
||||
else:
|
||||
print("- Model doesn't support xformers")
|
||||
except (ImportError, AttributeError):
|
||||
print("- Xformers not available")
|
||||
|
||||
|
||||
model.eval()
|
||||
print("- Model set to eval mode")
|
||||
|
||||
|
||||
return model
|
||||
|
||||
|
||||
class Timer:
|
||||
"""Handles accurate GPU timing using GPU events or CPU timing."""
|
||||
|
||||
|
||||
def __init__(self):
|
||||
if torch.cuda.is_available():
|
||||
self.start_event = torch.cuda.Event(enable_timing=True)
|
||||
@@ -180,7 +182,7 @@ class Timer:
|
||||
else:
|
||||
# CPU timing
|
||||
self.use_gpu_timing = False
|
||||
|
||||
|
||||
@contextmanager
|
||||
def timing(self):
|
||||
if self.use_gpu_timing:
|
||||
@@ -193,7 +195,7 @@ class Timer:
|
||||
start_time = time.time()
|
||||
yield
|
||||
self.cpu_elapsed = time.time() - start_time
|
||||
|
||||
|
||||
def elapsed_time(self) -> float:
|
||||
if self.use_gpu_timing:
|
||||
return self.start_event.elapsed_time(self.end_event) / 1000 # ms to seconds
|
||||
@@ -203,14 +205,14 @@ class Timer:
|
||||
|
||||
class Benchmark:
|
||||
"""Main benchmark runner."""
|
||||
|
||||
|
||||
def __init__(self, config: BenchmarkConfig):
|
||||
self.config = config
|
||||
try:
|
||||
self.model = self._load_model()
|
||||
if self.model is None:
|
||||
raise ValueError("Model initialization failed - model is None")
|
||||
|
||||
|
||||
# Only use CUDA graphs on NVIDIA GPUs
|
||||
if config.use_cuda_graphs and torch.cuda.is_available():
|
||||
self.graphs = GraphContainer(self.model, config.seq_length)
|
||||
@@ -218,27 +220,25 @@ class Benchmark:
|
||||
self.graphs = None
|
||||
self.timer = Timer()
|
||||
except Exception as e:
|
||||
print(f"ERROR in benchmark initialization: {e!s}")
|
||||
print(f"ERROR in benchmark initialization: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
def _load_model(self) -> nn.Module:
|
||||
print(f"Loading model from {self.config.model_path}...")
|
||||
|
||||
|
||||
try:
|
||||
# Int4 quantization using HuggingFace integration
|
||||
if self.config.use_int4:
|
||||
import bitsandbytes as bnb
|
||||
|
||||
print(f"- bitsandbytes version: {bnb.__version__}")
|
||||
|
||||
# Check if using custom 8bit quantization
|
||||
if hasattr(self.config, "use_linear8bitlt") and self.config.use_linear8bitlt:
|
||||
|
||||
# Check whether to use the custom 8-bit quantization
|
||||
if hasattr(self.config, 'use_linear8bitlt') and self.config.use_linear8bitlt:
|
||||
print("- Using custom Linear8bitLt replacement for all linear layers")
|
||||
|
||||
# Load original model (without quantization config)
|
||||
|
||||
# Load the original model (without quantization config)
|
||||
import bitsandbytes as bnb
|
||||
import torch
|
||||
|
||||
# set default to half
|
||||
torch.set_default_dtype(torch.float16)
|
||||
compute_dtype = torch.float16 if self.config.use_fp16 else torch.float32
|
||||
@@ -246,120 +246,112 @@ class Benchmark:
|
||||
self.config.model_path,
|
||||
torch_dtype=compute_dtype,
|
||||
)
|
||||
|
||||
# Define replacement function
|
||||
|
||||
# Define the replacement function
|
||||
def replace_linear_with_linear8bitlt(model):
|
||||
"""Recursively replace all nn.Linear layers with Linear8bitLt"""
|
||||
"""递归地将模型中的所有nn.Linear层替换为Linear8bitLt"""
|
||||
for name, module in list(model.named_children()):
|
||||
if isinstance(module, nn.Linear):
|
||||
# Get original linear layer parameters
|
||||
# Get the original linear layer's parameters
|
||||
in_features = module.in_features
|
||||
out_features = module.out_features
|
||||
bias = module.bias is not None
|
||||
|
||||
# Create 8bit linear layer
|
||||
|
||||
# Create the 8-bit linear layer
|
||||
# print size
|
||||
print(f"in_features: {in_features}, out_features: {out_features}")
|
||||
new_module = bnb.nn.Linear8bitLt(
|
||||
in_features, out_features, bias=bias, has_fp16_weights=False
|
||||
in_features,
|
||||
out_features,
|
||||
bias=bias,
|
||||
has_fp16_weights=False
|
||||
)
|
||||
|
||||
# Copy weights and bias
|
||||
|
||||
# Copy the weights and bias
|
||||
new_module.weight.data = module.weight.data
|
||||
if bias:
|
||||
new_module.bias.data = module.bias.data
|
||||
|
||||
# Replace module
|
||||
|
||||
# Replace the module
|
||||
setattr(model, name, new_module)
|
||||
else:
|
||||
# Process child modules recursively
|
||||
# Recursively process child modules
|
||||
replace_linear_with_linear8bitlt(module)
|
||||
|
||||
|
||||
return model
|
||||
|
||||
# Replace all linear layers
|
||||
|
||||
# Replace all linear layers
|
||||
model = replace_linear_with_linear8bitlt(model)
|
||||
# add torch compile
|
||||
model = torch.compile(model)
|
||||
|
||||
# Move model to GPU (quantization happens here)
|
||||
device = (
|
||||
"cuda"
|
||||
if torch.cuda.is_available()
|
||||
else "mps"
|
||||
if torch.backends.mps.is_available()
|
||||
else "cpu"
|
||||
)
|
||||
|
||||
# Move the model to the GPU (quantization happens here)
|
||||
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
|
||||
model = model.to(device)
|
||||
|
||||
|
||||
print("- All linear layers replaced with Linear8bitLt")
|
||||
|
||||
|
||||
else:
|
||||
# Use original Int4 quantization method
|
||||
# Use the original Int4 quantization method
|
||||
print("- Using bitsandbytes for Int4 quantization")
|
||||
|
||||
|
||||
# Create quantization config
|
||||
|
||||
|
||||
compute_dtype = torch.float16 if self.config.use_fp16 else torch.float32
|
||||
quantization_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_compute_dtype=compute_dtype,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_quant_type="nf4"
|
||||
)
|
||||
|
||||
|
||||
print("- Quantization config:", quantization_config)
|
||||
|
||||
|
||||
# Load model directly with quantization config
|
||||
model = AutoModel.from_pretrained(
|
||||
self.config.model_path,
|
||||
quantization_config=quantization_config,
|
||||
torch_dtype=compute_dtype,
|
||||
device_map="auto", # Let HF decide on device mapping
|
||||
device_map="auto" # Let HF decide on device mapping
|
||||
)
|
||||
|
||||
|
||||
# Check if model loaded successfully
|
||||
if model is None:
|
||||
raise ValueError("Model loading returned None")
|
||||
|
||||
|
||||
print(f"- Model type: {type(model)}")
|
||||
|
||||
|
||||
# Apply optimizations directly here
|
||||
print("\nApplying model optimizations:")
|
||||
|
||||
if hasattr(self.config, "use_linear8bitlt") and self.config.use_linear8bitlt:
|
||||
|
||||
if hasattr(self.config, 'use_linear8bitlt') and self.config.use_linear8bitlt:
|
||||
print("- Model moved to GPU with Linear8bitLt quantization")
|
||||
else:
|
||||
# Skip moving to GPU since device_map="auto" already did that
|
||||
print("- Model already on GPU due to device_map='auto'")
|
||||
|
||||
|
||||
# Skip FP16 conversion since we specified compute_dtype
|
||||
print(f"- Using {compute_dtype} for compute dtype")
|
||||
|
||||
|
||||
# Check CUDA and SDPA
|
||||
if (
|
||||
torch.cuda.is_available()
|
||||
and torch.version.cuda
|
||||
and float(torch.version.cuda[:3]) >= 11.6
|
||||
):
|
||||
if hasattr(torch.nn.functional, "scaled_dot_product_attention"):
|
||||
if torch.cuda.is_available() and torch.version.cuda and float(torch.version.cuda[:3]) >= 11.6:
|
||||
if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
|
||||
print("- Using PyTorch SDPA (scaled_dot_product_attention)")
|
||||
else:
|
||||
print("- PyTorch SDPA not available")
|
||||
|
||||
|
||||
# Try xformers if available (only on CUDA)
|
||||
if torch.cuda.is_available():
|
||||
try:
|
||||
from xformers.ops import memory_efficient_attention # noqa: F401
|
||||
|
||||
if hasattr(model, "enable_xformers_memory_efficient_attention"):
|
||||
from xformers.ops import memory_efficient_attention
|
||||
if hasattr(model, 'enable_xformers_memory_efficient_attention'):
|
||||
model.enable_xformers_memory_efficient_attention()
|
||||
print("- Enabled xformers memory efficient attention")
|
||||
else:
|
||||
print("- Model doesn't support xformers")
|
||||
except (ImportError, AttributeError):
|
||||
print("- Xformers not available")
|
||||
|
||||
|
||||
# Set to eval mode
|
||||
model.eval()
|
||||
print("- Model set to eval mode")
|
||||
@@ -373,79 +365,76 @@ class Benchmark:
|
||||
llm_int8_threshold=6.0,
|
||||
llm_int8_has_fp16_weight=False,
|
||||
)
|
||||
|
||||
|
||||
model = AutoModel.from_pretrained(
|
||||
self.config.model_path,
|
||||
quantization_config=quantization_config,
|
||||
torch_dtype=compute_dtype,
|
||||
device_map="auto",
|
||||
device_map="auto"
|
||||
)
|
||||
|
||||
|
||||
if model is None:
|
||||
raise ValueError("Model loading returned None")
|
||||
|
||||
|
||||
print(f"- Model type: {type(model)}")
|
||||
model.eval()
|
||||
print("- Model set to eval mode")
|
||||
|
||||
|
||||
else:
|
||||
# Standard loading for FP16/FP32
|
||||
model = AutoModel.from_pretrained(self.config.model_path)
|
||||
print("- Model loaded in standard precision")
|
||||
print(f"- Model type: {type(model)}")
|
||||
|
||||
|
||||
# Apply standard optimizations
|
||||
# set default to half
|
||||
import torch
|
||||
|
||||
torch.set_default_dtype(torch.bfloat16)
|
||||
model = ModelOptimizer.optimize(model, self.config)
|
||||
model = model.half()
|
||||
# add torch compile
|
||||
model = torch.compile(model)
|
||||
|
||||
|
||||
# Final check to ensure model is not None
|
||||
if model is None:
|
||||
raise ValueError("Model is None after optimization")
|
||||
|
||||
|
||||
print(f"- Final model type: {type(model)}")
|
||||
return model
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"ERROR loading model: {e!s}")
|
||||
print(f"ERROR loading model: {str(e)}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
raise
|
||||
|
||||
|
||||
def _create_random_batch(self, batch_size: int) -> torch.Tensor:
|
||||
device = (
|
||||
"cuda"
|
||||
if torch.cuda.is_available()
|
||||
else "mps"
|
||||
if torch.backends.mps.is_available()
|
||||
else "cpu"
|
||||
)
|
||||
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
|
||||
return torch.randint(
|
||||
0, 1000, (batch_size, self.config.seq_length), device=device, dtype=torch.long
|
||||
0, 1000,
|
||||
(batch_size, self.config.seq_length),
|
||||
device=device,
|
||||
dtype=torch.long
|
||||
)
|
||||
|
||||
|
||||
def _run_inference(
|
||||
self, input_ids: torch.Tensor, graph_wrapper: GraphWrapper | None = None
|
||||
) -> tuple[float, torch.Tensor]:
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
graph_wrapper: Optional[GraphWrapper] = None
|
||||
) -> Tuple[float, torch.Tensor]:
|
||||
attention_mask = torch.ones_like(input_ids)
|
||||
|
||||
|
||||
with torch.no_grad(), self.timer.timing():
|
||||
if graph_wrapper is not None:
|
||||
output = graph_wrapper(input_ids, attention_mask)
|
||||
else:
|
||||
output = self.model(input_ids=input_ids, attention_mask=attention_mask)
|
||||
|
||||
|
||||
return self.timer.elapsed_time(), output
|
||||
|
||||
def run(self) -> dict[int, dict[str, float]]:
|
||||
|
||||
def run(self) -> Dict[int, Dict[str, float]]:
|
||||
results = {}
|
||||
|
||||
|
||||
# Reset peak memory stats
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
@@ -454,20 +443,22 @@ class Benchmark:
|
||||
pass
|
||||
else:
|
||||
print("- No GPU memory stats available")
|
||||
|
||||
|
||||
for batch_size in self.config.batch_sizes:
|
||||
print(f"\nTesting batch size: {batch_size}")
|
||||
times = []
|
||||
|
||||
|
||||
# Get or create graph for this batch size
|
||||
graph_wrapper = (
|
||||
self.graphs.get_or_create(batch_size) if self.graphs is not None else None
|
||||
self.graphs.get_or_create(batch_size)
|
||||
if self.graphs is not None
|
||||
else None
|
||||
)
|
||||
|
||||
|
||||
# Pre-allocate input tensor
|
||||
input_ids = self._create_random_batch(batch_size)
|
||||
print(f"Input shape: {input_ids.shape}")
|
||||
|
||||
|
||||
# Run benchmark
|
||||
for i in tqdm(range(self.config.num_runs), desc=f"Batch size {batch_size}"):
|
||||
try:
|
||||
@@ -478,44 +469,44 @@ class Benchmark:
|
||||
except Exception as e:
|
||||
print(f"Error during inference: {e}")
|
||||
break
|
||||
|
||||
|
||||
if not times:
|
||||
print(f"No successful runs for batch size {batch_size}, skipping")
|
||||
continue
|
||||
|
||||
|
||||
# Calculate statistics
|
||||
avg_time = np.mean(times)
|
||||
std_time = np.std(times)
|
||||
throughput = batch_size / avg_time
|
||||
|
||||
|
||||
results[batch_size] = {
|
||||
"avg_time": avg_time,
|
||||
"std_time": std_time,
|
||||
"throughput": throughput,
|
||||
}
|
||||
|
||||
|
||||
print(f"Avg Time: {avg_time:.4f}s ± {std_time:.4f}s")
|
||||
print(f"Throughput: {throughput:.2f} sequences/second")
|
||||
|
||||
|
||||
# Log memory usage
|
||||
if torch.cuda.is_available():
|
||||
peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
|
||||
peak_memory_gb = torch.cuda.max_memory_allocated() / (1024 ** 3)
|
||||
elif torch.backends.mps.is_available():
|
||||
# MPS doesn't have max_memory_allocated, use 0
|
||||
peak_memory_gb = 0.0
|
||||
else:
|
||||
peak_memory_gb = 0.0
|
||||
print("- No GPU memory usage available")
|
||||
|
||||
|
||||
if peak_memory_gb > 0:
|
||||
print(f"\nPeak GPU memory usage: {peak_memory_gb:.2f} GB")
|
||||
else:
|
||||
print("\n- GPU memory usage not available")
|
||||
|
||||
|
||||
# Add memory info to results
|
||||
for batch_size in results:
|
||||
results[batch_size]["peak_memory_gb"] = peak_memory_gb
|
||||
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@@ -575,14 +566,14 @@ def main():
|
||||
action="store_true",
|
||||
help="Enable Linear8bitLt quantization for all linear layers",
|
||||
)
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
# Print arguments for debugging
|
||||
print("\nCommand line arguments:")
|
||||
for arg, value in vars(args).items():
|
||||
print(f"- {arg}: {value}")
|
||||
|
||||
|
||||
config = BenchmarkConfig(
|
||||
model_path=args.model_path,
|
||||
batch_sizes=[int(bs) for bs in args.batch_sizes.split(",")],
|
||||
@@ -595,56 +586,45 @@ def main():
|
||||
use_flash_attention=args.use_flash_attention,
|
||||
use_linear8bitlt=args.use_linear8bitlt,
|
||||
)
|
||||
|
||||
|
||||
# Print configuration for debugging
|
||||
print("\nBenchmark configuration:")
|
||||
for field, value in vars(config).items():
|
||||
print(f"- {field}: {value}")
|
||||
|
||||
|
||||
try:
|
||||
benchmark = Benchmark(config)
|
||||
results = benchmark.run()
|
||||
|
||||
|
||||
# Save results to file
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
# Create results directory if it doesn't exist
|
||||
os.makedirs("results", exist_ok=True)
|
||||
|
||||
|
||||
# Generate filename based on configuration
|
||||
precision_type = (
|
||||
"int4"
|
||||
if config.use_int4
|
||||
else "int8"
|
||||
if config.use_int8
|
||||
else "fp16"
|
||||
if config.use_fp16
|
||||
else "fp32"
|
||||
)
|
||||
precision_type = "int4" if config.use_int4 else "int8" if config.use_int8 else "fp16" if config.use_fp16 else "fp32"
|
||||
model_name = os.path.basename(config.model_path)
|
||||
output_file = f"results/benchmark_{model_name}_{precision_type}.json"
|
||||
|
||||
|
||||
# Save results
|
||||
with open(output_file, "w") as f:
|
||||
json.dump(
|
||||
{
|
||||
"config": {
|
||||
k: str(v) if isinstance(v, list) else v for k, v in vars(config).items()
|
||||
},
|
||||
"results": {str(k): v for k, v in results.items()},
|
||||
},
|
||||
f,
|
||||
indent=2,
|
||||
"config": {k: str(v) if isinstance(v, list) else v for k, v in vars(config).items()},
|
||||
"results": {str(k): v for k, v in results.items()}
|
||||
},
|
||||
f,
|
||||
indent=2
|
||||
)
|
||||
print(f"Results saved to {output_file}")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Benchmark failed: {e}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
@@ -1,39 +1,37 @@
import os

from llama_index.core import StorageContext, VectorStoreIndex

from llama_index.core import VectorStoreIndex, StorageContext

def load_index(save_dir: str = "mail_index"):
"""
Load the saved index from disk.

Args:
save_dir: Directory where the index is saved

Returns:
Loaded index or None if loading fails
"""
try:
# Load storage context
storage_context = StorageContext.from_defaults(persist_dir=save_dir)

# Load index
index = VectorStoreIndex.from_vector_store(
storage_context.vector_store, storage_context=storage_context
storage_context.vector_store,
storage_context=storage_context
)

print(f"Index loaded from {save_dir}")
return index

except Exception as e:
print(f"Error loading index: {e}")
return None

def query_index(index, query: str):
"""
Query the loaded index.

Args:
index: The loaded index
query: The query string
@@ -41,47 +39,44 @@ def query_index(index, query: str):
if index is None:
print("No index available for querying.")
return

query_engine = index.as_query_engine()
response = query_engine.query(query)
print(f"\nQuery: {query}")
print(f"Response: {response}")

def main():
save_dir = "mail_index"

# Check if index exists
if not os.path.exists(save_dir) or not os.path.exists(
os.path.join(save_dir, "vector_store.json")
):
if not os.path.exists(save_dir) or not os.path.exists(os.path.join(save_dir, "vector_store.json")):
print(f"Index not found in {save_dir}")
print("Please run mail_reader_save_load.py first to create the index.")
return

# Load the index
index = load_index(save_dir)

if not index:
print("Failed to load index.")
return

print("\n" + "=" * 60)

print("\n" + "="*60)
print("Email Query Interface")
print("=" * 60)
print("="*60)
print("Type 'quit' to exit")
print("Type 'help' for example queries")
print("=" * 60)

print("="*60)

# Interactive query loop
while True:
try:
query = input("\nEnter your query: ").strip()

if query.lower() == "quit":

if query.lower() == 'quit':
print("Goodbye!")
break
elif query.lower() == "help":
elif query.lower() == 'help':
print("\nExample queries:")
print("- Hows Berkeley Graduate Student Instructor")
print("- What emails mention GSR appointments?")
@@ -91,15 +86,14 @@ def main():
continue
elif not query:
continue

query_index(index, query)

except KeyboardInterrupt:
print("\nGoodbye!")
break
except Exception as e:
print(f"Error processing query: {e}")

if __name__ == "__main__":
main()
main()
@@ -1,46 +1,43 @@
import time

import matplotlib.pyplot as plt
import mlx.core as mx
import numpy as np
import matplotlib.pyplot as plt
import torch
from mlx_lm import load
from sentence_transformers import SentenceTransformer
import mlx.core as mx
from mlx_lm import load

# --- Configuration ---
MODEL_NAME_TORCH = "Qwen/Qwen3-Embedding-0.6B"
MODEL_NAME_MLX = "mlx-community/Qwen3-Embedding-0.6B-4bit-DWQ"
BATCH_SIZES = [1, 8, 16, 32, 64, 128]
NUM_RUNS = 10 # Number of runs to average for each batch size
WARMUP_RUNS = 2 # Number of warm-up runs
WARMUP_RUNS = 2 # Number of warm-up runs

# --- Generate Dummy Data ---
DUMMY_SENTENCES = ["This is a test sentence for benchmarking." * 5] * max(BATCH_SIZES)

# --- Benchmark Functions ---b

def benchmark_torch(model, sentences):
start_time = time.time()
model.encode(sentences, convert_to_numpy=True)
end_time = time.time()
return (end_time - start_time) * 1000 # Return time in ms

def benchmark_mlx(model, tokenizer, sentences):
start_time = time.time()

# Tokenize sentences using MLX tokenizer
tokens = []
for sentence in sentences:
token_ids = tokenizer.encode(sentence)
tokens.append(token_ids)

# Pad sequences to the same length
max_len = max(len(t) for t in tokens)
input_ids = []
attention_mask = []

for token_seq in tokens:
# Pad sequence
padded = token_seq + [tokenizer.eos_token_id] * (max_len - len(token_seq))
@@ -48,25 +45,24 @@ def benchmark_mlx(model, tokenizer, sentences):
# Create attention mask (1 for real tokens, 0 for padding)
mask = [1] * len(token_seq) + [0] * (max_len - len(token_seq))
attention_mask.append(mask)

# Convert to MLX arrays
input_ids = mx.array(input_ids)
attention_mask = mx.array(attention_mask)

# Get embeddings
embeddings = model(input_ids)

# Mean pooling
mask = mx.expand_dims(attention_mask, -1)
sum_embeddings = (embeddings * mask).sum(axis=1)
sum_mask = mask.sum(axis=1)
_ = sum_embeddings / sum_mask

mx.eval() # Ensure computation is finished
end_time = time.time()
return (end_time - start_time) * 1000 # Return time in ms

# --- Main Execution ---
def main():
print("--- Initializing Models ---")
@@ -96,15 +92,13 @@ def main():
for batch_size in BATCH_SIZES:
print(f"Benchmarking batch size: {batch_size}")
sentences_batch = DUMMY_SENTENCES[:batch_size]

# Benchmark PyTorch
torch_times = [benchmark_torch(model_torch, sentences_batch) for _ in range(NUM_RUNS)]
results_torch.append(np.mean(torch_times))

# Benchmark MLX
mlx_times = [
benchmark_mlx(model_mlx, tokenizer_mlx, sentences_batch) for _ in range(NUM_RUNS)
]
mlx_times = [benchmark_mlx(model_mlx, tokenizer_mlx, sentences_batch) for _ in range(NUM_RUNS)]
results_mlx.append(np.mean(mlx_times))

print("\n--- Benchmark Results (Average time per batch in ms) ---")
@@ -115,21 +109,20 @@ def main():
# --- Plotting ---
print("\n--- Generating Plot ---")
plt.figure(figsize=(10, 6))
plt.plot(BATCH_SIZES, results_torch, marker="o", linestyle="-", label=f"PyTorch ({device})")
plt.plot(BATCH_SIZES, results_mlx, marker="s", linestyle="-", label="MLX")
plt.plot(BATCH_SIZES, results_torch, marker='o', linestyle='-', label=f'PyTorch ({device})')
plt.plot(BATCH_SIZES, results_mlx, marker='s', linestyle='-', label='MLX')

plt.title(f"Embedding Performance: MLX vs PyTorch\nModel: {MODEL_NAME_TORCH}")
plt.title(f'Embedding Performance: MLX vs PyTorch\nModel: {MODEL_NAME_TORCH}')
plt.xlabel("Batch Size")
plt.ylabel("Average Time per Batch (ms)")
plt.xticks(BATCH_SIZES)
plt.grid(True)
plt.legend()

# Save the plot
output_filename = "embedding_benchmark.png"
plt.savefig(output_filename)
print(f"Plot saved to {output_filename}")

if __name__ == "__main__":
main()
@@ -3,52 +3,49 @@
Debug script to test ZMQ communication with the exact same setup as main_cli_example.py
"""

import sys
import time

import zmq

sys.path.append("packages/leann-backend-diskann")
import time
import threading
import sys
sys.path.append('packages/leann-backend-diskann')
from leann_backend_diskann import embedding_pb2

def test_zmq_with_same_model():
print("=== Testing ZMQ with same model as main_cli_example.py ===")

# Test the exact same model that main_cli_example.py uses
model_name = "sentence-transformers/all-mpnet-base-v2"

# Start server with the same model
import subprocess

server_cmd = [
sys.executable,
"-m",
sys.executable, "-m",
"packages.leann-backend-diskann.leann_backend_diskann.embedding_server",
"--zmq-port",
"5556", # Use different port to avoid conflicts
"--model-name",
model_name,
"--zmq-port", "5556", # Use different port to avoid conflicts
"--model-name", model_name
]

print(f"Starting server with command: {' '.join(server_cmd)}")
server_process = subprocess.Popen(
server_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
server_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)

# Wait for server to start
print("Waiting for server to start...")
time.sleep(10)

# Check if server is running
if server_process.poll() is not None:
stdout, stderr = server_process.communicate()
print(f"Server failed to start. stdout: {stdout}")
print(f"Server failed to start. stderr: {stderr}")
return False

print(f"Server started with PID: {server_process.pid}")

try:
# Test client
context = zmq.Context()
@@ -56,39 +53,39 @@ def test_zmq_with_same_model():
socket.connect("tcp://127.0.0.1:5556")
socket.setsockopt(zmq.RCVTIMEO, 30000) # 30 second timeout like C++
socket.setsockopt(zmq.SNDTIMEO, 30000)

# Create request with same format as C++
request = embedding_pb2.NodeEmbeddingRequest()
request.node_ids.extend([0, 1, 2, 3, 4]) # Test with some node IDs

print(f"Sending request with {len(request.node_ids)} node IDs...")
start_time = time.time()

# Send request
socket.send(request.SerializeToString())

# Receive response
response_data = socket.recv()
end_time = time.time()

print(f"Received response in {end_time - start_time:.3f} seconds")
print(f"Response size: {len(response_data)} bytes")

# Parse response
response = embedding_pb2.NodeEmbeddingResponse()
response.ParseFromString(response_data)

print(f"Response dimensions: {list(response.dimensions)}")
print(f"Embeddings data size: {len(response.embeddings_data)} bytes")
print(f"Missing IDs: {list(response.missing_ids)}")

# Calculate expected size
if len(response.dimensions) == 2:
batch_size = response.dimensions[0]
embedding_dim = response.dimensions[1]
expected_bytes = batch_size * embedding_dim * 4 # 4 bytes per float
print(f"Expected bytes: {expected_bytes}, Actual: {len(response.embeddings_data)}")

if len(response.embeddings_data) == expected_bytes:
print("✅ Response format is correct!")
return True
@@ -98,7 +95,7 @@ def test_zmq_with_same_model():
else:
print("❌ Invalid response dimensions!")
return False

except Exception as e:
print(f"❌ Error during ZMQ test: {e}")
return False
@@ -108,10 +105,9 @@ def test_zmq_with_same_model():
server_process.wait()
print("Server terminated")

if __name__ == "__main__":
success = test_zmq_with_same_model()
if success:
print("\n✅ ZMQ communication test passed!")
else:
print("\n❌ ZMQ communication test failed!")
print("\n❌ ZMQ communication test failed!")
@@ -1,27 +1,26 @@
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
from transformers import AutoModel, BitsAndBytesConfig
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoModel
|
||||
|
||||
# Add MLX imports
|
||||
try:
|
||||
import mlx.core as mx
|
||||
from mlx_lm.utils import load
|
||||
|
||||
MLX_AVAILABLE = True
|
||||
except ImportError:
|
||||
except ImportError as e:
|
||||
print("MLX not available. Install with: uv pip install mlx mlx-lm")
|
||||
MLX_AVAILABLE = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class BenchmarkConfig:
|
||||
model_path: str = "facebook/contriever"
|
||||
batch_sizes: list[int] = None
|
||||
batch_sizes: List[int] = None
|
||||
seq_length: int = 256
|
||||
num_runs: int = 5
|
||||
use_fp16: bool = True
|
||||
@@ -31,19 +30,18 @@ class BenchmarkConfig:
    use_flash_attention: bool = False
    use_linear8bitlt: bool = False
    use_mlx: bool = False # New flag for MLX testing

    def __post_init__(self):
        if self.batch_sizes is None:
            self.batch_sizes = [1, 2, 4, 8, 16, 32, 64]


class MLXBenchmark:
    """MLX-specific benchmark for embedding models"""

    def __init__(self, config: BenchmarkConfig):
        self.config = config
        self.model, self.tokenizer = self._load_model()

    def _load_model(self):
        """Load MLX model and tokenizer following the API pattern"""
        print(f"Loading MLX model from {self.config.model_path}...")
@@ -54,51 +52,55 @@ class MLXBenchmark:
        except Exception as e:
            print(f"Error loading MLX model: {e}")
            raise

    def _create_random_batch(self, batch_size: int):
        """Create random input batches for MLX testing - same as PyTorch"""
        return torch.randint(0, 1000, (batch_size, self.config.seq_length), dtype=torch.long)
        return torch.randint(
            0, 1000,
            (batch_size, self.config.seq_length),
            dtype=torch.long
        )

    def _run_inference(self, input_ids: torch.Tensor) -> float:
        """Run MLX inference with same input as PyTorch"""
        start_time = time.time()
        try:
            # Convert PyTorch tensor to MLX array
            input_ids_mlx = mx.array(input_ids.numpy())

            # Get embeddings
            embeddings = self.model(input_ids_mlx)

            # Mean pooling (following the API pattern)
            pooled = embeddings.mean(axis=1)

            # Convert to numpy (following the API pattern)
            pooled_numpy = np.array(pooled.tolist(), dtype=np.float32)

            # Force computation
            _ = pooled_numpy.shape

        except Exception as e:
            print(f"MLX inference error: {e}")
            return float("inf")
            return float('inf')
        end_time = time.time()

        return end_time - start_time

    def run(self) -> dict[int, dict[str, float]]:

    def run(self) -> Dict[int, Dict[str, float]]:
        """Run the MLX benchmark across all batch sizes"""
        results = {}

        print(f"Starting MLX benchmark with model: {self.config.model_path}")
        print(f"Testing batch sizes: {self.config.batch_sizes}")

        for batch_size in self.config.batch_sizes:
            print(f"\n=== Testing MLX batch size: {batch_size} ===")
            times = []

            # Create input batch (same as PyTorch)
            input_ids = self._create_random_batch(batch_size)

            # Warm up
            print("Warming up...")
            for _ in range(3):
@@ -107,26 +109,26 @@ class MLXBenchmark:
                except Exception as e:
                    print(f"Warmup error: {e}")
                    break

            # Run benchmark
            for _i in tqdm(range(self.config.num_runs), desc=f"MLX Batch size {batch_size}"):
            for i in tqdm(range(self.config.num_runs), desc=f"MLX Batch size {batch_size}"):
                try:
                    elapsed_time = self._run_inference(input_ids)
                    if elapsed_time != float("inf"):
                    if elapsed_time != float('inf'):
                        times.append(elapsed_time)
                except Exception as e:
                    print(f"Error during MLX inference: {e}")
                    break

            if not times:
                print(f"Skipping batch size {batch_size} due to errors")
                continue

            # Calculate statistics
            avg_time = np.mean(times)
            std_time = np.std(times)
            throughput = batch_size / avg_time

            results[batch_size] = {
                "avg_time": avg_time,
                "std_time": std_time,
@@ -134,166 +136,179 @@ class MLXBenchmark:
                "min_time": np.min(times),
                "max_time": np.max(times),
            }

            print(f"MLX Results for batch size {batch_size}:")
            print(f" Avg Time: {avg_time:.4f}s ± {std_time:.4f}s")
            print(f" Min Time: {np.min(times):.4f}s")
            print(f" Max Time: {np.max(times):.4f}s")
            print(f" Throughput: {throughput:.2f} sequences/second")

        return results


class Benchmark:
    def __init__(self, config: BenchmarkConfig):
        self.config = config
        self.device = (
            "cuda"
            if torch.cuda.is_available()
            else "mps"
            if torch.backends.mps.is_available()
            else "cpu"
        )
        self.device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
        self.model = self._load_model()

    def _load_model(self) -> nn.Module:
        print(f"Loading model from {self.config.model_path}...")

        model = AutoModel.from_pretrained(self.config.model_path)
        if self.config.use_fp16:
            model = model.half()
        model = torch.compile(model)
        model = model.to(self.device)

        model.eval()
        return model

    def _create_random_batch(self, batch_size: int) -> torch.Tensor:
        return torch.randint(
            0, 1000, (batch_size, self.config.seq_length), device=self.device, dtype=torch.long
            0, 1000,
            (batch_size, self.config.seq_length),
            device=self.device,
            dtype=torch.long
        )

    def _run_inference(self, input_ids: torch.Tensor) -> float:
        attention_mask = torch.ones_like(input_ids)

        start_time = time.time()
        with torch.no_grad():
            self.model(input_ids=input_ids, attention_mask=attention_mask)
            output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        end_time = time.time()

        return end_time - start_time

    def run(self) -> dict[int, dict[str, float]]:

    def run(self) -> Dict[int, Dict[str, float]]:
        results = {}

        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

        for batch_size in self.config.batch_sizes:
            print(f"\nTesting batch size: {batch_size}")
            times = []

            input_ids = self._create_random_batch(batch_size)

            for _i in tqdm(range(self.config.num_runs), desc=f"Batch size {batch_size}"):

            for i in tqdm(range(self.config.num_runs), desc=f"Batch size {batch_size}"):
                try:
                    elapsed_time = self._run_inference(input_ids)
                    times.append(elapsed_time)
                except Exception as e:
                    print(f"Error during inference: {e}")
                    break

            if not times:
                continue

            avg_time = np.mean(times)
            std_time = np.std(times)
            throughput = batch_size / avg_time

            results[batch_size] = {
                "avg_time": avg_time,
                "std_time": std_time,
                "throughput": throughput,
            }

            print(f"Avg Time: {avg_time:.4f}s ± {std_time:.4f}s")
            print(f"Throughput: {throughput:.2f} sequences/second")

        if torch.cuda.is_available():
            peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3)
            peak_memory_gb = torch.cuda.max_memory_allocated() / (1024 ** 3)
        else:
            peak_memory_gb = 0.0

        for batch_size in results:
            results[batch_size]["peak_memory_gb"] = peak_memory_gb

        return results

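For reference, torch.cuda.max_memory_allocated() reports the peak allocation in bytes since the last reset_peak_memory_stats() call, so the division by 1024**3 above converts it to GiB. A small sketch of that conversion, where the byte value is purely illustrative:

peak_bytes = 2_147_483_648          # e.g. a value returned by torch.cuda.max_memory_allocated()
peak_memory_gb = peak_bytes / (1024 ** 3)
print(f"Peak memory: {peak_memory_gb:.2f} GB")  # -> Peak memory: 2.00 GB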
def run_benchmark():
    """Main function to run the benchmark with optimized parameters."""
    config = BenchmarkConfig()

    try:
        benchmark = Benchmark(config)
        results = benchmark.run()

        max_throughput = max(results[batch_size]["throughput"] for batch_size in results)
        avg_throughput = np.mean([results[batch_size]["throughput"] for batch_size in results])

        return {
            "max_throughput": max_throughput,
            "avg_throughput": avg_throughput,
            "results": results,
            "results": results
        }

    except Exception as e:
        print(f"Benchmark failed: {e}")
        return {"max_throughput": 0.0, "avg_throughput": 0.0, "error": str(e)}

        return {
            "max_throughput": 0.0,
            "avg_throughput": 0.0,
            "error": str(e)
        }

def run_mlx_benchmark():
    """Run MLX-specific benchmark"""
    if not MLX_AVAILABLE:
        print("MLX not available, skipping MLX benchmark")
        return {"max_throughput": 0.0, "avg_throughput": 0.0, "error": "MLX not available"}

    config = BenchmarkConfig(model_path="mlx-community/all-MiniLM-L6-v2-4bit", use_mlx=True)

        return {
            "max_throughput": 0.0,
            "avg_throughput": 0.0,
            "error": "MLX not available"
        }

    config = BenchmarkConfig(
        model_path="mlx-community/all-MiniLM-L6-v2-4bit",
        use_mlx=True
    )

    try:
        benchmark = MLXBenchmark(config)
        results = benchmark.run()

        if not results:
            return {"max_throughput": 0.0, "avg_throughput": 0.0, "error": "No valid results"}

            return {
                "max_throughput": 0.0,
                "avg_throughput": 0.0,
                "error": "No valid results"
            }

        max_throughput = max(results[batch_size]["throughput"] for batch_size in results)
        avg_throughput = np.mean([results[batch_size]["throughput"] for batch_size in results])

        return {
            "max_throughput": max_throughput,
            "avg_throughput": avg_throughput,
            "results": results,
            "results": results
        }

    except Exception as e:
        print(f"MLX benchmark failed: {e}")
        return {"max_throughput": 0.0, "avg_throughput": 0.0, "error": str(e)}

        return {
            "max_throughput": 0.0,
            "avg_throughput": 0.0,
            "error": str(e)
        }

if __name__ == "__main__":
    print("=== PyTorch Benchmark ===")
    pytorch_result = run_benchmark()
    print(f"PyTorch Max throughput: {pytorch_result['max_throughput']:.2f} sequences/second")
    print(f"PyTorch Average throughput: {pytorch_result['avg_throughput']:.2f} sequences/second")

    print("\n=== MLX Benchmark ===")
    mlx_result = run_mlx_benchmark()
    print(f"MLX Max throughput: {mlx_result['max_throughput']:.2f} sequences/second")
    print(f"MLX Average throughput: {mlx_result['avg_throughput']:.2f} sequences/second")

    # Compare results
    if pytorch_result["max_throughput"] > 0 and mlx_result["max_throughput"] > 0:
        speedup = mlx_result["max_throughput"] / pytorch_result["max_throughput"]
        print("\n=== Comparison ===")
        print(f"MLX is {speedup:.2f}x {'faster' if speedup > 1 else 'slower'} than PyTorch")
    if pytorch_result['max_throughput'] > 0 and mlx_result['max_throughput'] > 0:
        speedup = mlx_result['max_throughput'] / pytorch_result['max_throughput']
        print(f"\n=== Comparison ===")
        print(f"MLX is {speedup:.2f}x {'faster' if speedup > 1 else 'slower'} than PyTorch")
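The reported numbers follow directly from the per-batch timings: throughput is batch_size / avg_time, and the final comparison divides the two peak throughputs. A worked example with made-up timings (all figures below are illustrative, not measured results):

batch_size, avg_time = 32, 0.5          # 32 sequences taking 0.5 s on average
throughput = batch_size / avg_time      # 64.0 sequences/second

pytorch_max, mlx_max = 64.0, 80.0       # hypothetical peak throughputs
speedup = mlx_max / pytorch_max         # 1.25 -> "MLX is 1.25x faster than PyTorch"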
157
uv.lock
generated
@@ -701,53 +701,6 @@ toml = [
    { name = "tomli", marker = "python_full_version <= '3.11'" },
]

[[package]]
|
||||
name = "cryptography"
|
||||
version = "45.0.5"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/95/1e/49527ac611af559665f71cbb8f92b332b5ec9c6fbc4e88b0f8e92f5e85df/cryptography-45.0.5.tar.gz", hash = "sha256:72e76caa004ab63accdf26023fccd1d087f6d90ec6048ff33ad0445abf7f605a", size = 744903 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f0/fb/09e28bc0c46d2c547085e60897fea96310574c70fb21cd58a730a45f3403/cryptography-45.0.5-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:101ee65078f6dd3e5a028d4f19c07ffa4dd22cce6a20eaa160f8b5219911e7d8", size = 7043092 },
|
||||
{ url = "https://files.pythonhosted.org/packages/b1/05/2194432935e29b91fb649f6149c1a4f9e6d3d9fc880919f4ad1bcc22641e/cryptography-45.0.5-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3a264aae5f7fbb089dbc01e0242d3b67dffe3e6292e1f5182122bdf58e65215d", size = 4205926 },
|
||||
{ url = "https://files.pythonhosted.org/packages/07/8b/9ef5da82350175e32de245646b1884fc01124f53eb31164c77f95a08d682/cryptography-45.0.5-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e74d30ec9c7cb2f404af331d5b4099a9b322a8a6b25c4632755c8757345baac5", size = 4429235 },
|
||||
{ url = "https://files.pythonhosted.org/packages/7c/e1/c809f398adde1994ee53438912192d92a1d0fc0f2d7582659d9ef4c28b0c/cryptography-45.0.5-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3af26738f2db354aafe492fb3869e955b12b2ef2e16908c8b9cb928128d42c57", size = 4209785 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d0/8b/07eb6bd5acff58406c5e806eff34a124936f41a4fb52909ffa4d00815f8c/cryptography-45.0.5-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e6c00130ed423201c5bc5544c23359141660b07999ad82e34e7bb8f882bb78e0", size = 3893050 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ec/ef/3333295ed58d900a13c92806b67e62f27876845a9a908c939f040887cca9/cryptography-45.0.5-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:dd420e577921c8c2d31289536c386aaa30140b473835e97f83bc71ea9d2baf2d", size = 4457379 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d9/9d/44080674dee514dbb82b21d6fa5d1055368f208304e2ab1828d85c9de8f4/cryptography-45.0.5-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:d05a38884db2ba215218745f0781775806bde4f32e07b135348355fe8e4991d9", size = 4209355 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c9/d8/0749f7d39f53f8258e5c18a93131919ac465ee1f9dccaf1b3f420235e0b5/cryptography-45.0.5-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:ad0caded895a00261a5b4aa9af828baede54638754b51955a0ac75576b831b27", size = 4456087 },
|
||||
{ url = "https://files.pythonhosted.org/packages/09/d7/92acac187387bf08902b0bf0699816f08553927bdd6ba3654da0010289b4/cryptography-45.0.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9024beb59aca9d31d36fcdc1604dd9bbeed0a55bface9f1908df19178e2f116e", size = 4332873 },
|
||||
{ url = "https://files.pythonhosted.org/packages/03/c2/840e0710da5106a7c3d4153c7215b2736151bba60bf4491bdb421df5056d/cryptography-45.0.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:91098f02ca81579c85f66df8a588c78f331ca19089763d733e34ad359f474174", size = 4564651 },
|
||||
{ url = "https://files.pythonhosted.org/packages/2e/92/cc723dd6d71e9747a887b94eb3827825c6c24b9e6ce2bb33b847d31d5eaa/cryptography-45.0.5-cp311-abi3-win32.whl", hash = "sha256:926c3ea71a6043921050eaa639137e13dbe7b4ab25800932a8498364fc1abec9", size = 2929050 },
|
||||
{ url = "https://files.pythonhosted.org/packages/1f/10/197da38a5911a48dd5389c043de4aec4b3c94cb836299b01253940788d78/cryptography-45.0.5-cp311-abi3-win_amd64.whl", hash = "sha256:b85980d1e345fe769cfc57c57db2b59cff5464ee0c045d52c0df087e926fbe63", size = 3403224 },
|
||||
{ url = "https://files.pythonhosted.org/packages/fe/2b/160ce8c2765e7a481ce57d55eba1546148583e7b6f85514472b1d151711d/cryptography-45.0.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:f3562c2f23c612f2e4a6964a61d942f891d29ee320edb62ff48ffb99f3de9ae8", size = 7017143 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c2/e7/2187be2f871c0221a81f55ee3105d3cf3e273c0a0853651d7011eada0d7e/cryptography-45.0.5-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3fcfbefc4a7f332dece7272a88e410f611e79458fab97b5efe14e54fe476f4fd", size = 4197780 },
|
||||
{ url = "https://files.pythonhosted.org/packages/b9/cf/84210c447c06104e6be9122661159ad4ce7a8190011669afceeaea150524/cryptography-45.0.5-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:460f8c39ba66af7db0545a8c6f2eabcbc5a5528fc1cf6c3fa9a1e44cec33385e", size = 4420091 },
|
||||
{ url = "https://files.pythonhosted.org/packages/3e/6a/cb8b5c8bb82fafffa23aeff8d3a39822593cee6e2f16c5ca5c2ecca344f7/cryptography-45.0.5-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:9b4cf6318915dccfe218e69bbec417fdd7c7185aa7aab139a2c0beb7468c89f0", size = 4198711 },
|
||||
{ url = "https://files.pythonhosted.org/packages/04/f7/36d2d69df69c94cbb2473871926daf0f01ad8e00fe3986ac3c1e8c4ca4b3/cryptography-45.0.5-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2089cc8f70a6e454601525e5bf2779e665d7865af002a5dec8d14e561002e135", size = 3883299 },
|
||||
{ url = "https://files.pythonhosted.org/packages/82/c7/f0ea40f016de72f81288e9fe8d1f6748036cb5ba6118774317a3ffc6022d/cryptography-45.0.5-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0027d566d65a38497bc37e0dd7c2f8ceda73597d2ac9ba93810204f56f52ebc7", size = 4450558 },
|
||||
{ url = "https://files.pythonhosted.org/packages/06/ae/94b504dc1a3cdf642d710407c62e86296f7da9e66f27ab12a1ee6fdf005b/cryptography-45.0.5-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:be97d3a19c16a9be00edf79dca949c8fa7eff621763666a145f9f9535a5d7f42", size = 4198020 },
|
||||
{ url = "https://files.pythonhosted.org/packages/05/2b/aaf0adb845d5dabb43480f18f7ca72e94f92c280aa983ddbd0bcd6ecd037/cryptography-45.0.5-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:7760c1c2e1a7084153a0f68fab76e754083b126a47d0117c9ed15e69e2103492", size = 4449759 },
|
||||
{ url = "https://files.pythonhosted.org/packages/91/e4/f17e02066de63e0100a3a01b56f8f1016973a1d67551beaf585157a86b3f/cryptography-45.0.5-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6ff8728d8d890b3dda5765276d1bc6fb099252915a2cd3aff960c4c195745dd0", size = 4319991 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f2/2e/e2dbd629481b499b14516eed933f3276eb3239f7cee2dcfa4ee6b44d4711/cryptography-45.0.5-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7259038202a47fdecee7e62e0fd0b0738b6daa335354396c6ddebdbe1206af2a", size = 4554189 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f8/ea/a78a0c38f4c8736287b71c2ea3799d173d5ce778c7d6e3c163a95a05ad2a/cryptography-45.0.5-cp37-abi3-win32.whl", hash = "sha256:1e1da5accc0c750056c556a93c3e9cb828970206c68867712ca5805e46dc806f", size = 2911769 },
|
||||
{ url = "https://files.pythonhosted.org/packages/79/b3/28ac139109d9005ad3f6b6f8976ffede6706a6478e21c889ce36c840918e/cryptography-45.0.5-cp37-abi3-win_amd64.whl", hash = "sha256:90cb0a7bb35959f37e23303b7eed0a32280510030daba3f7fdfbb65defde6a97", size = 3390016 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f8/8b/34394337abe4566848a2bd49b26bcd4b07fd466afd3e8cce4cb79a390869/cryptography-45.0.5-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:206210d03c1193f4e1ff681d22885181d47efa1ab3018766a7b32a7b3d6e6afd", size = 3575762 },
|
||||
{ url = "https://files.pythonhosted.org/packages/8b/5d/a19441c1e89afb0f173ac13178606ca6fab0d3bd3ebc29e9ed1318b507fc/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c648025b6840fe62e57107e0a25f604db740e728bd67da4f6f060f03017d5097", size = 4140906 },
|
||||
{ url = "https://files.pythonhosted.org/packages/4b/db/daceb259982a3c2da4e619f45b5bfdec0e922a23de213b2636e78ef0919b/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b8fa8b0a35a9982a3c60ec79905ba5bb090fc0b9addcfd3dc2dd04267e45f25e", size = 4374411 },
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/35/5d06ad06402fc522c8bf7eab73422d05e789b4e38fe3206a85e3d6966c11/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:14d96584701a887763384f3c47f0ca7c1cce322aa1c31172680eb596b890ec30", size = 4140942 },
|
||||
{ url = "https://files.pythonhosted.org/packages/65/79/020a5413347e44c382ef1f7f7e7a66817cd6273e3e6b5a72d18177b08b2f/cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:57c816dfbd1659a367831baca4b775b2a5b43c003daf52e9d57e1d30bc2e1b0e", size = 4374079 },
|
||||
{ url = "https://files.pythonhosted.org/packages/9b/c5/c0e07d84a9a2a8a0ed4f865e58f37c71af3eab7d5e094ff1b21f3f3af3bc/cryptography-45.0.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b9e38e0a83cd51e07f5a48ff9691cae95a79bea28fe4ded168a8e5c6c77e819d", size = 3321362 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/71/9bdbcfd58d6ff5084687fe722c58ac718ebedbc98b9f8f93781354e6d286/cryptography-45.0.5-pp311-pypy311_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8c4a6ff8a30e9e3d38ac0539e9a9e02540ab3f827a3394f8852432f6b0ea152e", size = 3587878 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f0/63/83516cfb87f4a8756eaa4203f93b283fda23d210fc14e1e594bd5f20edb6/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bd4c45986472694e5121084c6ebbd112aa919a25e783b87eb95953c9573906d6", size = 4152447 },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/11/d2823d2a5a0bd5802b3565437add16f5c8ce1f0778bf3822f89ad2740a38/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:982518cd64c54fcada9d7e5cf28eabd3ee76bd03ab18e08a48cad7e8b6f31b18", size = 4386778 },
|
||||
{ url = "https://files.pythonhosted.org/packages/5f/38/6bf177ca6bce4fe14704ab3e93627c5b0ca05242261a2e43ef3168472540/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:12e55281d993a793b0e883066f590c1ae1e802e3acb67f8b442e721e475e6463", size = 4151627 },
|
||||
{ url = "https://files.pythonhosted.org/packages/38/6a/69fc67e5266bff68a91bcb81dff8fb0aba4d79a78521a08812048913e16f/cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:5aa1e32983d4443e310f726ee4b071ab7569f58eedfdd65e9675484a4eb67bd1", size = 4385593 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f6/34/31a1604c9a9ade0fdab61eb48570e09a796f4d9836121266447b0eaf7feb/cryptography-45.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:e357286c1b76403dd384d938f93c46b2b058ed4dfcdce64a770f0537ed3feb6f", size = 3331106 },
|
||||
]

[[package]]
name = "cycler"
version = "0.12.1"
@@ -1847,25 +1800,23 @@ wheels = [

[[package]]
name = "leann-backend-diskann"
version = "0.1.13"
version = "0.1.8"
source = { editable = "packages/leann-backend-diskann" }
dependencies = [
    { name = "leann-core" },
    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
    { name = "numpy", version = "2.3.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
    { name = "protobuf" },
]

[package.metadata]
requires-dist = [
    { name = "leann-core", specifier = "==0.1.13" },
    { name = "leann-core", specifier = "==0.1.8" },
    { name = "numpy" },
    { name = "protobuf", specifier = ">=3.19.0" },
]

[[package]]
name = "leann-backend-hnsw"
version = "0.1.13"
version = "0.1.8"
source = { editable = "packages/leann-backend-hnsw" }
dependencies = [
    { name = "leann-core" },
@@ -1877,7 +1828,7 @@ dependencies = [

[package.metadata]
requires-dist = [
    { name = "leann-core", specifier = "==0.1.13" },
    { name = "leann-core", specifier = "==0.1.8" },
    { name = "msgpack", specifier = ">=1.0.0" },
    { name = "numpy" },
    { name = "pyzmq", specifier = ">=23.0.0" },
@@ -1885,61 +1836,33 @@ requires-dist = [

[[package]]
name = "leann-core"
version = "0.1.13"
version = "0.1.8"
source = { editable = "packages/leann-core" }
dependencies = [
    { name = "accelerate" },
    { name = "huggingface-hub" },
    { name = "llama-index-core" },
    { name = "llama-index-embeddings-huggingface" },
    { name = "llama-index-readers-file" },
    { name = "mlx", marker = "sys_platform == 'darwin'" },
    { name = "mlx-lm", marker = "sys_platform == 'darwin'" },
    { name = "msgpack" },
    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
    { name = "numpy", version = "2.3.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
    { name = "openai" },
    { name = "pdfplumber" },
    { name = "psutil" },
    { name = "pymupdf" },
    { name = "pypdf2" },
    { name = "python-dotenv" },
    { name = "pyzmq" },
    { name = "requests" },
    { name = "sentence-transformers" },
    { name = "torch" },
    { name = "tqdm" },
    { name = "transformers" },
]

[package.metadata]
requires-dist = [
    { name = "accelerate", specifier = ">=0.20.0" },
    { name = "accelerate", marker = "extra == 'colab'", specifier = ">=0.20.0,<1.0.0" },
    { name = "huggingface-hub", specifier = ">=0.20.0" },
    { name = "llama-index-core", specifier = ">=0.12.0" },
    { name = "llama-index-embeddings-huggingface", specifier = ">=0.5.5" },
    { name = "llama-index-readers-file", specifier = ">=0.4.0" },
    { name = "mlx", marker = "sys_platform == 'darwin'", specifier = ">=0.26.3" },
    { name = "mlx-lm", marker = "sys_platform == 'darwin'", specifier = ">=0.26.0" },
    { name = "msgpack", specifier = ">=1.0.0" },
    { name = "numpy", specifier = ">=1.20.0" },
    { name = "openai", specifier = ">=1.0.0" },
    { name = "pdfplumber", specifier = ">=0.10.0" },
    { name = "psutil", specifier = ">=5.8.0" },
    { name = "pymupdf", specifier = ">=1.23.0" },
    { name = "pypdf2", specifier = ">=3.0.0" },
    { name = "python-dotenv", specifier = ">=1.0.0" },
    { name = "pyzmq", specifier = ">=23.0.0" },
    { name = "requests", specifier = ">=2.25.0" },
    { name = "sentence-transformers", specifier = ">=2.2.0" },
    { name = "torch", specifier = ">=2.0.0" },
    { name = "torch", marker = "extra == 'colab'", specifier = ">=2.0.0,<3.0.0" },
    { name = "tqdm", specifier = ">=4.60.0" },
    { name = "transformers", specifier = ">=4.30.0" },
    { name = "transformers", marker = "extra == 'colab'", specifier = ">=4.30.0,<5.0.0" },
]
provides-extras = ["colab"]

[[package]]
name = "leann-workspace"
@@ -1959,7 +1882,6 @@ dependencies = [
    { name = "llama-index-embeddings-huggingface" },
    { name = "llama-index-node-parser-docling" },
    { name = "llama-index-readers-docling" },
    { name = "llama-index-readers-file" },
    { name = "llama-index-vector-stores-faiss" },
    { name = "mlx", marker = "sys_platform == 'darwin'" },
    { name = "mlx-lm", marker = "sys_platform == 'darwin'" },
@@ -1968,12 +1890,9 @@ dependencies = [
    { name = "numpy", version = "2.3.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
    { name = "ollama" },
    { name = "openai" },
    { name = "pdfplumber" },
    { name = "protobuf" },
    { name = "psutil" },
    { name = "pymupdf" },
    { name = "pypdf2" },
    { name = "pypdfium2" },
    { name = "requests" },
    { name = "sentence-transformers" },
    { name = "sglang" },
@@ -1993,21 +1912,9 @@ dev = [
diskann = [
    { name = "leann-backend-diskann" },
]
documents = [
    { name = "beautifulsoup4" },
    { name = "openpyxl" },
    { name = "pandas" },
    { name = "python-docx" },
]

[package.dev-dependencies]
dev = [
    { name = "ruff" },
]

[package.metadata]
requires-dist = [
    { name = "beautifulsoup4", marker = "extra == 'documents'", specifier = ">=4.13.0" },
    { name = "black", marker = "extra == 'dev'", specifier = ">=23.0" },
    { name = "boto3" },
    { name = "colorama" },
@@ -2024,7 +1931,6 @@ requires-dist = [
    { name = "llama-index-embeddings-huggingface", specifier = ">=0.5.5" },
    { name = "llama-index-node-parser-docling" },
    { name = "llama-index-readers-docling" },
    { name = "llama-index-readers-file", specifier = ">=0.4.0" },
    { name = "llama-index-vector-stores-faiss", specifier = ">=0.4.0" },
    { name = "matplotlib", marker = "extra == 'dev'" },
    { name = "mlx", marker = "sys_platform == 'darwin'", specifier = ">=0.26.3" },
@@ -2033,17 +1939,11 @@ requires-dist = [
    { name = "numpy", specifier = ">=1.26.0" },
    { name = "ollama" },
    { name = "openai", specifier = ">=1.0.0" },
    { name = "openpyxl", marker = "extra == 'documents'", specifier = ">=3.1.0" },
    { name = "pandas", marker = "extra == 'documents'", specifier = ">=2.2.0" },
    { name = "pdfplumber", specifier = ">=0.11.0" },
    { name = "protobuf", specifier = "==4.25.3" },
    { name = "psutil", specifier = ">=5.8.0" },
    { name = "pymupdf", specifier = ">=1.26.0" },
    { name = "pypdf2", specifier = ">=3.0.0" },
    { name = "pypdfium2", specifier = ">=4.30.0" },
    { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" },
    { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" },
    { name = "python-docx", marker = "extra == 'documents'", specifier = ">=0.8.11" },
    { name = "requests", specifier = ">=2.25.0" },
    { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" },
    { name = "sentence-transformers", specifier = ">=2.2.0" },
@@ -2051,10 +1951,7 @@ requires-dist = [
    { name = "torch" },
    { name = "tqdm" },
]
provides-extras = ["dev", "diskann", "documents"]

[package.metadata.requires-dev]
dev = [{ name = "ruff", specifier = ">=0.12.4" }]
provides-extras = ["dev", "diskann"]

[[package]]
name = "llama-cloud"
@@ -3366,33 +3263,6 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pdfminer-six"
|
||||
version = "20250506"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "charset-normalizer" },
|
||||
{ name = "cryptography" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/78/46/5223d613ac4963e1f7c07b2660fe0e9e770102ec6bda8c038400113fb215/pdfminer_six-20250506.tar.gz", hash = "sha256:b03cc8df09cf3c7aba8246deae52e0bca7ebb112a38895b5e1d4f5dd2b8ca2e7", size = 7387678 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/73/16/7a432c0101fa87457e75cb12c879e1749c5870a786525e2e0f42871d6462/pdfminer_six-20250506-py3-none-any.whl", hash = "sha256:d81ad173f62e5f841b53a8ba63af1a4a355933cfc0ffabd608e568b9193909e3", size = 5620187 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pdfplumber"
|
||||
version = "0.11.7"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pdfminer-six" },
|
||||
{ name = "pillow" },
|
||||
{ name = "pypdfium2" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/6d/0d/4135821aa7b1a0b77a29fac881ef0890b46b0b002290d04915ed7acc0043/pdfplumber-0.11.7.tar.gz", hash = "sha256:fa67773e5e599de1624255e9b75d1409297c5e1d7493b386ce63648637c67368", size = 115518 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/db/e0/52b67d4f00e09e497aec4f71bc44d395605e8ebcea52543242ed34c25ef9/pdfplumber-0.11.7-py3-none-any.whl", hash = "sha256:edd2195cca68bd770da479cf528a737e362968ec2351e62a6c0b71ff612ac25e", size = 60029 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pexpect"
|
||||
version = "4.9.0"
|
||||
@@ -3888,21 +3758,6 @@ version = "2.10"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c00119d0351b7a2531d2ebddb51833a36448fc7b862be60/pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3", size = 162597 }
|
||||
|
||||
[[package]]
|
||||
name = "pymupdf"
|
||||
version = "1.26.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/6d/d4/70a265e4bcd43e97480ae62da69396ef4507c8f9cfd179005ee731c92a04/pymupdf-1.26.3.tar.gz", hash = "sha256:b7d2c3ffa9870e1e4416d18862f5ccd356af5fe337b4511093bbbce2ca73b7e5", size = 75990308 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/70/d3/c7af70545cd3097a869fd635bb6222108d3a0fb28c0b8254754a126c4cbb/pymupdf-1.26.3-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ded891963944e5f13b03b88f6d9e982e816a4ec8689fe360876eef000c161f2b", size = 23057205 },
|
||||
{ url = "https://files.pythonhosted.org/packages/04/3d/ec5b69bfeaa5deefa7141fc0b20d77bb20404507cf17196b4eb59f1f2977/pymupdf-1.26.3-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:436a33c738bb10eadf00395d18a6992b801ffb26521ee1f361ae786dd283327a", size = 22406630 },
|
||||
{ url = "https://files.pythonhosted.org/packages/fc/20/661d3894bb05ad75ed6ca103ee2c3fa44d88a458b5c8d4a946b9c0f2569b/pymupdf-1.26.3-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:a2d7a3cd442f12f05103cb3bb1415111517f0a97162547a3720f3bbbc5e0b51c", size = 23450287 },
|
||||
{ url = "https://files.pythonhosted.org/packages/9c/7f/21828f018e65b16a033731d21f7b46d93fa81c6e8257f769ca4a1c2a1cb0/pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:454f38c8cf07eb333eb4646dca10517b6e90f57ce2daa2265a78064109d85555", size = 24057319 },
|
||||
{ url = "https://files.pythonhosted.org/packages/71/5d/e8f88cd5a45b8f5fa6590ce8cef3ce0fad30eac6aac8aea12406f95bee7d/pymupdf-1.26.3-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:759b75d2f710ff4edf8d097d2e98f60e9ecef47632cead6f949b3412facdb9f0", size = 24261350 },
|
||||
{ url = "https://files.pythonhosted.org/packages/82/22/ecc560e4f281b5dffafbf3a81f023d268b1746d028044f495115b74a2e70/pymupdf-1.26.3-cp39-abi3-win32.whl", hash = "sha256:a839ed44742faa1cd4956bb18068fe5aae435d67ce915e901318646c4e7bbea6", size = 17116371 },
|
||||
{ url = "https://files.pythonhosted.org/packages/4a/26/8c72973b8833a72785cedc3981eb59b8ac7075942718bbb7b69b352cdde4/pymupdf-1.26.3-cp39-abi3-win_amd64.whl", hash = "sha256:b4cd5124d05737944636cf45fc37ce5824f10e707b0342efe109c7b6bd37a9cc", size = 18735124 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyparsing"
|
||||
version = "3.2.3"
|
||||
|
||||