* feat: finance bench * docs: results * chore: ignroe data README * feat: fix financebench * feat: laion, also required idmaps support * style: format * style: format * fix: resolve ruff linting errors - Remove unused variables in benchmark scripts - Rename unused loop variables to follow convention * feat: enron email bench * experiments for running DiskANN & BM25 on Arch 4090 * style: format * chore(ci): remove paru-bin submodule and config to fix checkout --recurse-submodules * docs: data * docs: data updated * fix: as package * fix(ci): only run pre-commit * chore: use http url of astchunk; use group for some dev deps * fix(ci): should checkout modules as well since `uv sync` checks * fix(ci): run with lint only * fix: find links to install wheels available * CI: force local wheels in uv install step * CI: install local wheels via file paths * CI: pick wheels matching current Python tag * CI: handle python tag mismatches for local wheels * CI: use matrix python venv and set macOS deployment target * CI: revert install step to match main * CI: use uv group install with local wheel selection * CI: rely on setup-uv for Python and tighten group install * CI: install build deps with uv python interpreter * CI: use temporary uv venv for build deps * CI: add build venv scripts path for wheel repair
122 lines
3.4 KiB
Python
122 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Upload local evaluation data to Hugging Face Hub, excluding diskann_rpj_wiki.
|
|
|
|
Defaults:
|
|
- repo_id: LEANN-RAG/leann-rag-evaluation-data (dataset)
|
|
- folder_path: benchmarks/data
|
|
- ignore_patterns: diskann_rpj_wiki/** and .cache/**
|
|
|
|
Requires authentication via `huggingface-cli login` or HF_TOKEN env var.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
|
|
try:
|
|
from huggingface_hub import HfApi
|
|
except Exception as e:
|
|
raise SystemExit(
|
|
"huggingface_hub is required. Install with: pip install huggingface_hub hf_transfer"
|
|
) from e
|
|
|
|
|
|
def _enable_transfer_accel_if_available() -> None:
    """Turn on accelerated Hub transfers when the environment supports it.

    Two strategies are attempted in order, and every failure is swallowed on
    purpose so that the upload can still proceed without acceleration:

    1. Import ``hf_hub_enable_hf_transfer`` from ``huggingface_hub.utils`` and
       call it. If that succeeds, nothing else is done.
    2. Otherwise, import ``hf_transfer`` and, if it is importable, set the
       ``HF_HUB_ENABLE_HF_TRANSFER`` environment variable to ``"1"`` unless
       the variable already has a value.
    """
    # Strategy 1: helper that some huggingface_hub releases may expose.
    try:
        from huggingface_hub.utils import (  # type: ignore
            hf_hub_enable_hf_transfer as _switch_on,
        )

        _switch_on()
    except Exception:
        # Helper missing or failed; fall through to the env-var route.
        pass
    else:
        return

    # Strategy 2: only flip the env flag when hf_transfer is importable.
    try:
        import hf_transfer  # noqa: F401

        # setdefault keeps any value the user exported explicitly.
        os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
    except Exception:
        # Acceleration is unavailable; continue with regular transfers.
        return
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
|
|
p = argparse.ArgumentParser(description="Upload local data to HF, excluding diskann_rpj_wiki")
|
|
p.add_argument(
|
|
"--repo-id",
|
|
default="LEANN-RAG/leann-rag-evaluation-data",
|
|
help="Target dataset repo id (namespace/name)",
|
|
)
|
|
p.add_argument(
|
|
"--folder-path",
|
|
default="benchmarks/data",
|
|
help="Local folder to upload (default: benchmarks/data)",
|
|
)
|
|
p.add_argument(
|
|
"--ignore",
|
|
default=["diskann_rpj_wiki/**", ".cache/**"],
|
|
nargs="+",
|
|
help="Glob patterns to ignore (space-separated)",
|
|
)
|
|
p.add_argument(
|
|
"--allow",
|
|
default=["**"],
|
|
nargs="+",
|
|
help="Glob patterns to allow (space-separated). Defaults to everything.",
|
|
)
|
|
p.add_argument(
|
|
"--message",
|
|
default="sync local data (exclude diskann_rpj_wiki)",
|
|
help="Commit message",
|
|
)
|
|
p.add_argument(
|
|
"--no-transfer-accel",
|
|
action="store_true",
|
|
help="Disable hf_transfer accelerated uploads",
|
|
)
|
|
return p.parse_args()
|
|
|
|
|
|
def main() -> None:
    """Upload the configured local folder to a Hugging Face dataset repo.

    Steps, in order: parse the CLI options, optionally enable accelerated
    transfers, verify the source folder exists, print a summary of the
    upload settings, and call ``HfApi.upload_folder``.

    Raises:
        SystemExit: If the folder given by ``--folder-path`` is not a
            directory.
    """
    opts = parse_args()

    # Acceleration is opt-out: skipped only when --no-transfer-accel is given.
    if not opts.no_transfer_accel:
        _enable_transfer_accel_if_available()

    source_dir = opts.folder_path
    if not os.path.isdir(source_dir):
        raise SystemExit(f"Folder not found: {source_dir}")

    # Echo the effective settings before starting the upload.
    summary = (
        "Uploading to Hugging Face Hub:",
        f" repo_id: {opts.repo_id}",
        " repo_type: dataset",
        f" folder_path: {source_dir}",
        f" allow_patterns: {opts.allow}",
        f" ignore_patterns:{opts.ignore}",
    )
    print("\n".join(summary))

    upload_kwargs = {
        "repo_id": opts.repo_id,
        "repo_type": "dataset",
        "folder_path": source_dir,
        "path_in_repo": ".",
        "allow_patterns": opts.allow,
        "ignore_patterns": opts.ignore,
        "commit_message": opts.message,
    }

    # Per the original author's note, the Hub skips files whose content is
    # unchanged.
    HfApi().upload_folder(**upload_kwargs)

    print("Upload completed (unchanged files were skipped by the Hub).")
|
|
|
|
|
|
# Run the upload only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|