fix readme
This commit is contained in:
121
scripts/hf_upload.py
Normal file
121
scripts/hf_upload.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Upload local evaluation data to Hugging Face Hub, excluding diskann_rpj_wiki.
|
||||
|
||||
Defaults:
|
||||
- repo_id: LEANN-RAG/leann-rag-evaluation-data (dataset)
|
||||
- folder_path: benchmarks/data
|
||||
- ignore_patterns: diskann_rpj_wiki/** and .cache/**
|
||||
|
||||
Requires authentication via `huggingface-cli login` or HF_TOKEN env var.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
try:
|
||||
from huggingface_hub import HfApi
|
||||
except Exception as e:
|
||||
raise SystemExit(
|
||||
"huggingface_hub is required. Install with: pip install huggingface_hub hf_transfer"
|
||||
) from e
|
||||
|
||||
|
||||
def _enable_transfer_accel_if_available() -> None:
|
||||
"""Best-effort enabling of accelerated transfers across hub versions.
|
||||
|
||||
Tries the public util if present; otherwise, falls back to env flag when
|
||||
hf_transfer is installed. Silently no-ops if unavailable.
|
||||
"""
|
||||
try:
|
||||
# Newer huggingface_hub exposes this under utils
|
||||
from huggingface_hub.utils import hf_hub_enable_hf_transfer # type: ignore
|
||||
|
||||
hf_hub_enable_hf_transfer()
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
# If hf_transfer is installed, set env flag recognized by the hub
|
||||
import hf_transfer # noqa: F401
|
||||
|
||||
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
|
||||
except Exception:
|
||||
# Acceleration not available; proceed without it
|
||||
pass
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
p = argparse.ArgumentParser(description="Upload local data to HF, excluding diskann_rpj_wiki")
|
||||
p.add_argument(
|
||||
"--repo-id",
|
||||
default="LEANN-RAG/leann-rag-evaluation-data",
|
||||
help="Target dataset repo id (namespace/name)",
|
||||
)
|
||||
p.add_argument(
|
||||
"--folder-path",
|
||||
default="benchmarks/data",
|
||||
help="Local folder to upload (default: benchmarks/data)",
|
||||
)
|
||||
p.add_argument(
|
||||
"--ignore",
|
||||
default=["diskann_rpj_wiki/**", ".cache/**"],
|
||||
nargs="+",
|
||||
help="Glob patterns to ignore (space-separated)",
|
||||
)
|
||||
p.add_argument(
|
||||
"--allow",
|
||||
default=["**"],
|
||||
nargs="+",
|
||||
help="Glob patterns to allow (space-separated). Defaults to everything.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--message",
|
||||
default="sync local data (exclude diskann_rpj_wiki)",
|
||||
help="Commit message",
|
||||
)
|
||||
p.add_argument(
|
||||
"--no-transfer-accel",
|
||||
action="store_true",
|
||||
help="Disable hf_transfer accelerated uploads",
|
||||
)
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
if not args.no_transfer_accel:
|
||||
_enable_transfer_accel_if_available()
|
||||
|
||||
if not os.path.isdir(args.folder_path):
|
||||
raise SystemExit(f"Folder not found: {args.folder_path}")
|
||||
|
||||
print("Uploading to Hugging Face Hub:")
|
||||
print(f" repo_id: {args.repo_id}")
|
||||
print(" repo_type: dataset")
|
||||
print(f" folder_path: {args.folder_path}")
|
||||
print(f" allow_patterns: {args.allow}")
|
||||
print(f" ignore_patterns:{args.ignore}")
|
||||
|
||||
api = HfApi()
|
||||
|
||||
# Perform upload. This skips unchanged files by content hash.
|
||||
api.upload_folder(
|
||||
repo_id=args.repo_id,
|
||||
repo_type="dataset",
|
||||
folder_path=args.folder_path,
|
||||
path_in_repo=".",
|
||||
allow_patterns=args.allow,
|
||||
ignore_patterns=args.ignore,
|
||||
commit_message=args.message,
|
||||
)
|
||||
|
||||
print("Upload completed (unchanged files were skipped by the Hub).")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user