all_code_base / upload.py
aryadomain's picture
Add files using upload-large-folder tool
9ebbe39 verified
#!/usr/bin/env python3
"""Upload this codebase to Hugging Face Hub while excluding the .back folder.

Examples:
    /g/data/rr81/aev/bin/python upload.py
    /g/data/rr81/aev/bin/python upload.py --repo-id aryadomain/all_code_base
    /g/data/rr81/aev/bin/python upload.py --method large
    /g/data/rr81/aev/bin/python upload.py --repo-id my-user/all_code_base
"""
from __future__ import annotations
import argparse
import inspect
from pathlib import Path
from typing import List
from huggingface_hub import HfApi
def build_ignore_patterns(extra_ignore: List[str]) -> List[str]:
    """Return the ignore patterns passed to the upload call.

    The result always starts with the built-in patterns for the ``.git``
    metadata folder and the ``.back`` backup folder, followed by the entries
    of ``extra_ignore`` in the order given. ``extra_ignore`` is not modified.
    """
    always_ignored = [".git", ".git/**", ".back", ".back/**", "**/.back/**"]
    return [*always_ignored, *extra_ignore]
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the uploader.

    Returns:
        The parsed namespace. ``repo_name`` is always a string. ``repo_id`` is
        the explicit ``--repo-id`` value, or the built-in default repo id when
        none of ``--repo-id``, ``--repo-name`` and ``--namespace`` was given,
        or ``None`` when only ``--repo-name`` and/or ``--namespace`` was given
        (so the caller can build the id from those parts).
    """
    default_repo_id = "aryadomain/all_code_base"
    default_repo_name = "all_code_base"

    parser = argparse.ArgumentParser(
        description="Upload a folder to Hugging Face Hub, excluding .back."
    )
    parser.add_argument(
        "--source-dir",
        type=str,
        default=".",
        help="Local folder to upload (default: current directory).",
    )
    # --repo-name and --repo-id default to None so that "not given" can be
    # told apart from "given"; the effective defaults are applied below.
    parser.add_argument(
        "--repo-name",
        type=str,
        default=None,
        help=(
            "Repo name used when --repo-id is not provided "
            f"(default: {default_repo_name})."
        ),
    )
    parser.add_argument(
        "--repo-id",
        type=str,
        default=None,
        help=(
            "Full Hugging Face repo id like user_or_org/repo_name "
            f"(default: {default_repo_id} unless --repo-name or --namespace is given)."
        ),
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default=None,
        help="Optional user/org namespace override when using --repo-name.",
    )
    parser.add_argument(
        "--repo-type",
        type=str,
        default="model",
        choices=["model", "dataset", "space"],
        help="Hub repo type.",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Create the repo as private.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default="main",
        help="Target branch/revision (default: main).",
    )
    parser.add_argument(
        "--commit-message",
        type=str,
        default="Upload codebase excluding .back",
        help="Commit message for upload.",
    )
    parser.add_argument(
        "--extra-ignore",
        nargs="*",
        default=[],
        help="Additional ignore patterns for upload_folder.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print resolved settings and exit without uploading.",
    )
    parser.add_argument(
        "--method",
        type=str,
        default="large",
        choices=["auto", "folder", "large"],
        help=(
            "Upload method: auto chooses large upload for big folders, "
            "folder forces upload_folder, large forces upload_large_folder compatibility path."
        ),
    )
    parser.add_argument(
        "--large-threshold-gb",
        type=float,
        default=10.0,
        help="In auto mode, switch to large upload when folder size exceeds this many GB.",
    )

    args = parser.parse_args()

    repo_name_given = args.repo_name is not None
    if not repo_name_given:
        args.repo_name = default_repo_name

    # A hard-wired --repo-id default would always win over --repo-name and
    # --namespace, turning them into no-ops. Fall back to the default repo id
    # only when the user specified none of the three options.
    if args.repo_id is None and args.namespace is None and not repo_name_given:
        args.repo_id = default_repo_id

    return args
def resolve_repo_id(api: HfApi, args: argparse.Namespace) -> str:
    """Work out the full ``owner/name`` repo id to upload to.

    Precedence: a non-empty ``args.repo_id`` is returned as is; otherwise the
    owner is ``args.namespace`` or, failing that, the ``"name"`` entry of
    ``api.whoami()``, and it is joined with ``args.repo_name``.

    Raises:
        RuntimeError: if neither the arguments nor ``api.whoami()`` provide
            an owner name.
    """
    explicit_id = args.repo_id
    if explicit_id:
        return explicit_id

    owner = args.namespace
    if not owner:
        # No namespace supplied: ask the API who is logged in.
        owner = api.whoami().get("name")
        if not owner:
            raise RuntimeError("Could not resolve username from Hugging Face login.")
    return f"{owner}/{args.repo_name}"
def folder_size_bytes(folder: Path) -> int:
    """Return the total size in bytes of the files under ``folder``.

    Files located inside a ``.back`` directory below ``folder`` are not
    counted. The scan is best-effort: entries that cannot be inspected
    (``OSError``) are skipped rather than aborting the scan.

    Args:
        folder: Root directory to scan recursively.

    Returns:
        Sum of ``st_size`` over all counted regular files; 0 if there are none.
    """
    total = 0
    for entry in folder.rglob("*"):
        # Only components *below* `folder` decide exclusion. Testing the full
        # path would drop every file when `folder` itself happens to live
        # under a directory named ".back".
        if ".back" in entry.relative_to(folder).parts:
            continue
        try:
            # is_file() can raise too (e.g. permission errors), so it is
            # guarded together with stat().
            if entry.is_file():
                total += entry.stat().st_size
        except OSError:
            continue
    return total
def upload_folder_compat(
    api: HfApi,
    repo_id: str,
    repo_type: str,
    source_dir: Path,
    revision: str,
    commit_message: str,
    ignore_patterns: List[str],
):
    """Call ``api.upload_folder`` with only the options it supports.

    The folder is uploaded to the repo root (``path_in_repo="."``). The
    ``multi_commits`` and ``multi_commits_verbose`` flags are switched on only
    if the signature of ``api.upload_folder`` declares them.

    Returns:
        Whatever ``api.upload_folder`` returns.
    """
    supported = inspect.signature(api.upload_folder).parameters

    call_args = dict(
        repo_id=repo_id,
        repo_type=repo_type,
        folder_path=str(source_dir),
        path_in_repo=".",
        revision=revision,
        commit_message=commit_message,
        ignore_patterns=ignore_patterns,
    )
    # Optional flags: pass each one only when the signature declares it.
    for flag in ("multi_commits", "multi_commits_verbose"):
        if flag in supported:
            call_args[flag] = True

    return api.upload_folder(**call_args)
def upload_large_compat(
    api: HfApi,
    repo_id: str,
    repo_type: str,
    source_dir: Path,
    revision: str,
    ignore_patterns: List[str],
):
    """Call ``api.upload_large_folder`` with only the options it supports.

    ``repo_id``, ``repo_type`` and ``folder_path`` are always passed;
    ``revision``, ``path_in_repo`` (fixed to ``"."``) and ``ignore_patterns``
    are passed only if the method's signature declares them.

    Returns:
        Whatever ``api.upload_large_folder`` returns.

    Raises:
        RuntimeError: if ``api`` has no ``upload_large_folder`` attribute.
    """
    if not hasattr(api, "upload_large_folder"):
        raise RuntimeError("Installed huggingface_hub does not provide upload_large_folder")

    accepted = inspect.signature(api.upload_large_folder).parameters

    # Candidates for optional keywords, in the order they are added.
    optional = {
        "revision": revision,
        "path_in_repo": ".",
        "ignore_patterns": ignore_patterns,
    }
    call_args = {
        "repo_id": repo_id,
        "repo_type": repo_type,
        "folder_path": str(source_dir),
    }
    call_args.update(
        {key: value for key, value in optional.items() if key in accepted}
    )
    return api.upload_large_folder(**call_args)
def main() -> None:
    """Run the uploader: parse options, report the plan, create the repo, upload.

    Steps, in order:
      1. Parse arguments and check that the source directory exists.
      2. Resolve the repo id and the ignore patterns.
      3. Choose the upload method; in ``auto`` mode this compares the measured
         folder size with ``--large-threshold-gb``.
      4. Print the resolved settings; stop here when ``--dry-run`` is set.
      5. Create the repo (``exist_ok=True``) and upload. If the large-upload
         path raises, fall back to the folder upload.

    Raises:
        FileNotFoundError: if the source directory is missing or not a directory.
    """
    args = parse_args()

    source_dir = Path(args.source_dir).resolve()
    if not (source_dir.exists() and source_dir.is_dir()):
        raise FileNotFoundError(f"Source directory not found or not a directory: {source_dir}")

    api = HfApi()
    repo_id = resolve_repo_id(api, args)
    ignore_patterns = build_ignore_patterns(args.extra_ignore)

    # The folder is scanned only in auto mode; the other modes force the
    # method directly.
    if args.method == "auto":
        total_size_gb = folder_size_bytes(source_dir) / (1024 ** 3)
        use_large = total_size_gb >= args.large_threshold_gb
    else:
        total_size_gb = None
        use_large = args.method == "large"

    print("Source directory:", source_dir)
    print("Repo id:", repo_id)
    print("Repo type:", args.repo_type)
    print("Private:", args.private)
    print("Revision:", args.revision)
    print("Ignore patterns:", ignore_patterns)
    if total_size_gb is not None:
        print(f"Folder size (excluding .back): {total_size_gb:.2f} GB")
    else:
        print("Folder size scan: skipped (set --method auto to enable size-based selection)")
    print("Upload method:", "large" if use_large else "folder")

    if args.dry_run:
        print("Dry run requested. Exiting before create/upload.")
        return

    api.create_repo(
        repo_id=repo_id,
        repo_type=args.repo_type,
        private=args.private,
        exist_ok=True,
    )

    def upload_as_folder():
        # Shared by the direct folder path and the fallback path.
        return upload_folder_compat(
            api=api,
            repo_id=repo_id,
            repo_type=args.repo_type,
            source_dir=source_dir,
            revision=args.revision,
            commit_message=args.commit_message,
            ignore_patterns=ignore_patterns,
        )

    if not use_large:
        commit_info = upload_as_folder()
    else:
        try:
            commit_info = upload_large_compat(
                api=api,
                repo_id=repo_id,
                repo_type=args.repo_type,
                source_dir=source_dir,
                revision=args.revision,
                ignore_patterns=ignore_patterns,
            )
        except Exception as err:
            # Best-effort: any failure of the large path triggers the fallback.
            print(f"Large upload path failed ({err}). Falling back to upload_folder with multi-commit mode.")
            commit_info = upload_as_folder()

    print("Upload completed.")
    print("Commit:", commit_info)
    print("Repo URL: https://huggingface.co/" + repo_id)
# Run the uploader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()