#!/usr/bin/env python3 """Upload this codebase to Hugging Face Hub while excluding the .back folder. Examples: /g/data/rr81/aev/bin/python upload.py /g/data/rr81/aev/bin/python upload.py --repo-id aryadomain/all_code_base /g/data/rr81/aev/bin/python upload.py --method large /g/data/rr81/aev/bin/python upload.py --repo-id my-user/all_code_base """ from __future__ import annotations import argparse import inspect from pathlib import Path from typing import List from huggingface_hub import HfApi def build_ignore_patterns(extra_ignore: List[str]) -> List[str]: # Always exclude local metadata and the requested backup folder. patterns = [ ".git", ".git/**", ".back", ".back/**", "**/.back/**", ] patterns.extend(extra_ignore) return patterns def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Upload a folder to Hugging Face Hub, excluding .back." ) parser.add_argument( "--source-dir", type=str, default=".", help="Local folder to upload (default: current directory).", ) parser.add_argument( "--repo-name", type=str, default="all_code_base", help="Repo name used when --repo-id is not provided.", ) parser.add_argument( "--repo-id", type=str, default="aryadomain/all_code_base", help="Full Hugging Face repo id like user_or_org/repo_name (default: aryadomain/all_code_base).", ) parser.add_argument( "--namespace", type=str, default=None, help="Optional user/org namespace override when using --repo-name.", ) parser.add_argument( "--repo-type", type=str, default="model", choices=["model", "dataset", "space"], help="Hub repo type.", ) parser.add_argument( "--private", action="store_true", help="Create the repo as private.", ) parser.add_argument( "--revision", type=str, default="main", help="Target branch/revision (default: main).", ) parser.add_argument( "--commit-message", type=str, default="Upload codebase excluding .back", help="Commit message for upload.", ) parser.add_argument( "--extra-ignore", nargs="*", default=[], help="Additional ignore patterns for upload_folder.", ) parser.add_argument( "--dry-run", action="store_true", help="Print resolved settings and exit without uploading.", ) parser.add_argument( "--method", type=str, default="large", choices=["auto", "folder", "large"], help=( "Upload method: auto chooses large upload for big folders, " "folder forces upload_folder, large forces upload_large_folder compatibility path." ), ) parser.add_argument( "--large-threshold-gb", type=float, default=10.0, help="In auto mode, switch to large upload when folder size exceeds this many GB.", ) return parser.parse_args() def resolve_repo_id(api: HfApi, args: argparse.Namespace) -> str: if args.repo_id: return args.repo_id if args.namespace: return f"{args.namespace}/{args.repo_name}" who = api.whoami() username = who.get("name") if not username: raise RuntimeError("Could not resolve username from Hugging Face login.") return f"{username}/{args.repo_name}" def folder_size_bytes(folder: Path) -> int: total = 0 for p in folder.rglob("*"): if p.is_file() and ".back" not in p.parts: try: total += p.stat().st_size except OSError: pass return total def upload_folder_compat( api: HfApi, repo_id: str, repo_type: str, source_dir: Path, revision: str, commit_message: str, ignore_patterns: List[str], ): kwargs = { "repo_id": repo_id, "repo_type": repo_type, "folder_path": str(source_dir), "path_in_repo": ".", "revision": revision, "commit_message": commit_message, "ignore_patterns": ignore_patterns, } sig = inspect.signature(api.upload_folder) if "multi_commits" in sig.parameters: kwargs["multi_commits"] = True if "multi_commits_verbose" in sig.parameters: kwargs["multi_commits_verbose"] = True return api.upload_folder(**kwargs) def upload_large_compat( api: HfApi, repo_id: str, repo_type: str, source_dir: Path, revision: str, ignore_patterns: List[str], ): if not hasattr(api, "upload_large_folder"): raise RuntimeError("Installed huggingface_hub does not provide upload_large_folder") sig = inspect.signature(api.upload_large_folder) kwargs = { "repo_id": repo_id, "repo_type": repo_type, "folder_path": str(source_dir), } if "revision" in sig.parameters: kwargs["revision"] = revision if "path_in_repo" in sig.parameters: kwargs["path_in_repo"] = "." if "ignore_patterns" in sig.parameters: kwargs["ignore_patterns"] = ignore_patterns return api.upload_large_folder(**kwargs) def main() -> None: args = parse_args() source_dir = Path(args.source_dir).resolve() if not source_dir.exists() or not source_dir.is_dir(): raise FileNotFoundError(f"Source directory not found or not a directory: {source_dir}") api = HfApi() repo_id = resolve_repo_id(api, args) ignore_patterns = build_ignore_patterns(args.extra_ignore) total_size_gb = None if args.method == "auto": total_size = folder_size_bytes(source_dir) total_size_gb = total_size / (1024 ** 3) use_large = total_size_gb >= args.large_threshold_gb else: use_large = args.method == "large" print("Source directory:", source_dir) print("Repo id:", repo_id) print("Repo type:", args.repo_type) print("Private:", args.private) print("Revision:", args.revision) print("Ignore patterns:", ignore_patterns) if total_size_gb is None: print("Folder size scan: skipped (set --method auto to enable size-based selection)") else: print(f"Folder size (excluding .back): {total_size_gb:.2f} GB") print("Upload method:", "large" if use_large else "folder") if args.dry_run: print("Dry run requested. Exiting before create/upload.") return api.create_repo( repo_id=repo_id, repo_type=args.repo_type, private=args.private, exist_ok=True, ) if use_large: try: commit_info = upload_large_compat( api=api, repo_id=repo_id, repo_type=args.repo_type, source_dir=source_dir, revision=args.revision, ignore_patterns=ignore_patterns, ) except Exception as e: print(f"Large upload path failed ({e}). Falling back to upload_folder with multi-commit mode.") commit_info = upload_folder_compat( api=api, repo_id=repo_id, repo_type=args.repo_type, source_dir=source_dir, revision=args.revision, commit_message=args.commit_message, ignore_patterns=ignore_patterns, ) else: commit_info = upload_folder_compat( api=api, repo_id=repo_id, repo_type=args.repo_type, source_dir=source_dir, revision=args.revision, commit_message=args.commit_message, ignore_patterns=ignore_patterns, ) print("Upload completed.") print("Commit:", commit_info) print("Repo URL: https://huggingface.co/" + repo_id) if __name__ == "__main__": main()