Instructions to use boom-project/tiny-llama-2-debug with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use boom-project/tiny-llama-2-debug with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="boom-project/tiny-llama-2-debug")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("boom-project/tiny-llama-2-debug")
model = AutoModelForCausalLM.from_pretrained("boom-project/tiny-llama-2-debug")
```

- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use boom-project/tiny-llama-2-debug with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "boom-project/tiny-llama-2-debug"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "boom-project/tiny-llama-2-debug",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```

Use Docker
docker model run hf.co/boom-project/tiny-llama-2-debug
- SGLang
How to use boom-project/tiny-llama-2-debug with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "boom-project/tiny-llama-2-debug" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "boom-project/tiny-llama-2-debug",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```

Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "boom-project/tiny-llama-2-debug" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "boom-project/tiny-llama-2-debug",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5
    }'
```

- Docker Model Runner
How to use boom-project/tiny-llama-2-debug with Docker Model Runner:
docker model run hf.co/boom-project/tiny-llama-2-debug
import json
import shlex
import subprocess
import tempfile
from pathlib import Path

import torch
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizerFast
# Build a tiny, randomly initialised Llama-2 model plus a truncated tokenizer
# for debugging, save both locally, smoke-test generation, and push to the Hub.

# Checkpoint whose config and tokenizer are used as the template.
mname_from = "meta-llama/Llama-2-7b-hf"
# Local output directory; also the repository name under the Hub organisation.
mname_tiny = "tiny-llama-2-debug"
hub_repo_id = f"boom-project/{mname_tiny}"
# Number of vocabulary entries kept (token ids 0 .. vocab_keep_items - 1).
vocab_keep_items = 3000


def _truncate_tokenizer_vocab(tokenizer_json, keep):
    """Keep only token ids below *keep* in a ``tokenizer.json`` file, in place.

    The ``model.vocab`` mapping is filtered by token id and ``model.merges``
    is reset to an empty list, so no merge rule can refer to a token that was
    removed. Every other section of the file is written back unchanged.

    Args:
        tokenizer_json: Path of the ``tokenizer.json`` file to rewrite.
        keep: Number of vocabulary entries to keep; must be positive.

    Raises:
        ValueError: If *keep* is not positive.
        OSError: If the file cannot be read or written.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the file has no ``model.vocab`` section.
    """
    if keep <= 0:
        raise ValueError(f"keep must be positive, got {keep}")

    with open(tokenizer_json, encoding="utf-8") as f:
        data = json.load(f)

    model_section = data["model"]
    # Filter on the id rather than on position or text, so the result does not
    # depend on how the file happens to be ordered or formatted.
    model_section["vocab"] = {
        token: token_id
        for token, token_id in model_section["vocab"].items()
        if token_id < keep
    }
    model_section["merges"] = []

    with open(tokenizer_json, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII token strings readable in the file.
        json.dump(data, f, ensure_ascii=False)


config = LlamaConfig.from_pretrained(mname_from)
config.update(
    dict(
        hidden_size=16,
        intermediate_size=64,
        num_attention_heads=4,
        num_hidden_layers=2,
        max_position_embeddings=256,
        num_key_value_heads=4,
        # Must match the truncated tokenizer so every token id has an embedding.
        vocab_size=vocab_keep_items,
    )
)
print("new config", config)

# create a tiny random model
tiny_model = LlamaForCausalLM(config)
print(f"num of params {tiny_model.num_parameters()}")

# save the randomly initialised model
tiny_model.save_pretrained(mname_tiny)

# shrink the tokenizer from 32k to 3k vocab
tokenizer_fast = LlamaTokenizerFast.from_pretrained(mname_from)
# The scratch directory is deliberately left in place: the reloaded tokenizer
# is created from these files and is saved again further down.
tmp_dir = str(Path(tempfile.gettempdir()) / mname_from)
tokenizer_fast.save_pretrained(tmp_dir)

# Edit tokenizer.json directly instead of shelling out to perl: no external
# tool is required and any failure raises instead of passing unnoticed.
_truncate_tokenizer_vocab(Path(tmp_dir) / "tokenizer.json", vocab_keep_items)

# reload with modified tokenizer
tiny_tokenizer = LlamaTokenizerFast.from_pretrained(tmp_dir)
tiny_tokenizer.save_pretrained(mname_tiny)

# test the new model and tokenizer function
model_inputs = tiny_tokenizer("Making tiny model", return_tensors="pt")
gen_tokens = tiny_model.generate(**model_inputs, max_new_tokens=100)
print(tiny_tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
print("Random output should be expected, but no crashing")
print(f"Model+Tokenizer saved in {mname_tiny}")

# Push to Hugging Face Hub
tiny_model.push_to_hub(hub_repo_id)
tiny_tokenizer.push_to_hub(hub_repo_id)
print(f"Model and tokenizer pushed to {hub_repo_id}")