Text Generation
Transformers
Safetensors
English
asterisk
reasoning
implicit-reasoning
chain-of-thought
llama
aspp
pi-flow
deep-reasoning
conversational
custom_code
Instructions to use NoesisLab/Geilim-1B-Instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use NoesisLab/Geilim-1B-Instruct with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="NoesisLab/Geilim-1B-Instruct", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("NoesisLab/Geilim-1B-Instruct", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use NoesisLab/Geilim-1B-Instruct with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "NoesisLab/Geilim-1B-Instruct"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "NoesisLab/Geilim-1B-Instruct",
        "messages": [
            { "role": "user", "content": "What is the capital of France?" }
        ]
    }'
Use Docker
docker model run hf.co/NoesisLab/Geilim-1B-Instruct
- SGLang
How to use NoesisLab/Geilim-1B-Instruct with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "NoesisLab/Geilim-1B-Instruct" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "NoesisLab/Geilim-1B-Instruct",
        "messages": [
            { "role": "user", "content": "What is the capital of France?" }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "NoesisLab/Geilim-1B-Instruct" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "NoesisLab/Geilim-1B-Instruct",
        "messages": [
            { "role": "user", "content": "What is the capital of France?" }
        ]
    }'
- Docker Model Runner
How to use NoesisLab/Geilim-1B-Instruct with Docker Model Runner:
docker model run hf.co/NoesisLab/Geilim-1B-Instruct
| # handler.py | |
| from __future__ import annotations | |
| from typing import Any, Dict, List, Union | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| # Loosely-typed JSON object; used below to annotate request and response payloads. | |
| Json = Dict[str, Any] | |
| # Chat messages: a list of role/content dicts (shape shown in the trailing comment). | |
| Messages = List[Dict[str, str]] # [{"role":"user|assistant|system", "content":"..."}] | |
| def _is_messages(x: Any) -> bool: | |
| return ( | |
| isinstance(x, list) | |
| and len(x) > 0 | |
| and all(isinstance(m, dict) and "role" in m and "content" in m for m in x) | |
| ) | |
class EndpointHandler:
    """
    Hugging Face Inference Endpoints custom handler.

    Supports both text and chat formats:

    Text format:
        {"inputs": "Hello, how are you?"}

    Chat format (recommended):
        {"inputs": [{"role": "user", "content": "Hello!"}]}
        or
        {"inputs": {"messages": [{"role": "user", "content": "Hello!"}]}}

    Batch format:
        A list under "inputs" that is not itself a message list is treated as
        a batch; each element is processed independently and a list of
        results is returned.

    Parameters (passed under the "parameters" key):
    - max_new_tokens (default: 256): Max tokens to generate
    - temperature (default: 0.7): Sampling temperature
    - top_p (default: 0.95): Nucleus sampling
    - top_k (default: 0): Top-k sampling; 0 or less leaves it unset
    - repetition_penalty (default: 1.0): Penalize repetitions
    - do_sample (default: temperature > 0): Sample instead of greedy/beam decoding;
      forced off when temperature is not strictly positive
    - num_beams (default: 1): Number of beams
    - return_full_text (default: False): If True, return full conversation; if False, only new tokens
    """

    def __init__(self, model_dir: str):
        """Load the tokenizer and model found in *model_dir*."""
        self.model_dir = model_dir

        # Pick dtype/device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if self.device == "cuda":
            # Prefer bfloat16; fall back to float16 on GPUs that do not support it.
            self.dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        else:
            self.dtype = torch.float32

        # IMPORTANT: trust_remote_code=True because repo contains AsteriskForCausalLM.py + auto_map
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir,
            trust_remote_code=True,
            use_fast=True,
        )

        # Make sure a pad token exists; reuse EOS when the tokenizer defines none.
        if self.tokenizer.pad_token_id is None and self.tokenizer.eos_token_id is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            trust_remote_code=True,
            torch_dtype=self.dtype,
            device_map="auto" if self.device == "cuda" else None,
        )
        if self.device != "cuda":
            self.model.to(self.device)
        self.model.eval()

    def _encode(self, item: Any) -> Any:
        """Tokenize one request item and return the tokenizer's encoding.

        Accepts:
          1) a string prompt
          2) a messages list: [{"role": "user", "content": "..."}]
          3) a dict {"messages": [...]} (common chat style)
        Any other value is converted with ``str()`` and treated as a prompt.
        """
        if isinstance(item, dict) and "messages" in item:
            item = item["messages"]

        if _is_messages(item):
            try:
                # Render the chat template to text first, then tokenize separately.
                prompt = self.tokenizer.apply_chat_template(
                    item,
                    tokenize=False,
                    add_generation_prompt=True,
                )
                # The rendered template is expected to already contain its special
                # tokens (e.g. BOS); do not let the tokenizer add them a second time.
                return self.tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
            except Exception:
                # Best-effort fallback: if the chat template fails, use the last
                # user message as a plain prompt -- but leave a trace in the logs.
                import logging

                logging.getLogger(__name__).warning(
                    "apply_chat_template failed; falling back to the last user message",
                    exc_info=True,
                )
                last_user_msg = next(
                    (m["content"] for m in reversed(item) if m.get("role") == "user"),
                    "",
                )
                return self.tokenizer(last_user_msg, return_tensors="pt")

        if not isinstance(item, str):
            item = str(item)
        return self.tokenizer(item, return_tensors="pt")

    def _generate_one(self, item: Any, gen_kwargs: Dict[str, Any], return_full_text: bool) -> Json:
        """Run generation for a single item and return {"generated_text": ...}."""
        enc = self._encode(item)
        input_ids = enc["input_ids"].to(self.model.device)
        input_len = input_ids.shape[-1]

        # Forward the attention mask when the tokenizer produced one, so generate()
        # does not have to guess it (pad may be the same token as EOS).
        attention_mask = enc.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.model.device)

        gen_ids = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **gen_kwargs,
        )

        # Return newly generated tokens by default, or full text if requested
        if return_full_text:
            text = self.tokenizer.decode(gen_ids[0], skip_special_tokens=True)
        else:
            text = self.tokenizer.decode(gen_ids[0, input_len:], skip_special_tokens=True)
        return {"generated_text": text}

    def __call__(self, data: Json) -> Union[Json, List[Json]]:
        """Handle one request.

        Reads ``data["inputs"]`` and the optional ``data["parameters"]`` and
        returns ``{"generated_text": str}`` for a single input, or a list of
        such dicts when ``inputs`` is a batch (a list that is not a message list).
        """
        inputs = data.get("inputs", "")
        params = data.get("parameters", {}) or {}

        # Generation defaults (can be overridden via `parameters`)
        max_new_tokens = int(params.get("max_new_tokens", 256))
        temperature = float(params.get("temperature", 0.7))
        top_p = float(params.get("top_p", 0.95))
        top_k = int(params.get("top_k", 0))
        repetition_penalty = float(params.get("repetition_penalty", 1.0))
        return_full_text = bool(params.get("return_full_text", False))
        num_beams = int(params.get("num_beams", 1))
        # Sampling requires a strictly positive temperature; otherwise decode
        # without sampling instead of letting generate() reject the request.
        do_sample = bool(params.get("do_sample", temperature > 0)) and temperature > 0

        # Sampling-only knobs are passed as None when sampling is off.
        gen_kwargs: Dict[str, Any] = {
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            "temperature": temperature if do_sample else None,
            "top_p": top_p if do_sample else None,
            "top_k": top_k if do_sample and top_k > 0 else None,
            "num_beams": num_beams,
            "repetition_penalty": repetition_penalty,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
        }

        # Batch support: a list that is not a chat transcript is a list of items.
        if isinstance(inputs, list) and not _is_messages(inputs):
            return [self._generate_one(x, gen_kwargs, return_full_text) for x in inputs]
        return self._generate_one(inputs, gen_kwargs, return_full_text)