Text Generation
Transformers
PyTorch
English
experimental
research
bit-level
transformer
reversible
safety
telemetry
language-modeling
Instructions to use WCNegentropy/BitTransformerLM with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use WCNegentropy/BitTransformerLM with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="WCNegentropy/BitTransformerLM")
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("WCNegentropy/BitTransformerLM", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use WCNegentropy/BitTransformerLM with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "WCNegentropy/BitTransformerLM"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WCNegentropy/BitTransformerLM",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/WCNegentropy/BitTransformerLM
- SGLang
How to use WCNegentropy/BitTransformerLM with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "WCNegentropy/BitTransformerLM" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WCNegentropy/BitTransformerLM",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "WCNegentropy/BitTransformerLM" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WCNegentropy/BitTransformerLM",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use WCNegentropy/BitTransformerLM with Docker Model Runner:
docker model run hf.co/WCNegentropy/BitTransformerLM
| import torch | |
| import torch.nn as nn | |
| from torch.ao.quantization.fake_quantize import FakeQuantize | |
| from torch.ao.quantization.observer import MinMaxObserver | |
| from torch.ao.quantization.qconfig import QConfig | |
| from torch.ao.quantization import convert | |
| from .model import BitTransformerLM | |
def quantize_dynamic(model: BitTransformerLM, dtype: torch.dtype = torch.qint8) -> BitTransformerLM:
    """Produce a dynamically quantized copy of ``model`` for inference.

    Args:
        model: The model whose ``nn.Linear`` layers should be quantized.
        dtype: Quantized dtype forwarded to
            ``torch.quantization.quantize_dynamic``; ``torch.qint8`` by default.

    Returns:
        Whatever ``torch.quantization.quantize_dynamic`` returns for ``model``.
    """
    # Only linear layers are targeted for dynamic quantization.
    linear_layer_types = {nn.Linear}
    return torch.quantization.quantize_dynamic(model, linear_layer_types, dtype=dtype)
class FourBitObserver(MinMaxObserver):
    """``MinMaxObserver`` preset with a 16-level (4-bit) quantization range.

    The quantized range is fixed to ``[0, 15]`` on a ``torch.quint8`` carrier
    dtype with a per-tensor affine scheme. Any extra keyword arguments are
    forwarded unchanged to ``MinMaxObserver``.
    """

    def __init__(self, **kwargs):
        # Fixed settings that make this observer "four bit"; callers must not
        # pass these keys themselves or the parent call raises ``TypeError``.
        four_bit_settings = {
            "quant_min": 0,
            "quant_max": 15,
            "dtype": torch.quint8,
            "qscheme": torch.per_tensor_affine,
        }
        super().__init__(**four_bit_settings, **kwargs)
# Factory for FakeQuantize modules that use FourBitObserver as their observer.
# Calling ``FourBitFakeQuantize()`` yields a fresh fake-quantize module.
FourBitFakeQuantize = FakeQuantize.with_args(observer=FourBitObserver)

# QConfig that applies the same 4-bit fake quantization to both activations
# and weights.
four_bit_qconfig = QConfig(
    activation=FourBitFakeQuantize,
    weight=FourBitFakeQuantize,
)
class QATLinear(nn.Linear):
    """Linear layer with fake quantization for QAT.

    Both the incoming activations and the layer's weight are passed through
    their own ``FourBitFakeQuantize`` module before the usual linear
    transform is applied.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
        super().__init__(in_features, out_features, bias)
        # Separate fake-quantize modules so weight and activation statistics
        # are tracked independently.
        self.weight_fake_quant = FourBitFakeQuantize()
        self.activation_post_process = FourBitFakeQuantize()

    @classmethod
    def from_float(cls, mod: nn.Linear) -> "QATLinear":
        """Build a ``QATLinear`` that reuses the parameters of ``mod``.

        Args:
            mod: The floating-point linear layer to wrap.

        Returns:
            A new ``QATLinear`` whose ``weight`` and ``bias`` are the very
            same objects as ``mod.weight`` and ``mod.bias`` (shared, not
            copied).
        """
        qat = cls(mod.in_features, mod.out_features, mod.bias is not None)
        # The fake-quantize buffers are created on the default device; move
        # them next to the weights so forward() does not mix devices when the
        # source layer already lives on an accelerator.
        qat.to(mod.weight.device)
        qat.weight = mod.weight
        qat.bias = mod.bias
        return qat

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear transform to fake-quantized input and weight."""
        x = self.activation_post_process(x)
        w = self.weight_fake_quant(self.weight)
        return nn.functional.linear(x, w, self.bias)
def prepare_qat_fx(model: BitTransformerLM) -> BitTransformerLM:
    """Prepare BitTransformerLM for quantization-aware training.

    Walks the module tree and replaces every ``nn.Linear`` child, in place,
    with a ``QATLinear`` built by ``QATLinear.from_float``. Non-linear
    children are descended into recursively.

    Args:
        model: The module to prepare; it is modified in place.

    Returns:
        The same ``model`` object, for call chaining.
    """
    for name, module in model.named_children():
        if isinstance(module, QATLinear):
            # QATLinear is itself an nn.Linear; wrapping it again would
            # replace its fake-quantize modules with fresh ones and lose the
            # statistics gathered so far, so leave prepared layers untouched.
            continue
        if isinstance(module, nn.Linear):
            setattr(model, name, QATLinear.from_float(module))
        else:
            prepare_qat_fx(module)
    return model
def convert_qat_fx(model: BitTransformerLM) -> BitTransformerLM:
    """Convert a QAT-prepared model to a quantized version.

    Every ``QATLinear`` child is replaced, in place, by a plain ``nn.Linear``
    whose weight has been rounded onto a 16-level (``0..15``) affine grid
    derived from the weight's own min/max and then mapped back to floating
    point. The bias is carried over unchanged. Other children are descended
    into recursively.

    Args:
        model: The module to convert; it is modified in place.

    Returns:
        The same ``model`` object, for call chaining.
    """
    qmin, qmax = 0, 15
    for name, module in model.named_children():
        if not isinstance(module, QATLinear):
            convert_qat_fx(module)
            continue

        w = module.weight.data
        min_w = w.min()
        max_w = w.max()
        # Guard the scale itself: a constant-valued weight gives
        # max_w == min_w, and a zero scale would turn the divisions below
        # into inf/NaN and poison the converted weights.
        eps = torch.finfo(w.dtype).eps
        scale = torch.clamp((max_w - min_w) / (qmax - qmin), min=eps)
        zero_point = qmin - torch.round(min_w / scale)
        # Quantize onto the integer grid, then immediately dequantize so the
        # replacement layer stays a regular floating-point nn.Linear.
        q_w = torch.clamp(torch.round(w / scale + zero_point), qmin, qmax)

        new_mod = nn.Linear(module.in_features, module.out_features, module.bias is not None)
        new_mod.weight = nn.Parameter((q_w - zero_point) * scale)
        if module.bias is not None:
            new_mod.bias = nn.Parameter(module.bias.data)
        setattr(model, name, new_mod)
    return model