Text Generation
Transformers
PyTorch
English
experimental
research
bit-level
transformer
reversible
safety
telemetry
language-modeling
Instructions to use WCNegentropy/BitTransformerLM with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use WCNegentropy/BitTransformerLM with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-generation", model="WCNegentropy/BitTransformerLM")
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("WCNegentropy/BitTransformerLM", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use WCNegentropy/BitTransformerLM with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "WCNegentropy/BitTransformerLM"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "WCNegentropy/BitTransformerLM", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
Use Docker
docker model run hf.co/WCNegentropy/BitTransformerLM
- SGLang
How to use WCNegentropy/BitTransformerLM with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "WCNegentropy/BitTransformerLM" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "WCNegentropy/BitTransformerLM", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "WCNegentropy/BitTransformerLM" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "WCNegentropy/BitTransformerLM", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
- Docker Model Runner
How to use WCNegentropy/BitTransformerLM with Docker Model Runner:
docker model run hf.co/WCNegentropy/BitTransformerLM
File size: 3,117 Bytes
import torch
import torch.nn as nn
from torch.ao.quantization.fake_quantize import FakeQuantize
from torch.ao.quantization.observer import MinMaxObserver
from torch.ao.quantization.qconfig import QConfig
from torch.ao.quantization import convert
from .model import BitTransformerLM
def quantize_dynamic(model: BitTransformerLM, dtype: torch.dtype = torch.qint8) -> BitTransformerLM:
    """Return a dynamically quantized copy of the model for inference.

    Every ``nn.Linear`` submodule is handed to PyTorch's dynamic quantization
    routine. The call relies on that routine's default ``inplace=False``, so
    the ``model`` passed in is not modified.

    Args:
        model: Model whose ``nn.Linear`` layers should be quantized.
        dtype: Target quantized dtype; defaults to ``torch.qint8``.

    Returns:
        The quantized copy returned by
        ``torch.ao.quantization.quantize_dynamic``.
    """
    # Use the ``torch.ao`` entry point, matching the rest of this module's
    # imports; ``torch.quantization`` is only the legacy compatibility alias.
    # The import is local and aliased because this function shares its name
    # with the library routine.
    from torch.ao.quantization import quantize_dynamic as _torch_quantize_dynamic

    return _torch_quantize_dynamic(model, {nn.Linear}, dtype=dtype)
class FourBitObserver(MinMaxObserver):
    """Min-max observer configured for 4-bit quantization.

    The observer presets an unsigned 16-level integer range (``0..15``)
    stored in ``torch.quint8`` with a per-tensor affine scheme.
    """

    def __init__(self, **kwargs):
        """Build the observer.

        Args:
            **kwargs: Extra keyword arguments forwarded to
                ``MinMaxObserver``. Values given for ``quant_min``,
                ``quant_max``, ``dtype`` or ``qscheme`` take precedence over
                the 4-bit presets.
        """
        # Apply the presets as defaults rather than as hard-wired keywords:
        # passing any of these names through **kwargs used to raise
        # "got multiple values for keyword argument".
        kwargs.setdefault("quant_min", 0)
        kwargs.setdefault("quant_max", 15)
        kwargs.setdefault("dtype", torch.quint8)
        kwargs.setdefault("qscheme", torch.per_tensor_affine)
        super().__init__(**kwargs)
# Factory for fake-quantize modules that observe with ``FourBitObserver``;
# calling ``FourBitFakeQuantize()`` builds a fresh module each time.
FourBitFakeQuantize = FakeQuantize.with_args(observer=FourBitObserver)

# QConfig that applies the same 4-bit fake quantization to both activations
# and weights.
four_bit_qconfig = QConfig(
    activation=FourBitFakeQuantize,
    weight=FourBitFakeQuantize,
)
class QATLinear(nn.Linear):
    """Linear layer with fake quantization for QAT.

    Both the incoming activation and the weight pass through a
    ``FourBitFakeQuantize`` module before the usual linear transform. The
    bias is used as-is.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
        """Create the layer and its two fake-quantize submodules.

        Args:
            in_features: Size of each input sample.
            out_features: Size of each output sample.
            bias: Whether the layer has a learnable bias.
        """
        super().__init__(in_features, out_features, bias)
        self.weight_fake_quant = FourBitFakeQuantize()
        # Despite the name, this module is applied to the layer *input*
        # (see ``forward``).
        self.activation_post_process = FourBitFakeQuantize()

    @classmethod
    def from_float(cls, mod: nn.Linear) -> "QATLinear":
        """Build a ``QATLinear`` that reuses the parameters of ``mod``.

        Args:
            mod: Float linear layer to wrap.

        Returns:
            A new ``QATLinear`` whose ``weight`` and ``bias`` are the very
            same parameter objects as those of ``mod``.
        """
        qat = cls(mod.in_features, mod.out_features, mod.bias is not None)
        qat.weight = mod.weight
        qat.bias = mod.bias
        # The fake-quantize submodules were created on the default device.
        # Move their buffers next to the shared parameters so a layer taken
        # from a model that already lives on another device keeps working.
        qat.to(mod.weight.device)
        return qat

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Fake-quantize the input and the weight, then apply the linear map."""
        x = self.activation_post_process(x)
        w = self.weight_fake_quant(self.weight)
        return nn.functional.linear(x, w, self.bias)
def prepare_qat_fx(model: BitTransformerLM) -> BitTransformerLM:
    """Prepare BitTransformerLM for quantization-aware training.

    Walks the module tree and replaces every ``nn.Linear`` child with a
    ``QATLinear`` built from it. The swap is done directly on the modules
    in place; no graph tracing happens in this function.

    Layers that are already ``QATLinear`` are left alone, so calling this
    function again on a prepared model does not throw away the statistics
    held by their fake-quantize submodules.

    Args:
        model: Root module to prepare. It is modified in place; the root
            itself is never replaced.

    Returns:
        The same ``model`` object.
    """
    for name, module in model.named_children():
        if not isinstance(module, nn.Linear):
            # Containers and other layers: look for linear layers inside.
            prepare_qat_fx(module)
        elif not isinstance(module, QATLinear):
            # QATLinear subclasses nn.Linear, hence the explicit exclusion;
            # re-wrapping would reset the observers.
            setattr(model, name, QATLinear.from_float(module))
    return model
def convert_qat_fx(model: BitTransformerLM) -> BitTransformerLM:
    """Convert a QAT-prepared model to a quantized version.

    Every ``QATLinear`` child is replaced by a plain ``nn.Linear`` whose
    weight has been rounded onto a 16-level grid spanning the weight's own
    minimum and maximum, then mapped back to floating point. The bias is
    carried over unchanged. Other children are processed recursively.

    Args:
        model: Root module to convert. It is modified in place; the root
            itself is never replaced.

    Returns:
        The same ``model`` object.
    """
    qmin, qmax = 0, 15
    for name, module in model.named_children():
        if not isinstance(module, QATLinear):
            convert_qat_fx(module)
            continue

        w = module.weight.data
        min_w = w.min()
        max_w = w.max()
        # Keep the scale strictly positive. The earlier guard added an
        # epsilon to (qmax - qmin) instead, so a constant weight tensor
        # produced scale == 0 and the converted weights came out as NaN.
        eps = torch.finfo(w.dtype).eps
        scale = torch.clamp((max_w - min_w) / (qmax - qmin), min=eps)
        zero_point = qmin - torch.round(min_w / scale)
        q_w = torch.clamp(torch.round(w / scale + zero_point), qmin, qmax)

        has_bias = module.bias is not None
        new_mod = nn.Linear(module.in_features, module.out_features, has_bias)
        # Store the de-quantized values so the layer stays a float nn.Linear.
        new_mod.weight = nn.Parameter((q_w - zero_point) * scale)
        if has_bias:
            new_mod.bias = nn.Parameter(module.bias.data)
        setattr(model, name, new_mod)
    return model
|