Text Generation
Transformers
PyTorch
English
experimental
research
bit-level
transformer
reversible
safety
telemetry
language-modeling
Instructions to use WCNegentropy/BitTransformerLM with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use WCNegentropy/BitTransformerLM with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="WCNegentropy/BitTransformerLM")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("WCNegentropy/BitTransformerLM", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use WCNegentropy/BitTransformerLM with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "WCNegentropy/BitTransformerLM"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WCNegentropy/BitTransformerLM",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
```bash
docker model run hf.co/WCNegentropy/BitTransformerLM
```
- SGLang
How to use WCNegentropy/BitTransformerLM with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "WCNegentropy/BitTransformerLM" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WCNegentropy/BitTransformerLM",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
  --model-path "WCNegentropy/BitTransformerLM" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WCNegentropy/BitTransformerLM",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use WCNegentropy/BitTransformerLM with Docker Model Runner:
docker model run hf.co/WCNegentropy/BitTransformerLM
| #!/usr/bin/env python3 | |
| """ | |
| BitTransformerLM Conversation Test Script | |
| ========================================= | |
| Load the trained breakthrough model and test its conversational capabilities! | |
| """ | |
| import sys | |
| import os | |
| import torch | |
| import torch.nn.functional as F | |
| from pathlib import Path | |
| # Add paths for imports | |
| sys.path.append('/data') | |
| sys.path.append('/data/BitTransformerLM') | |
| from bit_transformer import BitTransformerLM, text_to_bits, bits_to_text | |
def load_breakthrough_model(
    checkpoint_path='/data/BitTransformerLM/checkpoints/checkpoint_latest.pt',
):
    """Load the trained breakthrough BitTransformerLM.

    Builds a ``BitTransformerLM`` with the hyper-parameters below, restores
    its weights from ``checkpoint['model_state_dict']``, switches the model to
    eval mode and prints a short summary of the checkpoint.

    Args:
        checkpoint_path: Location of the checkpoint file. Defaults to the
            path this script has always used, so existing callers that pass
            no argument behave as before.

    Returns:
        The model, in eval mode, with the checkpoint weights loaded.

    Raises:
        KeyError: If the checkpoint has no ``'model_state_dict'`` entry.
        Whatever ``torch.load`` / ``load_state_dict`` raise for a missing
        file or a state dict that does not match this architecture.
    """
    print("π Loading breakthrough BitTransformerLM...")

    # Create model with EXACT same config as training; load_state_dict below
    # requires the parameter names and shapes to match the checkpoint.
    model = BitTransformerLM(
        d_model=512,            # Breakthrough config
        nhead=16,               # 16 attention heads
        num_layers=8,           # 8 layers for ~16M params
        dim_feedforward=1024,   # 2x d_model
        max_seq_len=512,        # Match checkpoint positional encoding
        reversible=True,        # Memory efficiency
        use_checkpoint=True,    # Gradient checkpointing
        use_autocast=True,      # CPU mixed precision
        use_act=True,           # Adaptive Computation Time
        act_threshold=0.9,      # ACT threshold
        lambda_K=0.05,          # Safety telemetry weights
        lambda_C=0.05,
        lambda_S=0.05,
    )

    print(f"Loading checkpoint: {checkpoint_path}")
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only point this at checkpoints you trust; consider
    # weights_only=True once the checkpoint contents are confirmed compatible.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])

    # Set to eval mode with proper inference settings
    model.eval()

    print("β Model loaded successfully!")
    print("π Checkpoint info:")
    # The training metadata is informational only: a checkpoint that lacks it
    # should not abort after the weights were already loaded successfully.
    summary_fields = (
        ("Epoch", "epoch", ""),
        ("Loss", "loss", ".6f"),
        ("Best Loss", "best_loss", ".6f"),
    )
    for label, key, spec in summary_fields:
        value = checkpoint.get(key)
        shown = "n/a" if value is None else format(value, spec)
        print(f" - {label}: {shown}")

    total_params = sum(p.numel() for p in model.parameters())
    print(f" - Parameters: {total_params:,}")
    return model
def _sample_next_bit(next_bit_logits, temperature, top_p):
    """Choose the next bit from the logits of the last sequence position."""
    if temperature <= 0:
        # Dividing by zero gives inf/NaN probabilities and makes
        # torch.multinomial fail; treat a non-positive temperature as a
        # request for deterministic (greedy) decoding instead.
        return int(torch.argmax(next_bit_logits).item())

    # The division creates a new tensor, so the masking below never writes
    # into the model's own logits.
    scaled = next_bit_logits / temperature

    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(scaled, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Drop classes once the cumulative probability exceeds top_p. The mask
        # is shifted right by one so the class that crosses the threshold is
        # kept and the most likely class is never removed.
        remove = cumulative_probs > top_p
        remove[1:] = remove[:-1].clone()
        remove[0] = False
        scaled[sorted_indices[remove]] = float('-inf')

    probs = F.softmax(scaled, dim=-1)
    return torch.multinomial(probs, 1).item()


def generate_text(model, prompt, max_length=100, temperature=0.8, top_p=0.9):
    """Generate text using the breakthrough model.

    The prompt is converted to bits with ``text_to_bits``, the model is asked
    for one additional bit per step, and the new bits are converted back with
    ``bits_to_text``. Progress and results are printed as the function runs.

    Args:
        model: Callable that takes a ``(1, seq_len)`` long tensor of bits and
            returns ``(logits, telemetry)``; ``logits[0, -1, :]`` is used as
            the scores for the next bit.
        prompt: Text to continue.
        max_length: Maximum number of bits to generate (not characters).
        temperature: Softmax temperature. Values ``<= 0`` select the most
            likely bit deterministically.
        top_p: Nucleus-sampling threshold; ``>= 1.0`` disables the filter.

    Returns:
        ``prompt + generated_text`` on success, or ``None`` when the generated
        bits cannot be decoded back to text.
    """
    # One character is decoded from 9 bits (8 data bits + parity), so the
    # prompt budget is a whole number of characters.
    # NOTE(review): assumes text_to_bits emits 9 bits per character - confirm.
    bits_per_char = 9
    max_prompt_bits = 198   # 22 characters; leaves room for generation
    context_window = 256    # the model is fed at most this many trailing bits

    print(f"\nπ€ Generating response to: '{prompt}'")

    # Convert prompt to bits
    input_bits = text_to_bits(prompt)
    print(f"π Input bits: {len(input_bits)} bits")

    # Ensure we don't exceed the prompt budget. Keep the END of the prompt:
    # that is the text the model is supposed to continue (for a conversation
    # history it holds the latest turn). Keeping the head would drop it.
    if len(input_bits) > max_prompt_bits:
        input_bits = input_bits[-max_prompt_bits:]

    prompt_len = len(input_bits)
    generated_bits = list(input_bits)
    # Stays None when no step runs (max_length == 0), so the report below
    # never touches an unbound name.
    telemetry = None

    print("π Generating...")
    with torch.no_grad():
        for i in range(max_length):
            # Prepare current sequence (last `context_window` bits max)
            current_seq = generated_bits[-context_window:]
            current_tensor = torch.tensor(current_seq, dtype=torch.long).unsqueeze(0)

            # Forward pass; keep only the prediction for the next bit
            logits, telemetry = model(current_tensor)
            next_bit = _sample_next_bit(logits[0, -1, :], temperature, top_p)
            generated_bits.append(next_bit)

            # After every complete character, check for a natural stop.
            if i % bits_per_char == bits_per_char - 1:
                try:
                    partial_text = bits_to_text(generated_bits[prompt_len:])
                except Exception:
                    # Keep generating if decode fails. Exception (rather than
                    # a bare except) lets Ctrl-C still interrupt generation.
                    continue
                if partial_text.endswith(('.', '!', '?', '\n')):
                    break

    # Extract generated bits (exclude input)
    generated_only = generated_bits[prompt_len:]

    try:
        generated_text = bits_to_text(generated_only)
    except Exception as e:
        print(f"β Failed to decode generated bits: {e}")
        print(f"Raw bits: {generated_only[:50]}..." if len(generated_only) > 50 else f"Raw bits: {generated_only}")
        return None

    print(f"β¨ Generated text: '{generated_text}'")
    print(f"π Generated {len(generated_only)} bits -> {len(generated_text)} characters")

    if telemetry:
        # Telemetry is diagnostic only: a value that cannot be formatted as a
        # float must not throw away text that decoded successfully.
        try:
            print(f"π Final telemetry: K={telemetry.get('negentropy_logits', 0):.3f}, " +
                  f"C={telemetry.get('lz_complexity_logits', 0):.3f}, " +
                  f"S={telemetry.get('symbiosis_score', 0):.3f}")
        except (TypeError, ValueError, RuntimeError, AttributeError) as e:
            print(f"Telemetry could not be displayed: {e}")

    return prompt + generated_text
def interactive_conversation(model):
    """Interactive conversation loop.

    Reads lines from standard input, sends the accumulated
    ``Human: ... / AI: ...`` transcript to ``generate_text`` and prints the
    model's reply. Recognised commands: ``quit``/``exit``/``q``, ``help`` and
    ``clear``. The loop ends on a quit command, Ctrl-C, or end of input.

    Args:
        model: The model handed through to ``generate_text``.

    Returns:
        None.
    """
    print("\nπ― BREAKTHROUGH BITRANSFORMERLM CONVERSATION TEST")
    print("=" * 60)
    print("Type 'quit' to exit, 'help' for commands")
    print()

    conversation_history = ""

    while True:
        try:
            # Get user input
            user_input = input("You: ").strip()
            command = user_input.lower()

            if command in ('quit', 'exit', 'q'):
                print("π Goodbye!")
                break

            if command == 'help':
                print("Commands:")
                print(" quit/exit/q - Exit conversation")
                print(" help - Show this help")
                print(" clear - Clear conversation history")
                continue

            if command == 'clear':
                conversation_history = ""
                print("π§Ή Conversation history cleared")
                continue

            if not user_input:
                continue

            # Build the prompt for this turn without committing it yet: if
            # generation fails, the history must not keep a dangling
            # "Human: ...\nAI: " that would corrupt every later prompt.
            turn_prompt = conversation_history + f"Human: {user_input}\nAI: "

            # Generate response
            response = generate_text(
                model,
                turn_prompt,
                max_length=150,
                temperature=0.8,
                top_p=0.9,
            )

            if not response:
                print("AI: [Failed to generate response]")
                continue

            # generate_text returns prompt + continuation; strip the prompt
            # to extract just the AI response.
            ai_response = response[len(turn_prompt):]
            print(f"AI: {ai_response}")

            # Update history with the completed exchange
            conversation_history = turn_prompt + ai_response + "\n"

            # Keep history manageable: keep only the last few exchanges
            if len(conversation_history) > 500:
                lines = conversation_history.split('\n')
                conversation_history = '\n'.join(lines[-10:])

        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin was closed (piped input ran out, Ctrl-D).
            # Every later input() would fail the same way, so leave the loop
            # instead of reporting the same error forever.
            print("\nπ Goodbye!")
            break
        except Exception as e:
            # Best-effort REPL: report the problem and keep the session alive.
            print(f"β Error: {e}")
def main():
    """Run the scripted smoke tests, then hand over to the interactive loop."""
    rule = "=" * 60

    print("π BITRANSFORMERLM BREAKTHROUGH CONVERSATION TEST")
    print(rule)

    # Load the trained model once and reuse it for everything below.
    model = load_breakthrough_model()

    print("\nπ§ͺ QUICK TESTS:")

    # (banner, prompt, bit budget) for each canned generation.
    quick_tests = (
        ("\n--- Test 1: Simple Greeting ---", "Hello", 50),
        ("\n--- Test 2: Question ---", "What is", 50),
        ("\n--- Test 3: Conversation ---", "Hi there! How are you?", 80),
    )
    for banner, prompt, bit_budget in quick_tests:
        print(banner)
        generate_text(model, prompt, max_length=bit_budget)

    print("\n" + rule)
    print("Ready for interactive conversation!")

    # Start interactive conversation
    interactive_conversation(model)


if __name__ == "__main__":
    main()