# MLX Framework Cheatsheet

## Overview

MLX is an array framework for machine learning on Apple silicon, designed by Apple machine learning research. It offers high performance, familiar APIs, and seamless integration with Apple's ecosystem.

## Core Features

- Familiar APIs: a Python API that closely follows NumPy, plus C++ and Swift interfaces
- Composable function transformations: for automatic differentiation, vectorization, and graph optimization
- Lazy computation: arrays are only materialized when needed (see the sketch after this list)
- Dynamic graph construction: changing input shapes does not trigger slow recompilation
- Unified memory model: operations can run across devices without copying data
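
The last two points are easiest to see in code. A minimal sketch of lazy evaluation and per-operation device selection (the `stream=` argument follows MLX's documented device API):

```python
import mlx.core as mx

a = mx.random.normal((1024, 1024))
b = a @ a   # builds the computation graph; nothing runs yet
mx.eval(b)  # materializes b

# Unified memory: the same arrays can be used on CPU and GPU
# without copying, by selecting a device per operation
c = mx.add(a, b, stream=mx.cpu)
d = mx.multiply(a, b, stream=mx.gpu)
mx.eval(c, d)
```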

## Installation

```bash
# Install MLX
pip install mlx

# Install MLX-LM for language models
pip install mlx-lm
```

## MLX Core Components

### Arrays and Basic Operations

```python
import mlx.core as mx

# Create arrays
a = mx.array([1, 2, 3])
b = mx.zeros((3, 3))
c = mx.ones((2, 4))
d = mx.random.normal((2, 2))

# Basic operations
result = a + b
result = mx.matmul(b, b)

# Evaluate lazily computed arrays
mx.eval(result)
```

### Function Transformations

```python
import mlx.core as mx

# Gradient computation
def f(x):
    return mx.sum(x ** 2)

grad_f = mx.grad(f)
x = mx.array([1.0, 2.0, 3.0])
grad_value = grad_f(x)  # [2.0, 4.0, 6.0]

# Vectorization
def scalar_fn(x):
    return x ** 2

vector_fn = mx.vmap(scalar_fn)
vector_fn(mx.array([1.0, 2.0, 3.0]))  # [1.0, 4.0, 9.0]

# Combined transformations: vectorize the gradient of a scalar function
# (note: mx.grad requires a scalar output, so grad goes inside vmap)
grad_vector_fn = mx.vmap(mx.grad(scalar_fn))
```
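
For training you typically want the loss and its gradient together. MLX provides `mx.value_and_grad` for this (the training loop later in this sheet uses the `nn.value_and_grad` counterpart); a minimal sketch:

```python
import mlx.core as mx

def f(x):
    return mx.sum(x ** 2)

# Returns both f(x) and df/dx in a single call
value_and_grad_f = mx.value_and_grad(f)
value, grad = value_and_grad_f(mx.array([1.0, 2.0, 3.0]))
# value = 14.0, grad = [2.0, 4.0, 6.0]
```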

### Compilation

```python
from functools import partial

import mlx.core as mx

@mx.compile
def optimized_fn(x):
    return mx.sum(x ** 2)

# With state tracking: captured arrays must be declared via inputs/outputs
state = [mx.array(1.0)]

@partial(mx.compile, inputs=state, outputs=state)
def stateful_fn(x):
    result = x + state[0]
    state[0] = result
    return result
```

## Neural Networks (mlx.nn)

### Building a Basic Neural Network

```python
import mlx.core as mx
import mlx.nn as nn

class MLP(nn.Module):
    def __init__(self, in_dims, hidden_dims, out_dims):
        super().__init__()
        self.layers = [
            nn.Linear(in_dims, hidden_dims),
            nn.Linear(hidden_dims, out_dims),
        ]

    def __call__(self, x):
        for layer in self.layers[:-1]:
            x = layer(x)
            x = mx.maximum(x, 0)  # ReLU activation
        return self.layers[-1](x)

# Create model
model = MLP(10, 128, 1)

# Initialize parameters
mx.eval(model.parameters())

# Access parameters
params = model.parameters()
```

### Common Layers

```python
import mlx.nn as nn

# Linear layer
linear = nn.Linear(input_dim, output_dim)

# Convolutional layer
conv = nn.Conv2d(in_channels, out_channels, kernel_size=3)

# Layer normalization
norm = nn.LayerNorm(dim)

# Dropout (for training)
dropout = nn.Dropout(p=0.5)

# Multi-head attention
attention = nn.MultiHeadAttention(dim, num_heads)
```
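
For a quick feed-forward stack without subclassing `nn.Module`, mlx.nn also ships a `Sequential` container; a minimal sketch with arbitrary dimensions:

```python
import mlx.core as mx
import mlx.nn as nn

# Compose layers without a custom Module subclass
net = nn.Sequential(
    nn.Linear(10, 64),
    nn.ReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(64, 1),
)

x = mx.random.normal((8, 10))  # batch of 8
y = net(x)                     # shape (8, 1)
mx.eval(y)
```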

### Loss Functions

```python
import mlx.nn.losses as losses

# Common loss functions
mse_loss = losses.mse_loss(predictions, targets)
bce_loss = losses.binary_cross_entropy(predictions, targets)
# cross_entropy expects unnormalized logits and integer class targets
ce_loss = losses.cross_entropy(predictions, targets)
```

## Optimizers (mlx.optimizers)

```python
import mlx.optimizers as optim

# Create optimizer
optimizer = optim.SGD(learning_rate=0.01)
# Or
optimizer = optim.Adam(learning_rate=0.001, betas=(0.9, 0.999))

# Update model with gradients
optimizer.update(model, gradients)

# Evaluate optimizer state and model parameters
mx.eval(optimizer.state, model.parameters())
```

## Training Loop Pattern

```python
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim

# Create model
model = MyModel()
mx.eval(model.parameters())

# Define loss function
def loss_fn(model, x, y):
    y_pred = model(x)
    return nn.losses.mse_loss(y_pred, y)

# Create gradient function and optimizer
loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
optimizer = optim.Adam(learning_rate=0.001)

# Training loop
for epoch in range(num_epochs):
    for x_batch, y_batch in data_loader:
        # Forward and backward pass
        loss, grads = loss_and_grad_fn(model, x_batch, y_batch)

        # Update model parameters
        optimizer.update(model, grads)

        # Evaluate parameters and optimizer state
        mx.eval(model.parameters(), optimizer.state)
```

## MLX-LM Commands

### Model Generation

```bash
# Generate text with a model (output streams to the terminal as it is generated)
mlx_lm.generate --model mistralai/Mistral-7B-Instruct-v0.3 --prompt "hello"

# Set generation parameters
mlx_lm.generate --model <model_name> --prompt "hello" --max-tokens 100 --temp 0.7 --top-p 0.9
```
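
The CLI has a Python counterpart in the mlx_lm package; a minimal sketch using its `load`/`generate` helpers:

```python
from mlx_lm import load, generate

# Downloads the model from Hugging Face on first use, then loads from cache
model, tokenizer = load("mistralai/Mistral-7B-Instruct-v0.3")

response = generate(model, tokenizer, prompt="hello", max_tokens=100)
print(response)
```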

### Model Conversion

```bash
# Convert Hugging Face model to MLX format
mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3

# Convert and quantize to 4-bit
mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3 -q

# Convert, quantize, and upload to Hugging Face
mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3 -q --upload-repo <username>/<repo-name>
```

### Interactive Chat

```bash
# Start interactive chat with a model
mlx_lm.chat --model mistralai/Mistral-7B-Instruct-v0.3

# Use a local model
mlx_lm.chat --model ./path/to/local/model
```

### Fine-tuning with LoRA

```bash
# Basic LoRA fine-tuning
# (--data expects a folder containing train.jsonl and valid.jsonl)
mlx_lm.lora --model mistralai/Mistral-7B-v0.1 --train --data ./my_data_folder

# Set specific parameters
mlx_lm.lora \
    --model mistralai/Mistral-7B-v0.1 \
    --train \
    --data ./my_data_folder \
    --batch-size 1 \
    --num-layers 4 \
    --iters 500

# Use a quantized model (QLoRA)
mlx_lm.lora --model <quantized_model_path> --train --data ./my_data_folder

# Test a fine-tuned model
mlx_lm.lora \
    --model <path_to_model> \
    --adapter-path <path_to_adapters> \
    --data <path_to_data> \
    --test

# Generate with a fine-tuned model
mlx_lm.generate \
    --model <path_to_model> \
    --adapter-path <path_to_adapters> \
    --prompt "<your_prompt>"
```

### Fusing Adapters

```bash
# Fuse LoRA adapters with the original model
mlx_lm.fuse \
    --model <path_to_model> \
    --adapter-path <path_to_adapters> \
    --save-path <output_path>

# Fuse and upload to Hugging Face
mlx_lm.fuse \
    --model <path_to_model> \
    --adapter-path <path_to_adapters> \
    --save-path <output_path> \
    --upload-repo <username>/<repo-name>

# Export to GGUF format
mlx_lm.fuse \
    --model <path_to_model> \
    --adapter-path <path_to_adapters> \
    --export-gguf
```

### Model Management

```bash
# Scan all locally cached models
mlx_lm.manage --scan

# Delete specific models
mlx_lm.manage --delete --pattern <model_name_pattern>
```

### API Server

```bash
# Run an OpenAI-compatible API server (defaults to localhost:8080)
mlx_lm.server

# Interact with the server
curl localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "mlx-community/Llama-3.2-3B-Instruct-4bit",
    "max_completion_tokens": 2000,
    "messages": [{"role": "user", "content": "Hello there"}]
  }'
```
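
Since the server exposes the OpenAI chat-completions protocol, any OpenAI-compatible client works; a minimal sketch with the openai Python package (the api_key is a placeholder, as the local server does not authenticate):

```python
from openai import OpenAI

# Point the client at the local mlx_lm.server instance
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="mlx-community/Llama-3.2-3B-Instruct-4bit",
    messages=[{"role": "user", "content": "Hello there"}],
)
print(response.choices[0].message.content)
```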

## Swift MLX Integration

```swift
// Add dependency in Package.swift
dependencies: [
    .package(url: "https://github.com/ml-explore/mlx-swift", from: "0.10.0")
]

// Import packages
import MLX
import MLXNN
import MLXOptimizers
import MLXRandom
```