gary.info

here be dragons

mlx-lm cheatsheet

mlx-cheatsheet.md

MLX Framework Cheatsheet

Overview

MLX is an array framework for machine learning on Apple silicon, designed by Apple machine learning research. It offers high performance, familiar APIs, and seamless integration with Apple's ecosystem.

Core Features

  • Familiar APIs: A Python API that closely follows NumPy, plus C++, C, and Swift APIs
  • Composable function transformations: Automatic differentiation, automatic vectorization, and computation graph optimization
  • Lazy computation: Arrays are only materialized when needed
  • Dynamic graph construction: Changing the shapes of function arguments does not trigger slow recompilations
  • Unified memory model: Arrays live in shared memory, so operations can run on any supported device without copying data
Installation

    # Install MLX
    pip install mlx
    
    # Install MLX-LM for language models
    pip install mlx-lm

    MLX Core Components

    Arrays and Basic Operations

    import mlx.core as mx
    
    # Create arrays
    a = mx.array([1, 2, 3])
    b = mx.zeros((3, 3))
    c = mx.ones((2, 4))
    d = mx.random.normal((2, 2))
    
    # Basic operations
    result = a + b
    result = mx.matmul(b, b)
    
    # Evaluate lazily computed arrays
    mx.eval(result)

    Function Transformations

    import mlx.core as mx
    
    # Gradient computation
    def f(x):
        return mx.sum(x ** 2)
    
    grad_f = mx.grad(f)
    x = mx.array([1.0, 2.0, 3.0])
    grad_value = grad_f(x)  # [2.0, 4.0, 6.0]
    
    # Vectorization
    def scalar_fn(x):
        return x ** 2
    
    vector_fn = mx.vmap(scalar_fn)
    vector_fn(mx.array([1.0, 2.0, 3.0]))  # [1.0, 4.0, 9.0]
    
    # Combined transformations: per-element gradient.
    # mx.grad needs a scalar-valued function, so it goes inside mx.vmap.
    grad_vector_fn = mx.vmap(mx.grad(scalar_fn))

    Compilation

    import mlx.core as mx
    
    @mx.compile
    def optimized_fn(x):
        return mx.sum(x ** 2)
    
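    # With state tracking (mx.compile takes the function as its first
    # argument, so keyword options are bound with functools.partial)
    from functools import partial
    
    state = [mx.array(1.0)]
    
    @partial(mx.compile, inputs=state, outputs=state)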
    def stateful_fn(x):
        result = x + state[0]
        state[0] = result
        return result

    Neural Networks (mlx.nn)

    Building a Basic Neural Network

    import mlx.core as mx
    import mlx.nn as nn
    
    class MLP(nn.Module):
        def __init__(self, in_dims, hidden_dims, out_dims):
            super().__init__()
            self.layers = [
                nn.Linear(in_dims, hidden_dims),
                nn.Linear(hidden_dims, out_dims)
            ]
        
        def __call__(self, x):
            for i, layer in enumerate(self.layers[:-1]):
                x = layer(x)
                x = mx.maximum(x, 0)  # ReLU activation
            return self.layers[-1](x)
    
    # Create model
    model = MLP(10, 128, 1)
    
    # Initialize parameters
    mx.eval(model.parameters())
    
    # Access parameters
    params = model.parameters()

    Common Layers

    # Linear layer
    linear = nn.Linear(input_dim, output_dim)
    
    # Convolutional layer
    conv = nn.Conv2d(in_channels, out_channels, kernel_size=3)
    
    # Layer normalization
    norm = nn.LayerNorm(dim)
    
    # Dropout (for training)
    dropout = nn.Dropout(p=0.5)
    
    # Multi-head attention
    attention = nn.MultiHeadAttention(dim, num_heads)

    Loss Functions

    import mlx.nn.losses as losses
    
    # Common loss functions
    mse_loss = losses.mse_loss(predictions, targets)
    bce_loss = losses.binary_cross_entropy(predictions, targets)
    ce_loss = losses.cross_entropy(predictions, targets)

    Optimizers (mlx.optimizers)

    import mlx.optimizers as optim
    
    # Create optimizer
    optimizer = optim.SGD(learning_rate=0.01)
    # Or
    optimizer = optim.Adam(learning_rate=0.001, betas=(0.9, 0.999))
    
    # Update model with gradients
    optimizer.update(model, gradients)
    
    # Evaluate optimizer state and model parameters
    mx.eval(optimizer.state, model.parameters())

    Training Loop Pattern

    import mlx.core as mx
    import mlx.nn as nn
    import mlx.optimizers as optim
    
    # Create model
    model = MyModel()
    mx.eval(model.parameters())
    
    # Define loss function
    def loss_fn(model, x, y):
        y_pred = model(x)
        return nn.losses.mse_loss(y_pred, y)
    
    # Create gradient function and optimizer
    loss_and_grad_fn = nn.value_and_grad(model, loss_fn)
    optimizer = optim.Adam(learning_rate=0.001)
    
    # Training loop
    for epoch in range(num_epochs):
        for x_batch, y_batch in data_loader:
            # Forward and backward pass
            loss, grads = loss_and_grad_fn(model, x_batch, y_batch)
            
            # Update model parameters
            optimizer.update(model, grads)
            
            # Evaluate parameters and optimizer state
            mx.eval(model.parameters(), optimizer.state)

    MLX-LM Commands

    Model Generation

    # Generate text with a model
    mlx_lm.generate --model mistralai/Mistral-7B-Instruct-v0.3 --prompt "hello"
    
    # Stream text generation
    mlx_lm.generate --model mistralai/Mistral-7B-Instruct-v0.3 --prompt "hello" --stream
    
    # Set generation parameters
    mlx_lm.generate --model <model_name> --prompt "hello" --max-tokens 100 --temp 0.7 --top-p 0.9

    Model Conversion

    # Convert Hugging Face model to MLX format
    mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3
    
    # Convert and quantize to 4-bit
    mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3 -q
    
    # Convert, quantize, and upload to Hugging Face
    mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3 -q --upload-repo <username>/<repo-name>

    Interactive Chat

    # Start interactive chat with a model
    mlx_lm.chat --model mistralai/Mistral-7B-Instruct-v0.3
    
    # Use a local model
    mlx_lm.chat --model ./path/to/local/model

    Fine-tuning with LoRA

    # Basic LoRA fine-tuning
    mlx_lm.lora --model mistralai/Mistral-7B-v0.1 --train --data ./my_data_folder
    
    # Set specific parameters
    mlx_lm.lora \
      --model mistralai/Mistral-7B-v0.1 \
      --train \
      --data ./my_data_folder \
      --batch-size 1 \
      --num-layers 4 \
      --iters 500
    
    # Use quantized model (QLoRA)
    mlx_lm.lora --model <quantized_model_path> --train --data ./my_data_folder
    
    # Test a fine-tuned model
    mlx_lm.lora \
      --model <path_to_model> \
      --adapter-path <path_to_adapters> \
      --data <path_to_data> \
      --test
    
    # Generate with a fine-tuned model
    mlx_lm.generate \
      --model <path_to_model> \
      --adapter-path <path_to_adapters> \
      --prompt "<your_prompt>"

    Fusing Adapters

    # Fuse LoRA adapters with the original model
    mlx_lm.fuse \
      --model <path_to_model> \
      --adapter-path <path_to_adapters> \
      --save-path <output_path>
    
    # Fuse and upload to Hugging Face
    mlx_lm.fuse \
      --model <path_to_model> \
      --adapter-path <path_to_adapters> \
      --save-path <output_path> \
      --upload-repo <username>/<repo-name>
    
    # Export to GGUF format
    mlx_lm.fuse \
      --model <path_to_model> \
      --adapter-path <path_to_adapters> \
      --export-gguf

    Model Management

    # Scan all locally cached models
    mlx_lm.manage --scan
    
    # Delete specific models
    mlx_lm.manage --delete --pattern <model_name_pattern>

    API Server

    # Run OpenAI-compatible API server
    mlx_lm.server
    
    # Interact with the server
    curl localhost:8080/v1/chat/completions -d '{
      "model": "mlx-community/Llama-3.2-3B-Instruct-4bit", 
      "max_completion_tokens": 2000, 
      "messages": [{"role": "user", "content": "Hello there"}]
    }'

    Swift MLX Integration

    // Add dependency in Package.swift
    dependencies: [
        .package(url: "https://github.com/ml-explore/mlx-swift", from: "0.10.0")
    ]
    
    // Import packages
    import MLX
    import MLXNN
    import MLXOptimizers
    import MLXRandom

    Resource Links

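  • MLX Documentation: https://ml-explore.github.io/mlx/
  • MLX GitHub Repository: https://github.com/ml-explore/mlx
  • MLX Examples Repository: https://github.com/ml-explore/mlx-examples
  • MLX-LM Repository: https://github.com/ml-explore/mlx-lm
  • MLX Community Models: https://huggingface.co/mlx-community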