For updates follow @adithya_s_k on Twitter
logo
AI Engineering Academy
TrainNanoGPTModal
Initializing search
    adithya-s-k/AI-Engineering.academy
    • Home
    • Prompt Engineering
    • RAG
    • LLM
    • Deployment
    • Agents
    • Projects
    • Blog
    adithya-s-k/AI-Engineering.academy
    • Home
    • Prompt Engineering
    • RAG
    • LLM
    • Deployment
    • Agents
    • Projects
    • Blog
    In [ ]:
    Copied!
    """
    Simple Modal script to run nanoGPT training on serverless GPUs.
    
    This demonstrates how you can take a local repository (nanoGPT) and run it
    on Modal with minimal changes - just copy the code into the image and run!
    
    Usage:
        # Prepare Shakespeare dataset and train a small GPT
        modal run FinetuneNanoGPT.py
    
        # Or run individual steps:
        modal run FinetuneNanoGPT.py::prepare_data
        modal run FinetuneNanoGPT.py::train
        modal run FinetuneNanoGPT.py::sample
    """
    
    """ Simple Modal script to run nanoGPT training on serverless GPUs. This demonstrates how you can take a local repository (nanoGPT) and run it on Modal with minimal changes - just copy the code into the image and run! Usage: # Prepare Shakespeare dataset and train a small GPT modal run FinetuneNanoGPT.py # Or run individual steps: modal run FinetuneNanoGPT.py::prepare_data modal run FinetuneNanoGPT.py::train modal run FinetuneNanoGPT.py::sample """
    In [ ]:
    Copied!
    from modal import App, Image as ModalImage, Volume
    
    from modal import App, Image as ModalImage, Volume

    ============================================================================= CONFIGURATION¶

    In [ ]:
    Copied!
    # Number of seconds in one hour; used below to express Modal timeouts.
    HOURS = 60 * 60
    # GPU type requested for the training and sampling containers.
    GPU_TYPE = "a100-40gb"  # Can be: a100-40gb, a100-80gb, l40s, t4, etc.
    
    HOURS = 60 * 60 GPU_TYPE = "a100-40gb" # Can be: a100-40gb, a100-80gb, l40s, t4, etc.

    ============================================================================= MODAL APP AND VOLUME SETUP¶

    In [ ]:
    Copied!
    # Modal app grouping the functions in this script, plus a named persistent
    # volume that keeps checkpoints/samples across runs (created on first use).
    app = App("nanogpt-training")
    volume = Volume.from_name("nanogpt-outputs", create_if_missing=True)
    
    app = App("nanogpt-training") volume = Volume.from_name("nanogpt-outputs", create_if_missing=True)
    In [ ]:
    Copied!
    # Mount the persistent volume at /data inside every container that passes
    # this mapping via the `volumes=` argument of @app.function.
    VOLUME_CONFIG = {
        "/data": volume,
    }
    
    VOLUME_CONFIG = { "/data": volume, }

    ============================================================================= IMAGE SETUP - Copy local nanoGPT repo into the image¶

    In [ ]:
    Copied!
    # Simple approach: copy the entire nanoGPT directory into the image
    # Base: slim Debian with Python 3.11, plus the dependencies nanoGPT's
    # train.py / sample.py / data prep scripts import at runtime.
    NANOGPT_IMAGE = (
        ModalImage.debian_slim(python_version="3.11")
        .pip_install(
            "torch",
            "numpy",
            "transformers",
            "datasets",
            "tiktoken",
            "tqdm",
        )
        # Copy the nanoGPT directory from local filesystem into the image
        # copy=True because we have .workdir() after this
        .add_local_dir(local_path="nanoGPT", remote_path="/root/nanoGPT", copy=True)
        # All functions below run with the repo root as their working directory,
        # so relative paths like "train.py" and "data/shakespeare_char" resolve.
        .workdir("/root/nanoGPT")
    )
    
    # Simple approach: copy the entire nanoGPT directory into the image NANOGPT_IMAGE = ( ModalImage.debian_slim(python_version="3.11") .pip_install( "torch", "numpy", "transformers", "datasets", "tiktoken", "tqdm", ) # Copy the nanoGPT directory from local filesystem into the image # copy=True because we have .workdir() after this .add_local_dir(local_path="nanoGPT", remote_path="/root/nanoGPT", copy=True) .workdir("/root/nanoGPT") )

    ============================================================================= DATA PREPARATION FUNCTION¶

    In [ ]:
    Copied!
    @app.function(
        image=NANOGPT_IMAGE,
        timeout=10 * 60,  # data prep is quick; 10 minutes is plenty
    )
    def prepare_data():
        """
        Prepare the Shakespeare dataset for character-level training.
        This downloads the data and creates train.bin and val.bin files.
        """
        import subprocess

        banner = "=" * 80
        print(banner)
        print("PREPARING SHAKESPEARE DATASET")
        print(banner)

        # Delegate to nanoGPT's own prepare script and surface its output.
        proc = subprocess.run(
            ["python", "data/shakespeare_char/prepare.py"],
            capture_output=True,
            text=True,
        )

        print(proc.stdout)
        if proc.stderr:
            print("STDERR:", proc.stderr)

        if proc.returncode != 0:
            raise RuntimeError(f"Data preparation failed with code {proc.returncode}")

        print("✓ Data preparation completed!")
        return {"status": "completed", "dataset": "shakespeare_char"}
    
    @app.function( image=NANOGPT_IMAGE, timeout=10 * 60, # 10 minutes ) def prepare_data(): """ Prepare the Shakespeare dataset for character-level training. This downloads the data and creates train.bin and val.bin files. """ import subprocess print("=" * 80) print("PREPARING SHAKESPEARE DATASET") print("=" * 80) # Run the prepare script result = subprocess.run( ["python", "data/shakespeare_char/prepare.py"], capture_output=True, text=True ) print(result.stdout) if result.stderr: print("STDERR:", result.stderr) if result.returncode != 0: raise RuntimeError(f"Data preparation failed with code {result.returncode}") print("✓ Data preparation completed!") return {"status": "completed", "dataset": "shakespeare_char"}

    ============================================================================= TRAINING FUNCTION¶

    In [ ]:
    Copied!
    @app.function(
        image=NANOGPT_IMAGE,
        gpu=GPU_TYPE,
        volumes=VOLUME_CONFIG,
        timeout=2 * HOURS,
    )
    def train(
        max_iters: int = 1000,
        eval_interval: int = 500,
        batch_size: int = 64,
        block_size: int = 256,
        n_layer: int = 6,
        n_head: int = 6,
        n_embd: int = 384,
        learning_rate: float = 1e-3,
    ):
        """
        Train a character-level GPT on Shakespeare data.

        This runs the nanoGPT training script with customizable hyperparameters.
        The trained model checkpoint will be saved to the Modal volume.

        Args:
            max_iters: Number of training iterations
            eval_interval: How often to evaluate
            batch_size: Batch size for training
            block_size: Context length
            n_layer: Number of transformer layers
            n_head: Number of attention heads
            n_embd: Embedding dimension
            learning_rate: Learning rate

        Returns:
            A summary dict with the run status, iteration count, and output dir.

        Raises:
            RuntimeError: If the training subprocess exits with a non-zero code.
        """
        import os
        import shutil
        import subprocess
        import sys

        print("=" * 80)
        print("TRAINING NANOGPT ON SHAKESPEARE")
        print("=" * 80)
        print(f"Max iterations: {max_iters}")
        print(f"Batch size: {batch_size}")
        print(f"Block size: {block_size}")
        print(f"Layers: {n_layer}, Heads: {n_head}, Embedding: {n_embd}")
        print("=" * 80)

        # Make sure data is prepared (run prepare_data in-process, not remotely)
        if not os.path.exists("data/shakespeare_char/train.bin"):
            print("Data not found, preparing it first...")
            prepare_data.local()

        # Build training command with arguments.
        # sys.executable re-invokes the exact interpreter running this function,
        # which is more robust than relying on "python" being on PATH.
        cmd = [
            sys.executable,
            "train.py",
            "config/train_shakespeare_char.py",
            f"--max_iters={max_iters}",
            f"--eval_interval={eval_interval}",
            f"--batch_size={batch_size}",
            f"--block_size={block_size}",
            f"--n_layer={n_layer}",
            f"--n_head={n_head}",
            f"--n_embd={n_embd}",
            f"--learning_rate={learning_rate}",
            "--out_dir=/data/out",  # Save outputs to volume
            "--dataset=shakespeare_char",  # Important: tells sample.py where to find meta.pkl
            "--compile=False",  # Disable compilation for faster startup
        ]

        print(f"Running: {' '.join(cmd)}")
        print()

        # Run training; stream output straight to the container logs
        result = subprocess.run(cmd, capture_output=False, text=True)

        if result.returncode != 0:
            raise RuntimeError(f"Training failed with code {result.returncode}")

        # Copy meta.pkl to output directory for sampling. train.py normally
        # creates /data/out, but guard against it being absent so the copy
        # cannot fail with a missing-directory error.
        os.makedirs("/data/out", exist_ok=True)
        meta_src = "data/shakespeare_char/meta.pkl"
        meta_dst = "/data/out/meta.pkl"
        if os.path.exists(meta_src):
            shutil.copy(meta_src, meta_dst)
            print(f"✓ Copied {meta_src} to {meta_dst}")

        # Commit the volume to save the checkpoint
        volume.commit()

        print("\n" + "=" * 80)
        print("✓ Training completed! Model saved to /data/out")
        print("=" * 80)

        return {
            "status": "completed",
            "max_iters": max_iters,
            "output_dir": "/data/out",
        }
    
    @app.function( image=NANOGPT_IMAGE, gpu=GPU_TYPE, volumes=VOLUME_CONFIG, timeout=2 * HOURS, ) def train( max_iters: int = 1000, eval_interval: int = 500, batch_size: int = 64, block_size: int = 256, n_layer: int = 6, n_head: int = 6, n_embd: int = 384, learning_rate: float = 1e-3, ): """ Train a character-level GPT on Shakespeare data. This runs the nanoGPT training script with customizable hyperparameters. The trained model checkpoint will be saved to the Modal volume. Args: max_iters: Number of training iterations eval_interval: How often to evaluate batch_size: Batch size for training block_size: Context length n_layer: Number of transformer layers n_head: Number of attention heads n_embd: Embedding dimension learning_rate: Learning rate """ import subprocess import os print("=" * 80) print("TRAINING NANOGPT ON SHAKESPEARE") print("=" * 80) print(f"Max iterations: {max_iters}") print(f"Batch size: {batch_size}") print(f"Block size: {block_size}") print(f"Layers: {n_layer}, Heads: {n_head}, Embedding: {n_embd}") print("=" * 80) # Make sure data is prepared if not os.path.exists("data/shakespeare_char/train.bin"): print("Data not found, preparing it first...") prepare_data.local() # Build training command with arguments cmd = [ "python", "train.py", "config/train_shakespeare_char.py", f"--max_iters={max_iters}", f"--eval_interval={eval_interval}", f"--batch_size={batch_size}", f"--block_size={block_size}", f"--n_layer={n_layer}", f"--n_head={n_head}", f"--n_embd={n_embd}", f"--learning_rate={learning_rate}", "--out_dir=/data/out", # Save outputs to volume "--dataset=shakespeare_char", # Important: tells sample.py where to find meta.pkl "--compile=False", # Disable compilation for faster startup ] print(f"Running: {' '.join(cmd)}") print() # Run training result = subprocess.run(cmd, capture_output=False, text=True) if result.returncode != 0: raise RuntimeError(f"Training failed with code {result.returncode}") # Copy meta.pkl to output directory for sampling 
import shutil meta_src = "data/shakespeare_char/meta.pkl" meta_dst = "/data/out/meta.pkl" if os.path.exists(meta_src): shutil.copy(meta_src, meta_dst) print(f"✓ Copied {meta_src} to {meta_dst}") # Commit the volume to save the checkpoint volume.commit() print("\n" + "=" * 80) print("✓ Training completed! Model saved to /data/out") print("=" * 80) return { "status": "completed", "max_iters": max_iters, "output_dir": "/data/out", }

    ============================================================================= SAMPLING FUNCTION¶

    In [ ]:
    Copied!
    @app.function(
        image=NANOGPT_IMAGE,
        gpu=GPU_TYPE,
        volumes=VOLUME_CONFIG,
        timeout=10 * 60,
    )
    def sample(
        num_samples: int = 5,
        max_new_tokens: int = 500,
        temperature: float = 0.8,
        start: str = "\n",
    ):
        """
        Generate text samples from the trained model.

        Args:
            num_samples: Number of samples to generate
            max_new_tokens: Length of each sample
            temperature: Sampling temperature (higher = more random)
            start: Starting prompt for generation
        """
        import os
        import shutil
        import subprocess

        os.environ["TORCH_USE_CUDA_DSA"] = "1"

        banner = "=" * 80
        print(banner)
        print("GENERATING SAMPLES FROM TRAINED MODEL")
        print(banner)
        print(f"Num samples: {num_samples}")
        print(f"Max tokens: {max_new_tokens}")
        print(f"Temperature: {temperature}")
        print(f"Start prompt: {repr(start)}")
        print(banner)

        ckpt_path = "/data/out/ckpt.pt"
        meta_path = "/data/out/meta.pkl"

        # Report whether the model artifacts from training are on the volume
        if os.path.exists(ckpt_path):
            print("✓ Found checkpoint: /data/out/ckpt.pt")
        else:
            print("✗ Checkpoint not found: /data/out/ckpt.pt")

        if os.path.exists(meta_path):
            print("✓ Found meta file: /data/out/meta.pkl")
        else:
            print("✗ Meta file not found: /data/out/meta.pkl")
            print("  Sampling will use GPT-2 encoding which will fail!")

        print()

        # sample.py looks for meta.pkl in data/{dataset}/meta.pkl first, then
        # falls back to out_dir — so mirror the volume copy into the repo tree.
        os.makedirs("data/shakespeare_char", exist_ok=True)

        local_meta = "data/shakespeare_char/meta.pkl"
        if os.path.exists(meta_path) and not os.path.exists(local_meta):
            shutil.copy(meta_path, local_meta)
            print("✓ Copied meta.pkl to data/shakespeare_char/")

        # Assemble the sampling command against the checkpoint on the volume
        cmd = [
            "python",
            "sample.py",
            "--out_dir=/data/out",
            f"--num_samples={num_samples}",
            f"--max_new_tokens={max_new_tokens}",
            f"--temperature={temperature}",
            f"--start={start}",
            "--compile=False",
        ]

        print(f"Running: {' '.join(cmd)}")
        print()

        proc = subprocess.run(cmd, capture_output=True, text=True)

        print(proc.stdout)
        if proc.stderr:
            print("STDERR:", proc.stderr)

        if proc.returncode != 0:
            raise RuntimeError(f"Sampling failed with code {proc.returncode}")

        print("\n" + banner)
        print("✓ Sampling completed!")
        print(banner)

        return {"status": "completed", "samples": proc.stdout}
    
    @app.function( image=NANOGPT_IMAGE, gpu=GPU_TYPE, volumes=VOLUME_CONFIG, timeout=10 * 60, ) def sample( num_samples: int = 5, max_new_tokens: int = 500, temperature: float = 0.8, start: str = "\n", ): """ Generate text samples from the trained model. Args: num_samples: Number of samples to generate max_new_tokens: Length of each sample temperature: Sampling temperature (higher = more random) start: Starting prompt for generation """ import subprocess import os os.environ["TORCH_USE_CUDA_DSA"] = "1" print("=" * 80) print("GENERATING SAMPLES FROM TRAINED MODEL") print("=" * 80) print(f"Num samples: {num_samples}") print(f"Max tokens: {max_new_tokens}") print(f"Temperature: {temperature}") print(f"Start prompt: {repr(start)}") print("=" * 80) # Check if model files exist if os.path.exists("/data/out/ckpt.pt"): print("✓ Found checkpoint: /data/out/ckpt.pt") else: print("✗ Checkpoint not found: /data/out/ckpt.pt") if os.path.exists("/data/out/meta.pkl"): print("✓ Found meta file: /data/out/meta.pkl") else: print("✗ Meta file not found: /data/out/meta.pkl") print(" Sampling will use GPT-2 encoding which will fail!") print() # Ensure meta.pkl exists in the data directory for sample.py to find # sample.py looks for meta.pkl in data/{dataset}/meta.pkl first, then falls back to out_dir import shutil os.makedirs("data/shakespeare_char", exist_ok=True) # Copy meta.pkl from volume to data directory if it exists if os.path.exists("/data/out/meta.pkl") and not os.path.exists( "data/shakespeare_char/meta.pkl" ): shutil.copy("/data/out/meta.pkl", "data/shakespeare_char/meta.pkl") print("✓ Copied meta.pkl to data/shakespeare_char/") # Build sampling command cmd = [ "python", "sample.py", "--out_dir=/data/out", # Read model from volume f"--num_samples={num_samples}", f"--max_new_tokens={max_new_tokens}", f"--temperature={temperature}", f"--start={start}", "--compile=False", ] print(f"Running: {' '.join(cmd)}") print() # Run sampling result = subprocess.run(cmd, 
capture_output=True, text=True) print(result.stdout) if result.stderr: print("STDERR:", result.stderr) if result.returncode != 0: raise RuntimeError(f"Sampling failed with code {result.returncode}") print("\n" + "=" * 80) print("✓ Sampling completed!") print("=" * 80) return {"status": "completed", "samples": result.stdout}

    ============================================================================= LOCAL ENTRYPOINT - Run everything in sequence¶

    In [ ]:
    Copied!
    @app.local_entrypoint()
    def main():
        """Run the complete pipeline: prepare data -> train -> sample"""
        print("🚀 Starting nanoGPT pipeline...")

        # Step 1: download + tokenize the Shakespeare dataset
        print("📁 Preparing dataset...")
        prepare_data.remote()

        # Step 2: train a small character-level GPT
        print("🏋️ Training model...")
        train.remote(max_iters=1000, eval_interval=250, batch_size=64)

        # Step 3: generate text from the trained checkpoint
        print("✨ Generating samples...")
        sample.remote(num_samples=3, max_new_tokens=300)

        print("🎉 Pipeline completed!")
    
    @app.local_entrypoint() def main(): """Run the complete pipeline: prepare data -> train -> sample""" print("🚀 Starting nanoGPT pipeline...") # Prepare data print("📁 Preparing dataset...") prepare_data.remote() # Train model print("🏋️ Training model...") train.remote(max_iters=1000, eval_interval=250, batch_size=64) # Generate samples print("✨ Generating samples...") sample.remote(num_samples=3, max_new_tokens=300) print("🎉 Pipeline completed!")
    October 5, 2025 October 5, 2025
    Copyright © 2024 Adithya S Kolavi
    Made with Material for MkDocs