For updates follow @adithya_s_k on Twitter
logo
AI Engineering Academy
TrainNanoGPTModal
Initializing search
    adithya-s-k/AI-Engineering.academy
    • Home
    • Prompt Engineering
    • RAG
    • LLM
    • Deployment
    • Agents
    • Projects
    • Blog
    adithya-s-k/AI-Engineering.academy
    • Home
    • Prompt Engineering
    • RAG
    • LLM
    • Deployment
    • Agents
    • Projects
    • Blog
    In [ ]:
    Copied!
    """
    Simple Modal script to run nanoGPT training on serverless GPUs.
    
    This demonstrates how you can take a local repository (nanoGPT) and run it
    on Modal with minimal changes - just copy the code into the image and run!
    
    Usage:
        # Prepare Shakespeare dataset and train a small GPT
        modal run FinetuneNanoGPT.py
    
        # Or run individual steps:
        modal run FinetuneNanoGPT.py::prepare_data
        modal run FinetuneNanoGPT.py::train
        modal run FinetuneNanoGPT.py::sample
    """
    
    """ Simple Modal script to run nanoGPT training on serverless GPUs. This demonstrates how you can take a local repository (nanoGPT) and run it on Modal with minimal changes - just copy the code into the image and run! Usage: # Prepare Shakespeare dataset and train a small GPT modal run FinetuneNanoGPT.py # Or run individual steps: modal run FinetuneNanoGPT.py::prepare_data modal run FinetuneNanoGPT.py::train modal run FinetuneNanoGPT.py::sample """
    In [ ]:
    Copied!
    from modal import App, Image as ModalImage, Volume
    
    from modal import App, Image as ModalImage, Volume

    ============================================================================= CONFIGURATION¶

    In [ ]:
    Copied!
    # Number of seconds in one hour; used below to express Modal timeouts.
    HOURS = 60 * 60
    # GPU type requested for the training and sampling containers.
    GPU_TYPE = "a100-40gb"  # Can be: a100-40gb, a100-80gb, l40s, t4, etc.
    
    HOURS = 60 * 60 GPU_TYPE = "a100-40gb" # Can be: a100-40gb, a100-80gb, l40s, t4, etc.

    ============================================================================= MODAL APP AND VOLUME SETUP¶

    In [ ]:
    Copied!
    # Modal app grouping the functions in this script, plus a named persistent
    # volume that keeps checkpoints/samples across runs (created on first use).
    app = App("nanogpt-training")
    volume = Volume.from_name("nanogpt-outputs", create_if_missing=True)
    
    app = App("nanogpt-training") volume = Volume.from_name("nanogpt-outputs", create_if_missing=True)
    In [ ]:
    Copied!
    # Mount the persistent volume at /data inside every container that passes
    # this mapping via the `volumes=` argument of @app.function.
    VOLUME_CONFIG = {
        "/data": volume,
    }
    
    VOLUME_CONFIG = { "/data": volume, }

    ============================================================================= IMAGE SETUP - Copy local nanoGPT repo into the image¶

    In [ ]:
    Copied!
    # Simple approach: copy the entire nanoGPT directory into the image
    # Base: slim Debian with Python 3.11, plus the dependencies nanoGPT's
    # train.py / sample.py / data prep scripts import at runtime.
    NANOGPT_IMAGE = (
        ModalImage.debian_slim(python_version="3.11")
        .pip_install(
            "torch",
            "numpy",
            "transformers",
            "datasets",
            "tiktoken",
            "tqdm",
        )
        # Copy the nanoGPT directory from local filesystem into the image
        # copy=True because we have .workdir() after this
        .add_local_dir(local_path="nanoGPT", remote_path="/root/nanoGPT", copy=True)
        # All functions below run with the repo root as their working directory,
        # so relative paths like "train.py" and "data/shakespeare_char" resolve.
        .workdir("/root/nanoGPT")
    )
    
    # Simple approach: copy the entire nanoGPT directory into the image NANOGPT_IMAGE = ( ModalImage.debian_slim(python_version="3.11") .pip_install( "torch", "numpy", "transformers", "datasets", "tiktoken", "tqdm", ) # Copy the nanoGPT directory from local filesystem into the image # copy=True because we have .workdir() after this .add_local_dir(local_path="nanoGPT", remote_path="/root/nanoGPT", copy=True) .workdir("/root/nanoGPT") )

    ============================================================================= DATA PREPARATION FUNCTION¶

    In [ ]:
    Copied!
    @app.function(
        image=NANOGPT_IMAGE,
        timeout=10 * 60,  # data prep is quick; 10 minutes is plenty
    )
    def prepare_data():
        """
        Prepare the Shakespeare dataset for character-level training.
        This downloads the data and creates train.bin and val.bin files.
        """
        import subprocess

        banner = "=" * 80
        print(banner)
        print("PREPARING SHAKESPEARE DATASET")
        print(banner)

        # Delegate to nanoGPT's own prepare script and surface its output.
        proc = subprocess.run(
            ["python", "data/shakespeare_char/prepare.py"],
            capture_output=True,
            text=True,
        )

        print(proc.stdout)
        if proc.stderr:
            print("STDERR:", proc.stderr)

        if proc.returncode != 0:
            raise RuntimeError(f"Data preparation failed with code {proc.returncode}")

        print("✓ Data preparation completed!")
        return {"status": "completed", "dataset": "shakespeare_char"}
    
    @app.function( image=NANOGPT_IMAGE, timeout=10 * 60, # 10 minutes ) def prepare_data(): """ Prepare the Shakespeare dataset for character-level training. This downloads the data and creates train.bin and val.bin files. """ import subprocess print("=" * 80) print("PREPARING SHAKESPEARE DATASET") print("=" * 80) # Run the prepare script result = subprocess.run( ["python", "data/shakespeare_char/prepare.py"], capture_output=True, text=True ) print(result.stdout) if result.stderr: print("STDERR:", result.stderr) if result.returncode != 0: raise RuntimeError(f"Data preparation failed with code {result.returncode}") print("✓ Data preparation completed!") return {"status": "completed", "dataset": "shakespeare_char"}

    ============================================================================= TRAINING FUNCTION¶

    In [ ]:
    Copied!
    @app.function(
        image=NANOGPT_IMAGE,
        gpu=GPU_TYPE,
        volumes=VOLUME_CONFIG,
        timeout=2 * HOURS,
    )
    def train(
        max_iters: int = 1000,
        eval_interval: int = 500,
        batch_size: int = 64,
        block_size: int = 256,
        n_layer: int = 6,
        n_head: int = 6,
        n_embd: int = 384,
        learning_rate: float = 1e-3,
    ):
        """
        Train a character-level GPT on Shakespeare data.

        This runs the nanoGPT training script with customizable hyperparameters.
        The trained model checkpoint will be saved to the Modal volume.

        Args:
            max_iters: Number of training iterations
            eval_interval: How often to evaluate
            batch_size: Batch size for training
            block_size: Context length
            n_layer: Number of transformer layers
            n_head: Number of attention heads
            n_embd: Embedding dimension
            learning_rate: Learning rate

        Returns:
            A summary dict with the run status, iteration count, and output dir.

        Raises:
            RuntimeError: If the training subprocess exits with a non-zero code.
        """
        import os
        import shutil
        import subprocess
        import sys

        print("=" * 80)
        print("TRAINING NANOGPT ON SHAKESPEARE")
        print("=" * 80)
        print(f"Max iterations: {max_iters}")
        print(f"Batch size: {batch_size}")
        print(f"Block size: {block_size}")
        print(f"Layers: {n_layer}, Heads: {n_head}, Embedding: {n_embd}")
        print("=" * 80)

        # Make sure data is prepared (run prepare_data in-process, not remotely)
        if not os.path.exists("data/shakespeare_char/train.bin"):
            print("Data not found, preparing it first...")
            prepare_data.local()

        # Build training command with arguments.
        # sys.executable re-invokes the exact interpreter running this function,
        # which is more robust than relying on "python" being on PATH.
        cmd = [
            sys.executable,
            "train.py",
            "config/train_shakespeare_char.py",
            f"--max_iters={max_iters}",
            f"--eval_interval={eval_interval}",
            f"--batch_size={batch_size}",
            f"--block_size={block_size}",
            f"--n_layer={n_layer}",
            f"--n_head={n_head}",
            f"--n_embd={n_embd}",
            f"--learning_rate={learning_rate}",
            "--out_dir=/data/out",  # Save outputs to volume
            "--dataset=shakespeare_char",  # Important: tells sample.py where to find meta.pkl
            "--compile=False",  # Disable compilation for faster startup
        ]

        print(f"Running: {' '.join(cmd)}")
        print()

        # Run training; stream output straight to the container logs
        result = subprocess.run(cmd, capture_output=False, text=True)

        if result.returncode != 0:
            raise RuntimeError(f"Training failed with code {result.returncode}")

        # Copy meta.pkl to output directory for sampling. train.py normally
        # creates /data/out, but guard against it being absent so the copy
        # cannot fail with a missing-directory error.
        os.makedirs("/data/out", exist_ok=True)
        meta_src = "data/shakespeare_char/meta.pkl"
        meta_dst = "/data/out/meta.pkl"
        if os.path.exists(meta_src):
            shutil.copy(meta_src, meta_dst)
            print(f"✓ Copied {meta_src} to {meta_dst}")

        # Commit the volume to save the checkpoint
        volume.commit()

        print("\n" + "=" * 80)
        print("✓ Training completed! Model saved to /data/out")
        print("=" * 80)

        return {
            "status": "completed",
            "max_iters": max_iters,
            "output_dir": "/data/out",
        }
    
    @app.function( image=NANOGPT_IMAGE, gpu=GPU_TYPE, volumes=VOLUME_CONFIG, timeout=2 * HOURS, ) def train( max_iters: int = 1000, eval_interval: int = 500, batch_size: int = 64, block_size: int = 256, n_layer: int = 6, n_head: int = 6, n_embd: int = 384, learning_rate: float = 1e-3, ): """ Train a character-level GPT on Shakespeare data. This runs the nanoGPT training script with customizable hyperparameters. The trained model checkpoint will be saved to the Modal volume. Args: max_iters: Number of training iterations eval_interval: How often to evaluate batch_size: Batch size for training block_size: Context length n_layer: Number of transformer layers n_head: Number of attention heads n_embd: Embedding dimension learning_rate: Learning rate """ import subprocess import os print("=" * 80) print("TRAINING NANOGPT ON SHAKESPEARE") print("=" * 80) print(f"Max iterations: {max_iters}") print(f"Batch size: {batch_size}") print(f"Block size: {block_size}") print(f"Layers: {n_layer}, Heads: {n_head}, Embedding: {n_embd}") print("=" * 80) # Make sure data is prepared if not os.path.exists("data/shakespeare_char/train.bin"): print("Data not found, preparing it first...") prepare_data.local() # Build training command with arguments cmd = [ "python", "train.py", "config/train_shakespeare_char.py", f"--max_iters={max_iters}", f"--eval_interval={eval_interval}", f"--batch_size={batch_size}", f"--block_size={block_size}", f"--n_layer={n_layer}", f"--n_head={n_head}", f"--n_embd={n_embd}", f"--learning_rate={learning_rate}", "--out_dir=/data/out", # Save outputs to volume "--dataset=shakespeare_char", # Important: tells sample.py where to find meta.pkl "--compile=False", # Disable compilation for faster startup ] print(f"Running: {' '.join(cmd)}") print() # Run training result = subprocess.run(cmd, capture_output=False, text=True) if result.returncode != 0: raise RuntimeError(f"Training failed with code {result.returncode}") # Copy meta.pkl to output directory for sampling 
import shutil meta_src = "data/shakespeare_char/meta.pkl" meta_dst = "/data/out/meta.pkl" if os.path.exists(meta_src): shutil.copy(meta_src, meta_dst) print(f"✓ Copied {meta_src} to {meta_dst}") # Commit the volume to save the checkpoint volume.commit() print("\n" + "=" * 80) print("✓ Training completed! Model saved to /data/out") print("=" * 80) return { "status": "completed", "max_iters": max_iters, "output_dir": "/data/out", }

    ============================================================================= SAMPLING FUNCTION¶

    In [ ]:
    Copied!
    @app.function(
        image=NANOGPT_IMAGE,
        gpu=GPU_TYPE,
        volumes=VOLUME_CONFIG,
        timeout=10 * 60,
    )
    def sample(
        num_samples: int = 5,
        max_new_tokens: int = 500,
        temperature: float = 0.8,
        start: str = "\n",
    ):
        """
        Generate text samples from the trained model.

        Args:
            num_samples: Number of samples to generate
            max_new_tokens: Length of each sample
            temperature: Sampling temperature (higher = more random)
            start: Starting prompt for generation
        """
        import os
        import shutil
        import subprocess

        os.environ["TORCH_USE_CUDA_DSA"] = "1"

        banner = "=" * 80
        print(banner)
        print("GENERATING SAMPLES FROM TRAINED MODEL")
        print(banner)
        print(f"Num samples: {num_samples}")
        print(f"Max tokens: {max_new_tokens}")
        print(f"Temperature: {temperature}")
        print(f"Start prompt: {repr(start)}")
        print(banner)

        ckpt_path = "/data/out/ckpt.pt"
        meta_path = "/data/out/meta.pkl"

        # Report whether the model artifacts from training are on the volume
        if os.path.exists(ckpt_path):
            print("✓ Found checkpoint: /data/out/ckpt.pt")
        else:
            print("✗ Checkpoint not found: /data/out/ckpt.pt")

        if os.path.exists(meta_path):
            print("✓ Found meta file: /data/out/meta.pkl")
        else:
            print("✗ Meta file not found: /data/out/meta.pkl")
            print("  Sampling will use GPT-2 encoding which will fail!")

        print()

        # sample.py looks for meta.pkl in data/{dataset}/meta.pkl first, then
        # falls back to out_dir — so mirror the volume copy into the repo tree.
        os.makedirs("data/shakespeare_char", exist_ok=True)

        local_meta = "data/shakespeare_char/meta.pkl"
        if os.path.exists(meta_path) and not os.path.exists(local_meta):
            shutil.copy(meta_path, local_meta)
            print("✓ Copied meta.pkl to data/shakespeare_char/")

        # Assemble the sampling command against the checkpoint on the volume
        cmd = [
            "python",
            "sample.py",
            "--out_dir=/data/out",
            f"--num_samples={num_samples}",
            f"--max_new_tokens={max_new_tokens}",
            f"--temperature={temperature}",
            f"--start={start}",
            "--compile=False",
        ]

        print(f"Running: {' '.join(cmd)}")
        print()

        proc = subprocess.run(cmd, capture_output=True, text=True)

        print(proc.stdout)
        if proc.stderr:
            print("STDERR:", proc.stderr)

        if proc.returncode != 0:
            raise RuntimeError(f"Sampling failed with code {proc.returncode}")

        print("\n" + banner)
        print("✓ Sampling completed!")
        print(banner)

        return {"status": "completed", "samples": proc.stdout}
    
    @app.function( image=NANOGPT_IMAGE, gpu=GPU_TYPE, volumes=VOLUME_CONFIG, timeout=10 * 60, ) def sample( num_samples: int = 5, max_new_tokens: int = 500, temperature: float = 0.8, start: str = "\n", ): """ Generate text samples from the trained model. Args: num_samples: Number of samples to generate max_new_tokens: Length of each sample temperature: Sampling temperature (higher = more random) start: Starting prompt for generation """ import subprocess import os os.environ["TORCH_USE_CUDA_DSA"] = "1" print("=" * 80) print("GENERATING SAMPLES FROM TRAINED MODEL") print("=" * 80) print(f"Num samples: {num_samples}") print(f"Max tokens: {max_new_tokens}") print(f"Temperature: {temperature}") print(f"Start prompt: {repr(start)}") print("=" * 80) # Check if model files exist if os.path.exists("/data/out/ckpt.pt"): print("✓ Found checkpoint: /data/out/ckpt.pt") else: print("✗ Checkpoint not found: /data/out/ckpt.pt") if os.path.exists("/data/out/meta.pkl"): print("✓ Found meta file: /data/out/meta.pkl") else: print("✗ Meta file not found: /data/out/meta.pkl") print(" Sampling will use GPT-2 encoding which will fail!") print() # Ensure meta.pkl exists in the data directory for sample.py to find # sample.py looks for meta.pkl in data/{dataset}/meta.pkl first, then falls back to out_dir import shutil os.makedirs("data/shakespeare_char", exist_ok=True) # Copy meta.pkl from volume to data directory if it exists if os.path.exists("/data/out/meta.pkl") and not os.path.exists( "data/shakespeare_char/meta.pkl" ): shutil.copy("/data/out/meta.pkl", "data/shakespeare_char/meta.pkl") print("✓ Copied meta.pkl to data/shakespeare_char/") # Build sampling command cmd = [ "python", "sample.py", "--out_dir=/data/out", # Read model from volume f"--num_samples={num_samples}", f"--max_new_tokens={max_new_tokens}", f"--temperature={temperature}", f"--start={start}", "--compile=False", ] print(f"Running: {' '.join(cmd)}") print() # Run sampling result = subprocess.run(cmd, 
capture_output=True, text=True) print(result.stdout) if result.stderr: print("STDERR:", result.stderr) if result.returncode != 0: raise RuntimeError(f"Sampling failed with code {result.returncode}") print("\n" + "=" * 80) print("✓ Sampling completed!") print("=" * 80) return {"status": "completed", "samples": result.stdout}

    ============================================================================= LOCAL ENTRYPOINT - Run everything in sequence¶

    In [ ]:
    Copied!
    @app.local_entrypoint()
    def main():
        """Run the complete pipeline: prepare data -> train -> sample"""
        print("🚀 Starting nanoGPT pipeline...")

        # Step 1: download + tokenize the Shakespeare dataset
        print("📁 Preparing dataset...")
        prepare_data.remote()

        # Step 2: train a small character-level GPT
        print("🏋️ Training model...")
        train.remote(max_iters=1000, eval_interval=250, batch_size=64)

        # Step 3: generate text from the trained checkpoint
        print("✨ Generating samples...")
        sample.remote(num_samples=3, max_new_tokens=300)

        print("🎉 Pipeline completed!")
    
    @app.local_entrypoint() def main(): """Run the complete pipeline: prepare data -> train -> sample""" print("🚀 Starting nanoGPT pipeline...") # Prepare data print("📁 Preparing dataset...") prepare_data.remote() # Train model print("🏋️ Training model...") train.remote(max_iters=1000, eval_interval=250, batch_size=64) # Generate samples print("✨ Generating samples...") sample.remote(num_samples=3, max_new_tokens=300) print("🎉 Pipeline completed!")
    October 5, 2025 October 5, 2025
    Copyright © 2024 Adithya S Kolavi
    Made with Material for MkDocs