Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from GitHub.

Mixture of Experts (MoE)¶

In deep learning, the Mixture of Experts (MoE) model is a powerful architectural technique that enhances model efficiency by dynamically selecting specialized sub-networks — or "experts" — for different input data. Instead of relying on a single monolithic neural network to process all types of inputs, MoE divides the computational workload among multiple smaller networks, each trained to handle specific patterns or distributions. A gating network determines which experts to activate for a given input, ensuring that only the most relevant ones contribute to the final prediction.

One of the key advantages of MoE is its ability to scale model capacity without a proportional increase in computation cost. Since only a subset of experts is active for each input, the model can maintain high expressiveness while keeping inference computationally efficient. This makes MoE particularly beneficial for large-scale applications, such as natural language processing (NLP), speech recognition, and computer vision, where diverse patterns exist within the data. Google's Switch Transformer and other large-scale language models have leveraged MoE to improve performance while managing computational overhead.

Another major benefit of MoE is its adaptability to heterogeneous data distributions. In real-world tasks, data often follows a multi-modal distribution, meaning different sub-populations require specialized processing. MoE naturally captures this by allowing different experts to learn distinct aspects of the data. This approach improves generalization and reduces the risk of overfitting, as each expert focuses on a more manageable subset of the data distribution rather than attempting to generalize across all possible inputs.

Despite its advantages, MoE also introduces challenges, such as training instability and load balancing issues among experts. If the gating network assigns most inputs to a small subset of experts, the model may fail to utilize its full capacity. Researchers have proposed techniques like regularization, auxiliary losses, and improved gating mechanisms to mitigate these issues. Overall, the MoE layer is a powerful tool for improving the scalability and efficiency of deep learning models, making it a popular choice for modern AI architectures.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been imported successfully.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

Generate Example Data¶

Throughout this notebook, we illustrate the basic idea of an MoE layer and its inner workings using a small example batch that forms the input for the MoE layer. To keep it simple, the batch contains $4$ data samples, each sample represented by a vector of size $8$.

In [2]:
batch_size, hidden_size = 4, 8

torch.manual_seed(0)
batch = torch.rand((batch_size, hidden_size))

print(batch)
tensor([[0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964],
        [0.4556, 0.6323, 0.3489, 0.4017, 0.0223, 0.1689, 0.2939, 0.5185],
        [0.6977, 0.8000, 0.1610, 0.2823, 0.6816, 0.9152, 0.3971, 0.8742],
        [0.4194, 0.5529, 0.9527, 0.0362, 0.1852, 0.3734, 0.3051, 0.9320]])

MoE: Basic Model Architecture¶

To give a first overview, the figure below illustrates the basic architecture of a Mixture-of-Experts model. This architecture features two main components:

  • Experts: The experts are separate networks that process the input $\mathbf{x}$. The final output of the MoE model is calculated as the weighted sum of the outputs of all activated experts. The weights of the sum as well as the set of activated experts are determined by the gating mechanism. In the figure below, we assume that we have $n$ experts $E_1$, $E_2$, ..., $E_n$, and an input $\mathbf{x}$ is passed to $2$ selected experts.

  • Gating Mechanism: The gating mechanism is a submodel that determines to which experts input $\mathbf{x}$ should be passed.

Its output is a probability distribution over all experts $G(\mathbf{x})$, and input $\mathbf{x}$ is passed to the experts $E_i$ with the highest probabilities $G(\mathbf{x})_i$. We can distinguish between two components in the gating mechanism (although they are often considered together). The Gate submodel calculates the initial distribution $G(\mathbf{x})_i$. The Router submodel then uses $G(\mathbf{x})_i$ to determine to which experts the input $\mathbf{x}$ should be passed based on various routing strategies; this may include a re-scaling of the probabilities if only a subset of experts are involved in the final output.

The parameters in both the experts and the gating mechanism are learned during the training.

Given this overall architecture setup, the output $\mathbf{y}$ of an MoE model is calculated as:

$$\large \mathbf{y} = \sum_{i=1}^n G(\mathbf{x})_i E_i(\mathbf{x})\quad ,\ \text{with}\ \sum_{i=1}^n G(\mathbf{\mathbf{x}})_i = 1 $$

Where $E_i(\mathbf{x})$ is the output of expert $E_i$ given the input $\mathbf{x}$. The interesting case is where $G(\mathbf{x})_i = 0$ for an expert $E_i$, either because the gate probability is already $0$ or is set to $0$ as part of a routing strategy. For example, the figure above illustrates the routing strategy that sends input $\mathbf{x}$ to only the two experts with the highest probabilities calculated by the gate. To accomplish this, all other probabilities are set to $0$ — indicated by the red bars of the probability distribution — and the remaining two probabilities are re-scaled to again sum up to $1$ (green bars). Naturally, for any $G(\mathbf{x})_i = 0$, there is no need to pass the input $\mathbf{x}$ to the respective expert $E_i$. In short, for this and similar routing strategies, only a part of the overall network is activated and utilized.
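To make this formula concrete, the short sketch below computes $\mathbf{y}$ for made-up numbers with $n=3$ experts and $4$-dimensional expert outputs (the values are purely illustrative and not tied to the example batch above):

# Toy example of y = sum_i G(x)_i * E_i(x) with made-up numbers
gate_probs = torch.tensor([0.0, 0.7, 0.3])   # G(x), sums to 1; expert E_1 is inactive
expert_outputs = torch.tensor([
    [ 0.1,  0.2, -0.3, 0.4],                 # E_1(x), never needed since G(x)_1 = 0
    [ 0.5, -0.1,  0.2, 0.0],                 # E_2(x)
    [-0.2,  0.3,  0.1, 0.6],                 # E_3(x)
])
y = (gate_probs.unsqueeze(1) * expert_outputs).sum(dim=0)
print(y)   # roughly tensor([0.2900, 0.0200, 0.1700, 0.1800]) = 0.7*E_2(x) + 0.3*E_3(x)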


MoE: Core Components¶

While MoE models can become quite complex in practice, they typically all feature the same basic core components.

Experts¶

An expert in the context of a Mixture-of-Experts layer is a subnetwork or submodel within a larger model. Each of these subnetworks independently performs its own computation, and their results are combined to create the final output of the MoE layer. In principle, an expert subnetwork can be any kind of architecture, but experts commonly use simple ones such as basic feed-forward neural networks (FFNNs). To give an example, the code cell below implements a very simple expert as a FFNN with two linear layers. The first linear layer transforms the input of size hidden_size into a higher dimensional space (here: hidden_size*4). After applying the ReLU activation function, the second linear layer transforms the result back to the initial input size. As a side note: this expert mimics the FFNN layer in the transformer architecture.

In [3]:
class Expert(nn.Module):

    def __init__(self, hidden_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(hidden_size, hidden_size*4),
            nn.ReLU(),
            nn.Linear(hidden_size*4, hidden_size)
        )

    def forward(self, x):
        return self.net(x)

Traditionally, most MoE models use homogeneous experts — that is, each expert uses the same architecture and therefore has the same capacity. In contrast, MoE models with heterogeneous experts rely on different architectures for the set of experts. While this heterogeneity allows for more specialized experts — which can be very useful when working with multimodal data (e.g., text & images) or multitask learning — MoE models with heterogeneous experts are generally more complex and more difficult to train. In this introductory notebook, we therefore limit ourselves to homogeneous experts.
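Just to illustrate the idea, a heterogeneous set of experts could look like the sketch below, where the (hypothetical) FlexibleExpert class only varies the expansion factor of the hidden layer; this is purely illustrative and not used in the rest of the notebook:

# Hypothetical sketch of heterogeneous experts that differ in their expansion factor
class FlexibleExpert(nn.Module):
    def __init__(self, hidden_size, expansion):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(hidden_size, hidden_size*expansion),
            nn.ReLU(),
            nn.Linear(hidden_size*expansion, hidden_size)
        )
    def forward(self, x):
        return self.net(x)

# Three experts with different capacities (expansion factors 2, 4, and 8)
heterogeneous_experts = nn.ModuleList(
    [FlexibleExpert(hidden_size, expansion) for expansion in (2, 4, 8)]
)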

With our Expert class implementation, we can create an expert subnetwork as shown in the code cell below. Of course, we specify the input size of the expert as hidden_size which is the expected size of the input of our MoE model.

In [4]:
expert = Expert(hidden_size)

print(expert)
Expert(
  (net): Sequential(
    (0): Linear(in_features=8, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=8, bias=True)
  )
)

Let's use our example batch as input for the newly created expert:

In [5]:
expert_output = expert(batch)

print(expert_output.shape)
torch.Size([4, 8])

Since each output of the expert is of the same shape as its input, we naturally get a tensor with the same shape as our example batch.

Gating Mechanism¶

The gating mechanism is used to select which experts will be activated for a given input. For example, given our example batch of $4$ samples, the gating mechanism may pass the first two samples to expert $E_2$, the third sample to expert $E_5$, and the fourth sample to expert $E_1$. Typically, this is achieved by using a gating network that takes the input and outputs a set of weights for each expert. The gating network assigns a probability distribution over the experts, determining how much each expert will contribute to the final output. Commonly, only a subset of experts is activated in forward passes to save computation and reduce model complexity.

Although often implemented as a single network component, in the following we distinguish between two subcomponents:

  • Gate: The gate calculates the probability distribution over the experts. For example, in a model with $6$ experts, the gate could output weights, e.g., softmax probabilities, like $[0.1, 0.1, 0.0, 0.5, 0.2, 0.1]$.
  • Router: Based on the weights from the gate, the router then decides to which expert to actually pass an input. This can be done in very different ways which are commonly referred to as different routing strategies. In this notebook, we cover two very fundamental strategies.

In the following, we briefly introduce the concept of a gate, and cover the router when introducing and comparing different routing strategies in a separate section.

Like an expert, the gate is also just "some" subnetwork. And again, while any architecture is possible, gates also typically utilize simple architectures such as FFNNs. The class Gate in the code cell below provides an example implementation for a gate based on a simple FFNN with $2$ linear layers. Most importantly, the last linear layer has an output size reflecting the number of experts num_experts to interpret the output of the gate as the required probability distribution over the experts.

In [6]:
class Gate(nn.Module):
    def __init__(self, hidden_size, num_experts):
        super().__init__()
        # Define a basic Feed Forward Network as the gate
        self.net = nn.Sequential(
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            nn.Linear(hidden_size//2, num_experts),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.net(x)

In the following, we assume that our MoE model will consist of $6$ experts — of course, feel free to change this number. With num_experts defined, we can create a gate subnetwork based on our Gate class as shown in the code cell below. Again, we set a random seed to ensure consistent outputs — without setting the seed, the random initialization of the weights in the linear layers would vary and hence the output would differ.

In [7]:
num_experts = 6

torch.manual_seed(11)
gate = Gate(hidden_size, num_experts)

print(gate)
Gate(
  (net): Sequential(
    (0): Linear(in_features=8, out_features=4, bias=True)
    (1): ReLU()
    (2): Linear(in_features=4, out_features=6, bias=True)
    (3): Softmax(dim=-1)
  )
)

So let's give our example batch to the gate and look at the output.

In [8]:
gate_output = gate(batch)

print(gate_output)
tensor([[0.1974, 0.1054, 0.1133, 0.1181, 0.1990, 0.2668],
        [0.2016, 0.1062, 0.1127, 0.1197, 0.2015, 0.2584],
        [0.1961, 0.1022, 0.1130, 0.1183, 0.1975, 0.2730],
        [0.2046, 0.0972, 0.1106, 0.1229, 0.2007, 0.2640]],
       grad_fn=<SoftmaxBackward0>)

The shape of the output of the gate is (batch_size, num_experts) — here: $(\text{4, 6})$ — since we get a probability distribution over the experts for each of the $4$ data samples in our example batch (i.e., in each row, all values sum up to $1$). For example, when looking at the first row representing the probability distribution for the first sample, we can see that the sixth (last) entry has the highest probability. Therefore, a basic routing strategy may decide to pass the first sample to expert $E_6$.
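As a quick (optional) sanity check of these two observations, we can sum each row of the gate output and look up the index of the largest probability per sample:

# Each row should sum up to 1 (up to rounding), since each row is a softmax output
print(gate_output.sum(dim=-1))
# Index of the largest probability per row; given the values printed above,
# this should be 5 (i.e., expert E_6) for every sample in the batch
print(gate_output.argmax(dim=-1))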


Basic Routing Strategies¶

Routing strategies refer to the decision to which expert(s) an input $\mathbf{x}$ is passed based on the initial probability distribution from the gate. While a wide range of such strategies has been proposed — many include additional optimizations to ensure a more stable and faster training — here we focus on the most fundamental distinction of routing strategies: dense MoE and sparse MoE, that is, sending input $\mathbf{x}$ to all experts (dense) or only a subset of experts (sparse).

Dense Mixture of Experts¶

In a dense Mixture of Experts, an input $\mathbf{x}$ is passed to all experts, and all their outputs are part of the weighted sum to calculate $\mathbf{y}$. In other words, generally, $\forall i: G(\mathbf{x})_i > 0$. This makes dense MoE models very easy to implement since no actual routing in terms of selecting individual experts is performed. Dense MoE models therefore do not require a dedicated router component, and only rely on the probability distribution $G(\mathbf{x})$ as the output of the gate.

Basic Implementation¶

The code cell below implements a very basic dense MoE model. Notice that the only components are the gate and the list of experts.

In [9]:
class DenseMoE(nn.Module):

    def __init__(self, hidden_size, num_experts):
        super().__init__()
        self.num_experts = num_experts

        # Create gate
        self.gate = Gate(hidden_size, num_experts)
        
        # Create the list of experts
        self.experts = nn.ModuleList([Expert(hidden_size) for _ in range(self.num_experts)])


    def forward(self, x):
        # (1) Calculate probability distribution G(x)
        gate_probs = self.gate(x) # (batch_size, num_experts)

        # (2) Get outputs from all experts and collect them in a single tensor
        outputs = torch.stack([expert(x) for expert in self.experts], dim=-1)
        # (batch_size, hidden_size, num_experts)
        
        # (3) Adjust shape of gate tensor to enable calculation of weighted sum
        gate_probs = gate_probs.unsqueeze(dim=1) # (batch_size, 1, num_experts)
        
        # (4) Calculate weighted sum of outputs based on expert probabilities
        output = torch.sum(outputs*gate_probs, dim=-1) # (batch_size, hidden_size)
        
        return output

To better understand how the final output of this model is calculated, let's go through the forward method step by step:

(1) Calculate Gate Probabilities¶

We first calculate the probability distribution $G(x)$ over all experts for the given input $\mathbf{x}$:

gate_probs = self.gate(x)

We have already seen the output for our example batch above, but let's still have a look at this intermediate result of our dense MoE Model.

$$\large \text{gate\_probs} = \begin{bmatrix} 0.197 & 0.105 & 0.113 & 0.118 & 0.199 & 0.267 \\ 0.202 & 0.106 & 0.113 & 0.120 & 0.202 & 0.258 \\ 0.196 & 0.102 & 0.113 & 0.118 & 0.198 & 0.273 \\ 0.205 & 0.097 & 0.111 & 0.123 & 0.201 & 0.264 \\ \end{bmatrix} $$
(2) Calculate & Collect Outputs from all Experts¶

Since we have a dense MoE model, the input batch (i.e., all training samples within the batch) is passed to all experts. This makes the implementation very easy:

outputs = torch.stack([expert(x) for expert in self.experts], dim=-1)

The stack() method in PyTorch is used to combine a sequence of tensors along a new dimension. It takes a list or tuple of tensors (all with the same shape) and stacks them together, creating one higher-dimensional tensor. In our case, each expert output is a 2D tensor with a shape $(\text{4, 8})$ reflecting the batch size (i.e., $4$) and the size of the feature vectors (i.e., $8$). Thus stacking all output tensors from our $6$ experts results in a tensor of shape $(\text{4, 8, 6})$, which is no longer trivial to visualize.
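If the stacking step is not immediately intuitive, the small shape check below (using dummy tensors in place of the actual expert outputs) may help:

# Shape check for torch.stack using dummy expert outputs
dummy_outputs = [torch.rand(batch_size, hidden_size) for _ in range(num_experts)]
stacked = torch.stack(dummy_outputs, dim=-1)
print(stacked.shape)   # torch.Size([4, 8, 6]) = (batch_size, hidden_size, num_experts)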

(3) Prepare Gate Probabilities¶

To calculate the weighted sum, we first need to bring the gate probabilities — which represent the weights for the weighted sum — into a more convenient shape for further processing:

gate_probs = gate_probs.unsqueeze(dim=1)

This operation converts the gate probabilities from a $(\text{batch\_size, num\_experts})$ tensor to a $(\text{batch\_size, 1, num\_experts})$ tensor. This has the advantage that we can calculate the weighted sum using a simple pointwise multiplication operation between the probability tensor and the output tensor (followed by a sum operation).
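The pointwise multiplication works because the singleton dimension is broadcast across the hidden dimension. A small sketch with dummy tensors illustrates the shapes involved:

# Broadcasting sketch: (4, 1, 6) * (4, 8, 6) -> (4, 8, 6), then summing over the experts
dummy_probs = torch.rand(batch_size, num_experts).unsqueeze(dim=1)   # (4, 1, 6)
dummy_outputs = torch.rand(batch_size, hidden_size, num_experts)     # (4, 8, 6)
weighted_sum = torch.sum(dummy_outputs * dummy_probs, dim=-1)
print(weighted_sum.shape)   # torch.Size([4, 8])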

(4) Calculate Final Output as Weighted Sum of all Expert Outputs¶

In the last step, we can now calculate the weighted sum of the outputs from all experts to get the final output of our dense MoE model:

torch.sum(outputs*gate_probs, dim=-1)

This operation calculates and returns the final output $\mathbf{y} = \sum_{i=1}^n G(\mathbf{x})_i E_i(\mathbf{x})$. Note that outputs*gate_probs calculates the elementwise multiplication between the probability and the output tensor (i.e., the Hadamard product). The Hadamard product assumes that both input tensors have the same shape, but recall that the shape of outputs is $(\text{batch\_size, hidden\_size, num\_experts})$ while the shape of gate_probs is $(\text{batch\_size, 1, num\_experts})$. This operation still works here, however, since both shapes are compatible with respect to the concept of broadcasting. For our example batch and gate, we get the following final output tensor:

$$\large \text{output} = \begin{bmatrix} -0.097 & -0.102 & 0.098 & -0.045 & 0.127 & 0.045 & 0.139 & 0.131 \\ -0.038 & -0.085 & 0.074 & 0.006 & 0.074 & 0.023 & 0.092 & 0.040 \\ -0.089 & -0.109 & 0.119 & -0.036 & 0.177 & 0.049 & 0.160 & 0.142 \\ -0.038 & -0.102 & 0.102 & -0.024 & 0.152 & 0.012 & 0.143 & 0.067 \\ \end{bmatrix} $$

For a practical example, let's create an instance of class DenseMoE:

In [10]:
torch.manual_seed(11)
dense_moe = DenseMoE(hidden_size, num_experts)

print(dense_moe)
DenseMoE(
  (gate): Gate(
    (net): Sequential(
      (0): Linear(in_features=8, out_features=4, bias=True)
      (1): ReLU()
      (2): Linear(in_features=4, out_features=6, bias=True)
      (3): Softmax(dim=-1)
    )
  )
  (experts): ModuleList(
    (0-5): 6 x Expert(
      (net): Sequential(
        (0): Linear(in_features=8, out_features=32, bias=True)
        (1): ReLU()
        (2): Linear(in_features=32, out_features=8, bias=True)
      )
    )
  )
)

We can now pass our example batch to the dense MoE model to calculate the output.

In [11]:
dense_moe_output = dense_moe(batch)

print(dense_moe_output.shape)
print(dense_moe_output)
torch.Size([4, 8])
tensor([[-0.0966, -0.1017,  0.0983, -0.0452,  0.1274,  0.0453,  0.1386,  0.1311],
        [-0.0383, -0.0851,  0.0737,  0.0064,  0.0743,  0.0227,  0.0922,  0.0402],
        [-0.0893, -0.1092,  0.1192, -0.0358,  0.1767,  0.0488,  0.1595,  0.1423],
        [-0.0378, -0.1017,  0.1023, -0.0241,  0.1521,  0.0122,  0.1428,  0.0672]],
       grad_fn=<SumBackward1>)

Since each expert returns an output of the same shape as the input, the weighted sum is naturally then also of the same shape, here: $(\text{batch\_size, hidden\_size})$.
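As an optional sanity check, the output above can be reproduced by computing the weighted sum by hand, looping over the experts of our dense_moe instance:

# Recompute the dense MoE output manually and compare it to the forward() result
with torch.no_grad():
    probs = dense_moe.gate(batch)                     # (batch_size, num_experts)
    manual = torch.zeros_like(batch)                  # (batch_size, hidden_size)
    for i, expert in enumerate(dense_moe.experts):
        manual += probs[:, i:i+1] * expert(batch)     # weight each expert's output
    print(torch.allclose(manual, dense_moe(batch), atol=1e-6))   # should print True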

Discussion & Limitations¶

One key advantage of dense MoE models is their ability to leverage the full capacity of the network during inference, which can lead to improved performance on complex tasks. Since every expert is involved in processing every input, the model can learn highly specialized and fine-grained representations across its expert subnetworks. This often results in better generalization and richer feature extraction, particularly in tasks requiring nuanced understanding such as language modeling, machine translation, or image generation. Another advantage is the simplicity in implementation and debugging. Because all experts are active for each input, dense MoE models avoid the challenges associated with expert selection mechanisms in sparse models (see below). This means the model behaves more like a conventional feedforward architecture, which can make it easier to train and understand in practice. Also, gradient flow through all experts ensures every sub-network gets updated at every step, reducing the chance of “lazy” or under-trained experts.

However, dense MoE models come with significant disadvantages, primarily related to computational inefficiency. Activating all experts for every input increases the computational and memory demands drastically, especially as the number of experts grows. This undermines one of the original motivations of MoE architectures — to achieve high capacity without linearly increasing computation. As a result, dense MoE models can become prohibitively expensive for large-scale deployments, especially in environments with limited resources or strict latency constraints. Another downside is the potential for redundancy among experts. Since all experts are used for every input, there's a risk that some of them may end up learning overlapping or less distinct functions. This lack of specialization can diminish the modularity benefits typically associated with MoE designs. Thus, while dense MoE models offer simplicity and performance benefits in some contexts, they may not be the optimal choice when scalability and efficiency are critical.

Sparse Mixture of Experts¶

In a sparse Mixture of Experts, an input $\mathbf{x}$ is typically no longer passed to all experts. Instead, different training samples may be passed to different subsets of experts. As we will see, this makes implementing sparse MoE models more challenging compared to dense MoE models. In particular, sparse MoE models feature a router that implements a specific routing strategy, deciding which training samples get passed to which experts.

Basic Implementation¶

The code cell below implements a very basic top-k router, where $k$ refers to the number of experts a training sample is passed to. In a nutshell, the router takes as input the gate probabilities and performs two basic steps. It first identifies the top-k probabilities for each training sample as well as the indices of the corresponding experts. The router then updates the gate probabilities by (a) setting the probabilities of all experts a training sample is not passed to $0$, and (b) rescaling the probabilities of the remaining top-k experts so that they sum up to $1$ again. Note that this simple implementation only performs various tensor operations (detailed below) and does not feature any trainable parameters.

The router implementation returns both the router probabilities (i.e., the recalculated gate probabilities) and the indices of the top-k experts a training sample is passed to.

In [12]:
class Router(nn.Module):

    def __init__(self, top_k):
        super().__init__()
        self.top_k = top_k

    def forward(self, gate_probs):
        # (1) Get initial gate probabilities and indices of top-k experts
        topk_probs, topk_indices = torch.topk(gate_probs, self.top_k, dim=-1)

        # (2) Create a tensor that will hold all final probabilities
        # Initialize tensor with -infinity => 0 after softmax
        router_probs = torch.full_like(gate_probs, float('-inf'))  # (batch_size, num_experts)

        # (3) Copy only the top-k probabilities over into the tensor
        router_probs.scatter_(-1, topk_indices, topk_probs)

        # (4) Use softmax to rescale probabilities so they sum up to 1 again
        router_probs = F.softmax(router_probs, dim=-1)

        # Return probabilities and the indices of the top-k experts
        return router_probs, topk_indices

In practice, very common choices for the number of activated experts are $k=1$ and $k=2$. Let's create a top-2 router as it helps to better understand the individual operations when looking at the intermediate outputs (see below).

In [13]:
top_k = 2

router = Router(top_k=top_k)

Let's first apply the router to our gate probabilities before we look more closely at what is going on under the hood.

In [14]:
router_probs, topk_indices = router(gate_output)

print(router_probs.shape)
print(topk_indices.shape)
torch.Size([4, 6])
torch.Size([4, 2])
(1) Find the top-k gate probabilities and expert indices¶

Given the gate probabilities gate_probs, we first need to find the k-highest probabilities for each training sample and the corresponding experts:

topk_probs, topk_indices = torch.topk(gate_probs, self.top_k, dim=-1)

The topk() method in PyTorch is used to retrieve the top $k$ highest (or lowest, if specified) elements from a tensor along a particular dimension. It returns a named tuple containing two tensors: one with the values of the top-k elements and another with their indices. You can specify the dimension along which to retrieve the top elements using the dim parameter, and whether to return the largest or smallest elements with the Boolean argument largest (default is True). Since we want to get the largest values with respect to the gate probabilities — which is the "last" dimension of tensor gate_probs — we can simply use dim=-1 to specify this dimension.
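As a tiny standalone example (with made-up values), topk() behaves as follows:

# topk() returns both the k largest values and their indices along the given dimension
values, indices = torch.topk(torch.tensor([0.1, 0.4, 0.2, 0.3]), k=2, dim=-1)
print(values)    # tensor([0.4000, 0.3000])
print(indices)   # tensor([1, 3])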

Let's first look at the topk_probs, i.e., the tensor holding the k-highest probability values:

$$\large \text{topk\_probs} = \begin{bmatrix} 0.267 & 0.199 \\ 0.258 & 0.202 \\ 0.273 & 0.198 \\ 0.264 & 0.205 \\ \end{bmatrix} $$

The shape of the tensor is $(\text{4, 2})$ since we have $4$ training samples and set $k=2$. Of course, since we simply removed all other probabilities, the remaining probabilities in each row no longer sum up to $1$. This tensor also does not tell us which experts these top-2 probabilities refer to. For this we need to look at tensor topk_indices:

$$\large \text{topk\_indices} = \begin{bmatrix} 5 & 4 \\ 5 & 0 \\ 5 & 4 \\ 5 & 0 \\ \end{bmatrix} $$

Again, the shape of topk_indices is $(\text{4, 2})$ given the batch size and our choice of $k=2$. The values in this tensor now give us the indices of the two experts with the highest probabilities. For example, the first row [5 4] tells us that training sample #1 should be passed to Experts $E_6$ and $E_5$ (note that the indices are in the range $0..n\!-\!1$, so $E_1$ has index $0$, $E_2$ has index $1$, and so on). We can also see that all four training samples need to be passed to $E_6$, while two samples are also passed to $E_5$ and the other two samples to $E_1$.

(2) Initialize Output Tensor¶

Since the output of the router has the same shape as its input (i.e., the gate probabilities), we can initialize the output tensor router_probs to match the input tensor gate_probs, setting all values to $-\infty$ using:

router_probs = torch.full_like(gate_probs, float('-inf'))

The full_like() method in PyTorch creates a new tensor with the same shape, data type, and device as a given input tensor, but filled with a specified scalar value. It's particularly useful when you need to initialize a tensor that mirrors the size and type of another tensor but contains a constant value instead of copying the original data. This operation will give us the following tensor:

$$\large \text{router\_probs} = \begin{bmatrix} -\infty & -\infty & -\infty & -\infty & -\infty & -\infty \\ -\infty & -\infty & -\infty & -\infty & -\infty & -\infty \\ -\infty & -\infty & -\infty & -\infty & -\infty & -\infty \\ -\infty & -\infty & -\infty & -\infty & -\infty & -\infty \\ \end{bmatrix} $$
(3) Copy over Top-k Probabilities¶

We can now copy over the top-k probabilities stored in tensor topk_probs to the output tensor router_probs as follows:

router_probs.scatter_(-1, topk_indices, topk_probs)

The scatter_() method is an in-place operation that writes values from a source tensor into a target tensor at specific indices along a given dimension. The underscore _ indicates that it modifies the original tensor directly. It's especially useful for tasks like assigning values to specific positions. The method takes three main arguments: dim (the axis along which to scatter), index (a tensor containing the indices where values should be placed), and src (the source tensor with the values to write). For each location specified in index, scatter_() places the corresponding value from src into the original tensor along the specified dimension. For our example, this operation yields:

$$\large \text{router\_probs} = \begin{bmatrix} -\infty & -\infty & -\infty & -\infty & 0.199 & 0.267 \\ 0.202 & -\infty & -\infty & -\infty & -\infty & 0.258 \\ -\infty & -\infty & -\infty & -\infty & 0.198 & 0.273 \\ 0.205 & -\infty & -\infty & -\infty & -\infty & 0.264 \\ \end{bmatrix} $$

In other words, we now have a tensor that looks like the input tensor gate_probs, except that all probabilities that were not part of the top-k probabilities are now $-\infty$.

(4) Recalculate Gate Probabilities¶

Using $-\infty$ as a placeholder value to represent the probability of "unused" experts makes it very easy to recalculate the top-k probabilities so they sum up to $1$ by applying the Softmax function to each row in the tensor:

router_probs = F.softmax(router_probs, dim=-1)

The Softmax function maps $-\infty$ to $0$ and normalizes all other values such that they sum up to $1$, giving us the final output tensor:

$$\large \text{router\_probs} = \begin{bmatrix} 0 & 0 & 0 & 0 & 0.483 & 0.517 \\ 0.486 & 0 & 0 & 0 & 0 & 0.514 \\ 0 & 0 & 0 & 0 & 0.481 & 0.519 \\ 0.485 & 0 & 0 & 0 & 0 & 0.515 \\ \end{bmatrix} $$
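We can verify this behavior of the Softmax function on a small made-up row that mimics the first training sample above:

# Softmax assigns probability 0 to -inf entries and rescales the rest to sum up to 1
row = torch.tensor([float('-inf'), 0.199, float('-inf'), 0.267])
print(F.softmax(row, dim=-1))   # roughly tensor([0.0000, 0.4830, 0.0000, 0.5170])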

With the two return values router_probs and topk_indices, we have all the information to implement a sparse MoE model. The class SparseMoE in the code cell below provides a basic implementation. Naturally, this class now features our top-k router implementation as an additional component besides the gate. In simple terms, the SparseMoE model takes in a batch of training samples and performs the following steps:

  • Calculate the gate probabilities
  • Calculate the router probabilities (i.e., the rescaled gate probabilities)
  • Check for each expert whether it needs to be activated (i.e., at least one training sample is routed to it) and pass the corresponding samples to that expert
  • Collect the outputs of all activated experts
  • Calculate the final output as the weighted sum using the router probabilities.

We go through each operation in more detail further down below.

In [15]:
class SparseMoE(nn.Module):

    def __init__(self, hidden_size, num_experts, top_k):
        super().__init__()
        self.num_experts = num_experts
        self.top_k = top_k

        # Create Gate
        self.gate = Gate(hidden_size, num_experts)

        # Create Router
        self.router = Router(top_k)
        
        # Create a list of experts
        self.experts = nn.ModuleList([Expert(hidden_size) for _ in range(self.num_experts)])


    def forward(self, x):
        # (1) Calculate initial gate probabilities
        gate_probs = self.gate(x) # (batch_size, num_experts)

        # (2) Get router probabilities and get indices of top-k experts
        router_probs, topk_indices = self.router(gate_probs) # (batch_size, num_experts)

        # (3) Create tensor that will hold the final output
        output = torch.zeros_like(x)

        for idx, expert in enumerate(self.experts):
            # (4) Check if the expert is needed at all (i.e., at least 1 sample is passed to that expert)
            expert_mask = (topk_indices == idx).any(dim=-1)

            # (5) Find the indices of all samples in the batch that are passed to the current expert
            selected_indices = torch.nonzero(expert_mask).squeeze(-1)

            if selected_indices.numel() > 0:
                # (6) Push the relevant subset of samples through the expert subnetwork layer
                expert_output = expert(x[selected_indices])
                # (7) Extract the probabilities for the expert and each sample
                expert_probs = router_probs[selected_indices, idx].unsqueeze(1)
                # (8) Rescale the expert output by multiplying with the corresponding probabilities
                expert_output = expert_output * expert_probs
                # (9) Update the final output tensor
                output.index_add_(0, selected_indices, expert_output)
                
        return output

Let's create an instance of the SparseMoE class to actually use it. For consistency, we use the same number of experts (i.e., $6$) and the same value for the top-k experts (i.e., $2$).

In [16]:
torch.manual_seed(11)
sparse_moe = SparseMoE(hidden_size, num_experts, top_k)

# Print model
print(sparse_moe)
SparseMoE(
  (gate): Gate(
    (net): Sequential(
      (0): Linear(in_features=8, out_features=4, bias=True)
      (1): ReLU()
      (2): Linear(in_features=4, out_features=6, bias=True)
      (3): Softmax(dim=-1)
    )
  )
  (router): Router()
  (experts): ModuleList(
    (0-5): 6 x Expert(
      (net): Sequential(
        (0): Linear(in_features=8, out_features=32, bias=True)
        (1): ReLU()
        (2): Linear(in_features=32, out_features=8, bias=True)
      )
    )
  )
)

Like we did for the implementation of the dense MoE model, we can now give our SparseMoE instance our example batch as input:

In [17]:
sparse_moe_output = sparse_moe(batch)

print(sparse_moe_output.shape)
torch.Size([4, 8])

Naturally, like for the dense MoE model, the output shape is again $(4, 8)$, reflecting the batch size and the size of the output vectors (same size as the input vectors in our examples). However, if you look at the implementation above, the forward() method is noticeably more complex. This is because the method now has to perform the routing — that is, the activation of each expert for only the relevant training samples. To better understand the code, let's go step-by-step through the important operations.

(1)+(2) Calculate Router Probabilities and Indices of Experts¶

Here we first use the gate to get the gate probabilities, which are then given to the router to get the router probabilities and indices of all activated experts as already seen above. In short, we again have our tensor router_probs with all router probabilities:

$$\large \text{router\_probs} = \begin{bmatrix} 0 & 0 & 0 & 0 & 0.483 & 0.517 \\ 0.486 & 0 & 0 & 0 & 0 & 0.514 \\ 0 & 0 & 0 & 0 & 0.481 & 0.519 \\ 0.485 & 0 & 0 & 0 & 0 & 0.515 \\ \end{bmatrix} $$

... and tensor topk_indices containing the indices of all the experts we need to pass each training sample to:

$$\large \text{topk\_indices} = \begin{bmatrix} 5 & 4 \\ 5 & 0 \\ 5 & 4 \\ 5 & 0 \\ \end{bmatrix} $$

For consistency, we use the same values we have calculated previously. In fact, you can add print() statements to the forward() method to actually print all tensors, if you want.

(3) Initialize Output Tensor¶

Since we loop over the list of experts and pass all relevant training samples to activated experts, we need a tensor to collect the results from all experts. To this end, we simply create a tensor containing all zeros. Since we defined our MoE model so that the output has the same shape as the input, we can use the zeros_like() method:

output = torch.zeros_like(x)

The zeros_like() method in PyTorch creates a new tensor filled with zeros that has the same shape and data type as a given input tensor. As such, tensor output will look as follows:

$$\large \text{output} = \begin{bmatrix} 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ \end{bmatrix} $$

We can now iterate through the list of experts to check if we need to pass any training samples to them, and if so, pass the samples and add the results to the output tensor. In more detail, for each expert, we perform the following steps:

(4) Calculate Expert Mask¶

To identify if the current expert will receive at least one training sample, we need to check if and where the index of the current expert appears in topk_indices. For example, for Expert $E_1$ with index $0$, we can see that $0$ appears twice in topk_indices: in the row for the second training sample and in the row for the fourth training sample. This means that Expert $E_1$ is activated and we need to pass the 2nd and 4th training sample to that expert. To implement this check, we first calculate a mask using:

expert_mask = (topk_indices == idx).any(dim=-1)

The any() method checks whether any elements in a tensor evaluate to True (i.e., are non-zero). It returns a Boolean value if no dimension is specified, or a tensor of Boolean values if applied along a specific dimension using the dim argument. In our case, we can set dim=-1 since we need to check along the experts dimension, which is the "last" dimension. For example, for Expert $E_1$ this yields the following mask

$$\large \text{expert\_mask} = \begin{bmatrix} \text{False} & \text{True} & \text{False} & \text{True} \end{bmatrix} $$

As indicated before, this mask tells us that we need to pass the 2nd and 4th training sample to Expert $E_1$.

(5) Extract Indices of Training Samples¶

Using the expert mask, we can easily get the indices of the training samples we need to pass to the current expert:

selected_indices = torch.nonzero(expert_mask).squeeze(-1)

The nonzero() method returns the indices of all non-zero elements in a tensor. It is commonly used to locate or extract elements that meet a specific condition (i.e., not equal to zero). Note that our expert mask contains Boolean values. However, False is interpreted as zero and True is interpreted as non-zero, so this operation works just fine, giving us:

$$\large \text{selected\_indices} = \begin{bmatrix} 1 & 3 \end{bmatrix} $$

Of course, this directly reflects our knowledge that Expert $E_1$ will receive the 2nd and 4th training sample (again, keep in mind that all tensors are zero-indexed, meaning that the 2nd training sample has an index of $1$, and so on).

By using the numel() method we can check if selected_indices is empty or contains at least one index of a training sample. Only in the latter case do we need to activate the current expert and pass it the corresponding training samples. Naturally, if selected_indices is empty, there is nothing to do and we can check the next expert. Now, assuming the current expert is activated, we perform the following steps.
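For illustration, Expert $E_3$ (index $2$) never appears in topk_indices for our example batch, so its mask is all False and the resulting index tensor is empty:

# The mask/nonzero/numel chain for an expert that receives no samples (here: index 2)
mask = (topk_indices == 2).any(dim=-1)
empty_indices = torch.nonzero(mask).squeeze(-1)
print(empty_indices.numel())   # 0, so this expert would be skipped entirely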

(6) Get Output of Expert¶

Using selected_indices, we can directly extract the relevant training samples from the input batch and pass them as a smaller batch to the current expert to get its output:

expert_output = expert(x[selected_indices])

For example, for Expert $E_1$ the result might look as shown below:

$$\large \text{expert\_output} = \begin{bmatrix} -0.087 & -0.142 & 0.022 & -0.000 & 0.019 & 0.087 & 0.108 & 0.323 \\ -0.144 & -0.118 & 0.061 & -0.025 & 0.072 & 0.083 & 0.072 & 0.269 \\ \end{bmatrix} $$

This tensor has a shape of $(2, 8)$ since the input batch for the expert contained only $2$ out of the $4$ overall training samples.

(7) Extract Relevant Router Probabilities¶

Of course, the current expert_output tensor is not the final output of the current expert as it needs to be scaled down with respect to the router probabilities. Thus, we first need to extract the relevant router probabilities — the probabilities with respect to the current expert and the relevant data samples:

expert_probs = router_probs[selected_indices, idx].unsqueeze(1)

which will yield the following result:

$$\large \text{expert\_probs} = \begin{bmatrix} 0.486 \\ 0.485 \\ \end{bmatrix} $$

This result means that, for example, the probability for the 2nd training sample (index $1$) to be passed to Expert $E_1$ (index $0$) is $G(\mathbf{x}_1)_0 = 0.486$. Note that the two probabilities for the two training samples are quite similar. This is due to the fact that the trainable parameters/weights in the layers of the gate are randomly initialized, so that all probabilities are overall similar.

(8) Rescale Expert Output¶

We can now rescale the output of the expert by multiplying the initial output with the respective router probabilities using elementwise multiplication (i.e., the Hadamard product). Like before, we rely on broadcasting to perform the elementwise multiplication:

expert_output = expert_output * expert_probs

resulting in:

$$\large \text{expert\_output} = \begin{bmatrix} -0.042 & -0.069 & 0.011 & -0.000 & 0.009 & 0.042 & 0.053 & 0.157 \\ -0.070 & -0.057 & 0.030 & -0.012 & 0.035 & 0.040 & 0.035 & 0.130 \\ \end{bmatrix} $$

Since both router probabilities are around $0.5$, all values in expert_output have roughly halved.

(9) Update Final Output Tensor¶

As the last step, we need to add the output of the current expert to the final output tensor, which is very easy to do using PyTorch:

output.index_add_(0, selected_indices, expert_output)

The index_add_() method is an in-place operation that adds values from a source tensor to specified indices of a destination tensor along a given dimension. The underscore at the end of index_add_() indicates that this operation modifies the original tensor directly, rather than returning a new one. It's particularly useful when you want to accumulate values into specific positions efficiently, such as during scatter or gather operations — which we do here by accumulating the outputs of all activated experts. For example, after adding the result from Expert $E_1$, we get:

$$\large \text{output} = \begin{bmatrix} 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ -0.042 & -0.069 & 0.011 & 0.000 & 0.009 & 0.042 & 0.053 & 0.157 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ -0.070 & -0.057 & 0.030 & -0.012 & 0.035 & 0.040 & 0.035 & 0.130 \\ \end{bmatrix} $$

Now the iteration is completed and restarts with the next expert until all experts have been considered. Again, only experts that receive at least one training sample are activated, and they only get passed the relevant samples and not the whole batch. Of course, there might be experts that have one of the highest router probabilities for all training samples in a batch. For example, if we look at the router probabilities for our example batch again:

$$\large \text{router\_probs} = \begin{bmatrix} 0 & 0 & 0 & 0 & 0.483 & 0.517 \\ 0.486 & 0 & 0 & 0 & 0 & 0.514 \\ 0 & 0 & 0 & 0 & 0.481 & 0.519 \\ 0.485 & 0 & 0 & 0 & 0 & 0.515 \\ \end{bmatrix} $$

We can see that Expert $E_6$ will indeed get the whole batch since $E_6$ has one of the top-2 highest probabilities for each training sample.
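To quantify this, we can count for each expert how many samples of our example batch it receives under top-2 routing; given topk_indices above, this should be $2$ samples for $E_1$, $2$ for $E_5$, $4$ for $E_6$, and $0$ for all other experts, which already hints at the load-balancing issue discussed below:

# Count how many samples of the batch are routed to each expert
samples_per_expert = torch.zeros(num_experts, dtype=torch.long)
for idx in range(num_experts):
    samples_per_expert[idx] = (topk_indices == idx).any(dim=-1).sum()
print(samples_per_expert)   # tensor([2, 0, 0, 0, 2, 4])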

Discussion & Limitations¶

Sparse Mixture of Experts (MoE) models have several key advantages, particularly around computational efficiency and scalability. Since only a sparse subset of experts is used for each forward pass, the model can be scaled up with many parameters without a proportional increase in compute cost. This allows for training very large models (e.g., trillions of parameters) while maintaining manageable inference and training time per example. Another benefit is specialization: each expert can learn to focus on a specific subset or aspect of the data, potentially improving performance and generalization.

However, sparse MoE models come with several challenges. One of the biggest is load balancing — ensuring that all experts are utilized evenly. Without careful design, some experts may be overused while others are rarely activated, leading to undertrained parts of the model and degraded performance. Techniques like auxiliary loss terms or routing constraints are often needed to encourage balanced usage of experts, which adds complexity to the training process. Another major difficulty is routing — deciding which experts to activate for a given input. This is typically done via a learned gating mechanism, which must be efficient and differentiable. The routing introduces discrete choices that can make training harder to optimize using gradient-based methods. Lastly — as we could already see in our basic example implementation — implementation and deployment complexity is higher for sparse MoE models. They require dynamic computation graphs and often non-standard memory access patterns, which can be difficult to scale efficiently on GPUs or in distributed systems. This makes sparse MoEs powerful but also technically demanding to use in practice.


Discussion — What's Next?¶

In this notebook, we delved into the core principles of Mixture of Experts (MoE) models, placing particular emphasis on the two foundational strategies: dense MoEs and sparse MoEs. At a theoretical level, these models are relatively intuitive, offering a modular design where different "experts" contribute to the overall task in either a fully active (dense) or selectively active (sparse) manner. This structural elegance makes them conceptually accessible and relatively straightforward to implement in controlled settings. However, transitioning from theory to practice introduces a layer of complexity. Real-world applications of MoE models often require careful design choices, such as managing expert specialization, ensuring balanced expert utilization, and optimizing the routing mechanisms — especially in large-scale or dynamic environments. These considerations are crucial to ensure the model behaves as expected and achieves both efficiency and performance in practical deployments.

Improving Sparse MoE Models¶

Training sparse Mixture of Experts models successfully in practice presents several challenges that must be carefully addressed to achieve efficiency, scalability, and high performance. One of the key issues is the expert routing mechanism. Sparse MoE models route input tokens to only a small subset of available experts, typically using a learned gating function. If the gating mechanism is poorly trained or unstable, it can lead to unbalanced expert utilization where only a few experts are active while others remain underutilized. This imbalance not only wastes resources but can also harm model performance due to insufficient training of certain experts. Techniques such as load balancing losses and noise in the gating function (e.g., GShard or Switch Transformer) are commonly used to encourage even expert usage.
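To make this a bit more tangible, the sketch below shows a rough auxiliary load-balancing loss in the spirit of the Switch Transformer; the function name and exact formulation are illustrative rather than the published implementation. Here $f_i$ is the fraction of samples whose top-1 choice is expert $i$, $P_i$ is the mean gate probability of expert $i$, and the loss is smallest when both are uniform across experts.

# Illustrative sketch of an auxiliary load-balancing loss (hypothetical helper)
def load_balancing_loss(gate_probs, topk_indices, num_experts):
    top1 = topk_indices[:, 0]                      # top-1 expert per sample
    f = torch.zeros(num_experts)
    for i in range(num_experts):
        f[i] = (top1 == i).float().mean()          # fraction of samples routed to expert i
    P = gate_probs.mean(dim=0)                     # mean gate probability per expert
    return num_experts * torch.sum(f * P)          # encourages uniform f and P

# This loss would be added (with a small weight) to the task loss during training
aux_loss = load_balancing_loss(gate_output, topk_indices, num_experts)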

Another important issue is training stability and convergence. Sparse MoEs introduce discontinuities in the computation graph due to the hard gating decisions, which can make optimization more difficult compared to fully dense models. In some cases, small changes in the input can lead to entirely different experts being selected, destabilizing the learning process. To mitigate this, some implementations adopt soft gating during early training or use auxiliary losses to smooth the transition. Additionally, MoEs can suffer from expert collapse, where multiple experts converge to similar functions. Regularization methods and diversity-promoting losses are often necessary to maintain specialization across experts.

Lastly, evaluation and inference efficiency must be addressed. Sparse MoEs promise reduced compute during training, but inference can be inefficient if the expert routing is not optimized, especially when batch sizes are small or the workload is irregular. Moreover, the variability in expert routing decisions can introduce unpredictability in latency. Practical solutions involve pruning unused experts, caching routing decisions, or simplifying the gating mechanism for inference. Overall, successfully training sparse MoEs in practice requires a careful balance between algorithmic design, training stability, and systems-level efficiency.

Beyond Simple Inputs¶

Throughout this notebook, we assumed that our input of an MoE model is a batch of feature vectors with each vector representing a single training sample. When using MoE models for, say, sequential data like text, the added complexity primarily comes from the need to make routing decisions for each token (instead of the sequence as a whole). In natural language tasks, each input sequence consists of multiple tokens that are processed either independently or in context. For MoE models, this means the gating mechanism must decide which experts to activate not just once per input, but potentially for each token in the sequence. This greatly increases the computational and implementation complexity, especially when dealing with long sequences, as it introduces many more routing decisions that need to be computed and optimized.
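One common way to handle this in practice is to flatten the sequence dimension into the batch dimension so that the MoE layer makes a routing decision per token. The sketch below illustrates this with our SparseMoE instance and a made-up batch of token embeddings; the shapes are assumptions for illustration only.

# Token-wise routing: flatten (batch_size, seq_len, hidden_size) into a batch of tokens
seq_batch = torch.rand(2, 5, hidden_size)            # 2 sequences of 5 token vectors each
tokens = seq_batch.view(-1, hidden_size)             # (10, hidden_size)
token_outputs = sparse_moe(tokens)                   # routing is decided per token
seq_outputs = token_outputs.view(2, 5, hidden_size)  # restore the sequence layout
print(seq_outputs.shape)                             # torch.Size([2, 5, 8])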

Additionally, maintaining consistency and coherence across time steps adds further challenges. In sequential models like Transformers, each token's representation depends on the others through attention mechanisms. If different tokens in a sequence are routed to different sets of experts, it can become difficult for the model to maintain a unified understanding of the sequence as a whole. This can lead to fragmented learning or loss of important contextual information unless the routing and expert combination strategies are carefully designed. Balancing expert utilization across a batch of sequences while keeping routing efficient and contextually aware adds another layer of engineering and optimization difficulty unique to sequential data processing.

MoEs in Action¶

We implemented basic versions of both a dense MoE and a sparse MoE layer to better understand their inner workings. However, we only considered both models in isolation and not as part of a complete network architecture to solve a particular task. Other notebooks will provide concrete examples for training and using neural networks containing MoE models. But again, this will first require the consideration and understanding of important requirements such as load balancing or the support of sequential or otherwise more complex data.


Summary¶

Mixture of Experts (MoE) models are a class of neural network architectures designed to improve scalability, efficiency, and specialization in deep learning systems. The core idea behind MoE is to divide a large model into several smaller subnetworks, known as experts, and route different inputs through different combinations of these experts. This allows the overall model to have high capacity while only using a fraction of the computation per input, depending on the routing strategy. MoE models are particularly useful in large-scale tasks such as natural language processing and computer vision, where traditional models struggle to scale efficiently.

There are two primary strategies in MoE models: dense MoE and sparse MoE. In dense MoE models, all experts are activated for every input. This means every subnetwork contributes to processing each data point, resulting in richer feature representations and potentially higher accuracy. Dense MoEs are easier to implement and train because there's no need to manage routing decisions or load balancing. Additionally, every expert receives gradient updates on each training step, which can lead to more stable learning. However, this approach is computationally expensive and limits scalability, as the cost grows linearly with the number of experts.

In contrast, sparse MoE models activate only a small subset of experts for each input, often selected through a gating mechanism. This dramatically reduces computational cost while still benefiting from a large number of parameters. Sparse MoEs are more scalable and efficient, making them ideal for training massive models that wouldn't be feasible with dense architectures. Moreover, the selective routing encourages experts to specialize in particular patterns or features within the data. The challenge, however, lies in designing an effective gating mechanism and ensuring balanced usage of all experts to prevent some from being underutilized or overfitted.

Overall, MoE models offer a powerful framework for building flexible and scalable deep learning systems. Dense and sparse MoEs each come with their own strengths and trade-offs. Dense MoEs favor simplicity and performance at the cost of computational efficiency, while sparse MoEs provide scalability and efficiency with added complexity in training and routing design. The choice between them depends on the specific use case, available computational resources, and the desired balance between performance and efficiency.

In [ ]: