
MoE

Nov 21, 2024
  • MoE: a gating network selects experts → Top-K activation → weighted sum of expert outputs
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class Expert(nn.Module):
    """Standard FFN expert: 4x up-projection, GELU, down-projection."""

    def __init__(self, hidden_size):
        super().__init__()
        self.w1 = nn.Linear(hidden_size, hidden_size * 4)
        self.w2 = nn.Linear(hidden_size * 4, hidden_size)
        self.act = nn.GELU()

    def forward(self, x):
        return self.w2(self.act(self.w1(x)))


class MoELayer(nn.Module):
    """Gate selects experts -> Top-K activation -> weighted sum."""

    def __init__(self, hidden_size=768, num_experts=8, top_k=2):
        super().__init__()
        self.gate = nn.Linear(hidden_size, num_experts)
        self.experts = nn.ModuleList(
            [Expert(hidden_size) for _ in range(num_experts)])
        self.top_k = top_k

    def forward(self, x):
        B, S, H = x.shape
        x = x.view(-1, H)  # route per token: (B*S, H)

        gates = F.softmax(self.gate(x), dim=-1)          # (B*S, num_experts)
        probs, indices = torch.topk(gates, self.top_k, dim=-1)
        probs = probs / probs.sum(dim=-1, keepdim=True)  # renormalize over top-k

        # Slow per-token reference loop; real implementations batch by expert.
        out = torch.zeros_like(x)
        for i in range(x.shape[0]):
            for j in range(self.top_k):
                expert = self.experts[indices[i, j]]
                out[i] += probs[i, j] * expert(x[i])

        return out.view(B, S, H)


class TransformerBlock(nn.Module):
    """Post-norm block: self-attention, then the MoE layer as the FFN."""

    def __init__(self, hidden_size, num_heads, num_experts=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(hidden_size, num_heads,
                                          batch_first=True)
        self.moe = MoELayer(hidden_size, num_experts=num_experts)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)

    def forward(self, x):
        attn_out, _ = self.attn(x, x, x)  # self-attention
        x = self.norm1(x + attn_out)
        x = self.norm2(x + self.moe(x))
        return x
```
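
A quick shape check of the block above (the sizes here are just an example; 768 hidden with 12 heads matches the default):

```python
# Hypothetical smoke test: feed a toy batch through one block.
block = TransformerBlock(hidden_size=768, num_heads=12, num_experts=8)
x = torch.randn(2, 16, 768)  # (batch, seq_len, hidden)
y = block(x)
print(y.shape)  # torch.Size([2, 16, 768])
```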
  • Load balancing: add an auxiliary loss that pushes the routing distribution toward uniform, as in the sketch below
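
A minimal sketch of that idea, assuming the `gates` tensor from `MoELayer.forward` above; the function name and the squared-error form are illustrative (Switch Transformer, for instance, uses a fraction-of-tokens times mean-probability product instead):

```python
def load_balance_loss(gates: torch.Tensor) -> torch.Tensor:
    # gates: (num_tokens, num_experts), the router's softmax output.
    num_experts = gates.shape[-1]
    mean_prob = gates.mean(dim=0)  # average routing weight per expert
    uniform = torch.full_like(mean_prob, 1.0 / num_experts)
    return ((mean_prob - uniform) ** 2).sum()  # distance to uniform load

# Added to the task loss with a small weight:
# total_loss = task_loss + aux_weight * load_balance_loss(gates)
```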