6. Pre-training & Loading models


Text Generation

In order to train a model, we need that model to be able to generate new tokens. Then, we compare the generated tokens with the expected ones in order to train the model into learning the tokens it needs to generate.

As in the previous examples we already predicted some tokens, it's possible to reuse that functionality for this purpose.

tip

The goal of this sixth phase is very simple: train the model from scratch. For this, the previous LLM architecture will be used, looping over the datasets with the defined loss functions and optimizer in order to train all the parameters of the model.

Text Evaluation

In order to perform a correct training it's needed to measure how well the predictions obtained match the expected token. The goal of the training is to maximize the likelihood of the correct token, which involves increasing its probability relative to the other tokens.

In order to maximize the probability of the correct token, the weights of the model must be modified so that probability is maximised. The updates of the weights are done via backpropagation. This requires a loss function to optimize. In this case, the function will be the difference between the performed prediction and the desired one.

However, instead of working with the raw predictions, it will work with logarithms. So, if the current prediction of the expected token is 7.4541e-05, the natural logarithm (base e) of 7.4541e-05 is approximately **-9.5042**.
Then, for each entry with a context length of 5 tokens for example, the model will need to predict 5 tokens, where the first 4 tokens are the last ones of the input and the fifth is the predicted one. Therefore, for each entry we will have 5 predictions in that case (even if the first 4 were in the input, the model doesn't know this), hence 5 expected tokens and therefore 5 probabilities to maximize.

Therefore, after applying the natural logarithm to each prediction, the average is computed and the minus sign removed (this is called the _cross entropy loss_). That's the number to get as close to 0 as possible, because the natural logarithm of 1 is 0:

![Cross entropy loss](https://camo.githubusercontent.com/3c0ab9c55cefa10b667f1014b6c42df901fa330bb2bc9cea88885e784daec8ba/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830355f636f6d707265737365642f63726f73732d656e74726f70792e776562703f313233)

Another way to measure how good the model is is called perplexity. Perplexity is a metric used to evaluate how well a probability model predicts a sample. In language modelling, it represents the model's uncertainty when predicting the next token in a sequence.
For example, a perplexity value of 48725 means that, when it needs to predict a token, it's unsure about which among 48,725 tokens of the vocabulary is the right one.
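
To make the relationship concrete, here is a minimal sketch of how the cross entropy loss and the perplexity are obtained from the probabilities the model assigns to the expected tokens (the probability values are toy numbers chosen for illustration, not real model output):

```python
import torch

# Hypothetical probabilities assigned by the model to the *expected* token
# at each of 5 positions of one entry (toy values)
target_probas = torch.tensor([7.4541e-05, 3.1e-03, 1.2e-02, 8.5e-01, 2.3e-01])

log_probas = torch.log(target_probas)    # natural logarithm of each probability
avg_log_proba = torch.mean(log_probas)   # average of the log-probabilities
cross_entropy = -avg_log_proba           # remove the minus sign -> cross entropy loss
perplexity = torch.exp(cross_entropy)    # perplexity is just exp(cross entropy)

print(cross_entropy)  # the value to drive as close to 0 as possible
print(perplexity)     # roughly "how many tokens the model hesitates between"
```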

Pre-training example

This is the initial code proposed in https://github.com/rasbt/LLMs-from-scratch/blob/main/ch05/01_main-chapter-code/ch05.ipynb, sometimes slightly modified.

Previous code used here that was already explained in earlier sections:
```python
"""
This is code explained before so it won't be explained again
"""

import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)

    return dataloader


class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec


class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)


class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x


class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
```
```python
# Download contents to train the data with
import os
import urllib.request

file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

if not os.path.exists(file_path):
    with urllib.request.urlopen(url) as response:
        text_data = response.read().decode('utf-8')
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)
else:
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()

total_characters = len(text_data)
tokenizer = tiktoken.get_encoding("gpt2")
total_tokens = len(tokenizer.encode(text_data))

print("Data downloaded")
print("Characters:", total_characters)
print("Tokens:", total_tokens)

# Model initialization
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()
print("Model initialized")


# Functions to transform from text to token ids and from token ids to text
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())


# Generate text function used by `generate_and_print_sample` below
# (it is also shown and explained step by step later in this section)
def generate_text(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        if idx_next == eos_id:
            break

        idx = torch.cat((idx, idx_next), dim=1)

    return idx


# Define loss functions
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches


# Apply Train/validation ratio and create dataloaders
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)


# Sanity checks
if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `training_ratio`")

if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `training_ratio`")

print("Train loader:")
for x, y in train_loader:
    print(x.shape, y.shape)

print("\nValidation loader:")
for x, y in val_loader:
    print(x.shape, y.shape)

train_tokens = 0
for input_batch, target_batch in train_loader:
    train_tokens += input_batch.numel()

val_tokens = 0
for input_batch, target_batch in val_loader:
    val_tokens += input_batch.numel()

print("Training tokens:", train_tokens)
print("Validation tokens:", val_tokens)
print("All tokens:", train_tokens + val_tokens)


# Indicate the device to use
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using {device} device.")

model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes


# Pre-calculate losses without starting yet
torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader

with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)


# Functions to train the data
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()


# Start training!
import time
start_time = time.time()

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")


# Show graphics with the training process
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import math

def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    fig, ax1 = plt.subplots(figsize=(5, 3))
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(
        epochs_seen, val_losses, linestyle="-.", label="Validation loss"
    )
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax2 = ax1.twiny()
    ax2.plot(tokens_seen, train_losses, alpha=0)
    ax2.set_xlabel("Tokens seen")
    fig.tight_layout()
    plt.show()

# Compute perplexity from the loss values
train_ppls = [math.exp(loss) for loss in train_losses]
val_ppls = [math.exp(loss) for loss in val_losses]

# Plot perplexity over tokens seen
plt.figure()
plt.plot(tokens_seen, train_ppls, label='Training Perplexity')
plt.plot(tokens_seen, val_ppls, label='Validation Perplexity')
plt.xlabel('Tokens Seen')
plt.ylabel('Perplexity')
plt.title('Perplexity over Training')
plt.legend()
plt.show()

epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)


torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    },
    "/tmp/model_and_optimizer.pth"
)
```

Let's go through it step by step.

Functions to transform text <--> ids

These are some simple functions that can be used to transform text of the vocabulary into token ids and back. This is needed at the beginning of the handling of the text and at the end of the predictions:

```python
# Functions to transform from text to token ids and from token ids to text
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())
```

Generate text function

In a previous section, a function just picked the most probable token after obtaining the logits. However, this means that for each input the same output is always going to be generated, which makes it very deterministic.

The following generate_text function will apply the top-k, temperature and multinomial concepts:

  • top-k means that we start by reducing to -inf the probabilities of all the tokens except the top k tokens. So, if k=3, before making a decision only the 3 most probable tokens will have a probability different from -inf.
  • temperature means that every probability is divided by the temperature value. A value of 0.1 will boost the highest probability compared with the lowest one, while a temperature of 5, for example, will make the distribution flatter. This helps to introduce the variation in responses we would like the LLM to have.
  • After applying the temperature, a softmax function is applied again so that all the remaining tokens have a total probability of 1.
  • Finally, instead of choosing the token with the biggest probability, the function multinomial is applied to sample the next token according to the final probabilities. So if token 1 has a 70% probability, token 2 a 20% and token 3 a 10%, then 70% of the time token 1 will be selected, 20% of the time token 2, and 10% of the time token 3.
```python
# Generate text function
def generate_text(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx
```
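
As an illustration of how this could be called once the model is trained (the prompt and the sampling values below are just examples, not values prescribed by the code above):

```python
torch.manual_seed(123)

token_ids = generate_text(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tokens=25,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,          # keep only the 25 most likely tokens
    temperature=1.4    # >1 flattens the distribution -> more diverse text
)

print(token_ids_to_text(token_ids, tokenizer))
```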

tip

There is a common alternative called **top-p**, also known as nucleus sampling, which instead of getting the k samples with the most probability, **sorts** the whole resulting **vocabulary** by probability and **accumulates** it from the highest probability to the lowest until a **threshold is reached**.

Then, **only those words** of the vocabulary will be considered according to their relative probabilities.

This removes the need to select a number of **k** samples, as the optimal k might be different in each case; **only a threshold** is needed.

Note that this improvement isn't included in the previous code.
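
As a rough sketch of the idea only (not part of the previous code; the threshold value is arbitrary), a top-p filter applied to the last-step logits before the softmax could look like this:

```python
def top_p_filter(logits, top_p=0.9):
    # Sort the token probabilities from highest to lowest
    probs = torch.softmax(logits, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, descending=True, dim=-1)

    # Accumulate probabilities; mark every token that comes after the
    # cumulative sum (excluding the current token) has already passed top_p
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    remove_sorted = (cumulative - sorted_probs) > top_p

    # Map the mask back to vocabulary order and drop those logits
    remove = remove_sorted.scatter(-1, sorted_idx, remove_sorted)
    return logits.masked_fill(remove, float("-inf"))
```

The filtered logits could then go through softmax and torch.multinomial exactly as in generate_text.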

tip

Another way to improve the generated text is by using **Beam search** instead of the greedy search used in this example.
Unlike greedy search, which selects the most probable next word at each step and builds a single sequence, beam search keeps track of the top k highest-scoring partial sequences (called "beams") at each step. By exploring multiple possibilities simultaneously, it balances efficiency and quality, increasing the chances of finding a better overall sequence that might be missed by the greedy approach due to early, suboptimal choices.

Note that this improvement isn't included in the previous code.
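
For illustration only (this is not the book's implementation), a minimal beam search over the GPTModel defined above could look roughly like the sketch below; the beam_search name, the beam width and refinements such as length normalization or stopping on an end-of-sequence token are all assumptions, and it expects a single prompt (batch size 1):

```python
def beam_search(model, idx, max_new_tokens, context_size, beam_width=3):
    # Each beam is a tuple: (token ids so far, cumulative log-probability)
    beams = [(idx, 0.0)]

    for _ in range(max_new_tokens):
        candidates = []
        for seq, score in beams:
            with torch.no_grad():
                logits = model(seq[:, -context_size:])
            log_probs = torch.log_softmax(logits[:, -1, :], dim=-1)

            # Expand every beam with its beam_width best continuations
            top_log_probs, top_ids = torch.topk(log_probs, beam_width)
            for lp, tok in zip(top_log_probs[0], top_ids[0]):
                new_seq = torch.cat((seq, tok.view(1, 1)), dim=1)
                candidates.append((new_seq, score + lp.item()))

        # Keep only the beam_width best partial sequences overall
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = candidates[:beam_width]

    return beams[0][0]  # highest-scoring sequence
```

The result could be decoded with token_ids_to_text just like the output of generate_text.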

Loss functions

The calc_loss_batch function calculates the cross entropy of the prediction of a single batch.
The calc_loss_loader function gets the cross entropy of all the batches and calculates the **average cross entropy**.

```python
# Define loss functions
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches
```

tip

Gradient clipping is a technique used to improve training stability in large neural networks by setting a maximum threshold for the magnitude of the gradients. When gradients exceed this predefined max_norm, they are scaled down proportionally so that the updates to the model's parameters stay within a manageable range, preventing issues like exploding gradients and ensuring a more controlled and stable training.

Note that this improvement isn't included in the previous code.

Check the following example:
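
A minimal sketch of how it could be wired into the training loop above, between loss.backward() and optimizer.step() (max_norm=1.0 is an arbitrary choice):

```python
loss = calc_loss_batch(input_batch, target_batch, model, device)
loss.backward()

# Rescale the gradients if their global norm exceeds max_norm
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

optimizer.step()
```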

Loading the data

The function create_dataloader_v1 was already discussed in a previous section.

From here, note that it's defined that 90% of the text is going to be used for training and 10% for validation, and the two sets are stored in two different data loaders.
Note that sometimes part of the data set is also kept as a test set in order to better evaluate the performance of the model.

Both data loaders use the same batch size, maximum length, stride and number of workers (0 in this case).
The main differences are the data used by each, and the fact that the validation loader doesn't drop the last batch nor shuffle the data, as that isn't needed for validation purposes.

Also, the fact that the **stride is as big as the context length** means that there won't be any overlap between the contexts used to train the data (this reduces overfitting, but also shrinks the training data set).
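
As a tiny illustration of what the stride controls (made-up token ids and lengths, not the real configuration):

```python
token_ids = list(range(10))  # pretend these are 10 token ids
max_length = 4

# stride < max_length -> overlapping windows (more samples, more repetition)
print([token_ids[i:i + max_length] for i in range(0, len(token_ids) - max_length, 2)])
# [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7]]

# stride == max_length -> no overlap (what the configuration below does)
print([token_ids[i:i + max_length] for i in range(0, len(token_ids) - max_length, max_length)])
# [[0, 1, 2, 3], [4, 5, 6, 7]]
```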

Moreover, note that the batch size in this case is 2, splitting the data into 2 batches. The main goal of this is to allow parallel processing and to reduce the memory consumption per batch.

```python
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)
```

Sanity Checks

The goal is to check that there are enough tokens for training, that the shapes are the expected ones, and to get some information about the number of tokens used for training and for validation:

```python
# Sanity checks
if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `training_ratio`")

if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `training_ratio`")

print("Train loader:")
for x, y in train_loader:
    print(x.shape, y.shape)

print("\nValidation loader:")
for x, y in val_loader:
    print(x.shape, y.shape)

train_tokens = 0
for input_batch, target_batch in train_loader:
    train_tokens += input_batch.numel()

val_tokens = 0
for input_batch, target_batch in val_loader:
    val_tokens += input_batch.numel()

print("Training tokens:", train_tokens)
print("Validation tokens:", val_tokens)
print("All tokens:", train_tokens + val_tokens)
```

Select device for training & pre-calculations

The following code just selects the device to use and calculates a training loss and a validation loss (without having trained anything yet) as a starting point.

```python
# Indicate the device to use

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using {device} device.")

model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes

# Pre-calculate losses without starting yet
torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader

with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)
```

Train functions

The function generate_and_print_sample just takes a context and generates some tokens in order to get a feeling of how good the model is at that point. It's called by train_model_simple at each epoch.

The function evaluate_model is called as frequently as indicated to the training function and is used to measure the train loss and the validation loss at that point of the model training.

Then, the big function train_model_simple is the one that actually trains the model. It expects:

  • The train data loader (with the data already split and prepared for training)
  • The validation loader
  • The optimizer to use during training: this is the function that will use the gradients and update the parameters to reduce the loss. In this case, as you will see, AdamW is used, but there are many more.
  • optimizer.zero_grad() is called to reset the gradients on each round so they don't accumulate.
  • The lr param is the learning rate, which determines the size of the steps taken during the optimization process when updating the model's parameters. A smaller learning rate means the optimizer makes smaller updates to the weights, which can lead to more precise convergence but might slow down training. A larger learning rate can speed up training but risks overshooting the minimum of the loss function (jumping over the point where the loss function is minimized).
  • Weight decay modifies the loss calculation step by adding an extra term that penalizes large weights. This encourages the optimizer to find solutions with smaller weights, balancing between fitting the data well and keeping the model simple, preventing overfitting by discouraging the model from assigning too much importance to any single feature.
  • Traditional optimizers like SGD with L2 regularization couple weight decay with the gradient of the loss function. However, AdamW (a variant of the Adam optimizer) decouples weight decay from the gradient update, leading to more effective regularization.
  • The device to use for training
  • The number of epochs: number of passes over the training data
  • The evaluation frequency: how often evaluate_model is called
  • The evaluation iterations: the number of batches to use when evaluating the current state of the model in evaluate_model
  • The start context: the starting sentence to use when calling generate_and_print_sample
  • The tokenizer
```python
# Functions to train the data
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval() # Set in eval mode to avoid dropout
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train() # Back to training mode applying all the configurations
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval() # Set in eval mode to avoid dropout
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train() # Back to training mode applying all the configurations
```

tip

To improve how the learning rate behaves there are a couple of relevant techniques called linear warmup and cosine decay.

Linear warmup consists in defining an initial learning rate and a maximum one and consistently updating it after each epoch. This is because starting the training with smaller weight updates decreases the risk of the model encountering large, destabilizing updates during its training phase.
Cosine decay is a technique that gradually reduces the learning rate following a half-cosine curve after the warmup phase, slowing down weight updates to minimize the risk of overshooting the loss minima and to ensure training stability in later phases.

Note that these improvements aren't included in the previous code.
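
A rough sketch of how both ideas could be combined on top of the AdamW optimizer used below; the initial/peak learning rates, the warmup length and the total number of steps are arbitrary values for illustration:

```python
import math

initial_lr = 0.0001
peak_lr = 0.001
warmup_steps = 20
total_steps = 1000  # e.g. num_epochs * len(train_loader)

def get_lr(step):
    if step < warmup_steps:
        # Linear warmup: ramp from initial_lr up to peak_lr
        return initial_lr + (peak_lr - initial_lr) * step / warmup_steps
    # Cosine decay: half-cosine from peak_lr down to 0 after the warmup
    progress = (step - warmup_steps) / (total_steps - warmup_steps)
    return peak_lr * 0.5 * (1 + math.cos(math.pi * progress))

# Inside the training loop, before optimizer.step():
# for param_group in optimizer.param_groups:
#     param_group["lr"] = get_lr(global_step)
```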

Start training

```python
import time
start_time = time.time()

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")
```

Print training evolution

With the following functions it's possible to print the evolution of the model while it was being trained.

```python
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import math

def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    fig, ax1 = plt.subplots(figsize=(5, 3))
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(
        epochs_seen, val_losses, linestyle="-.", label="Validation loss"
    )
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax2 = ax1.twiny()
    ax2.plot(tokens_seen, train_losses, alpha=0)
    ax2.set_xlabel("Tokens seen")
    fig.tight_layout()
    plt.show()

# Compute perplexity from the loss values
train_ppls = [math.exp(loss) for loss in train_losses]
val_ppls = [math.exp(loss) for loss in val_losses]

# Plot perplexity over tokens seen
plt.figure()
plt.plot(tokens_seen, train_ppls, label='Training Perplexity')
plt.plot(tokens_seen, val_ppls, label='Validation Perplexity')
plt.xlabel('Tokens Seen')
plt.ylabel('Perplexity')
plt.title('Perplexity over Training')
plt.legend()
plt.show()

epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)
```

Save the model

It's possible to save the model & optimizer if you want to continue training later:

```python
# Save the model and the optimizer for later training
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    },
    "/tmp/model_and_optimizer.pth"
)
# Note that this model with the optimizer occupied close to 2GB

# Restore model and optimizer for training
checkpoint = torch.load("/tmp/model_and_optimizer.pth", map_location=device)

model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.train()  # Put in training mode
```

Or just the model if you only plan to use it:

```python
# Save the model
torch.save(model.state_dict(), "model.pth")

# Load it
model = GPTModel(GPT_CONFIG_124M)

model.load_state_dict(torch.load("model.pth", map_location=device))

model.eval()  # Put in eval mode
```

Loading GPT2 weights

There are 2 quick scripts to load the GPT2 weights locally. For both of them you can clone the repository https://github.com/rasbt/LLMs-from-scratch locally, then:
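
As a quick alternative illustration (not one of the repository's scripts), the pretrained GPT2 weights can also be pulled locally with the Hugging Face transformers library and inspected before mapping them into the GPTModel defined above:

```python
from transformers import GPT2LMHeadModel

gpt2_hf = GPT2LMHeadModel.from_pretrained("gpt2")  # 124M parameter checkpoint
state_dict = gpt2_hf.state_dict()

# Inspect some of the tensors that would need to be copied into our GPTModel weights
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))
```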

References