for epoch in range(num_epochs):
    net.train()
    train_loss = []
    for X, y in tqdm(train_iter):
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        loss = loss_function(net(X), y)
        loss.mean().backward()
        optimizer.step()
Let us take a closer look at the difficulty of k-step-ahead prediction by computing predictions over the entire sequence for k = 1, 4, 16, 64:
max_steps = 64
features = torch.zeros((T - tau - max_steps + 1, tau + max_steps))
# Column i (i < tau) holds observations from x, at time steps i to i + T - tau - max_steps + 1
for i in range(tau):
    features[:, i] = x[i:i + T - tau - max_steps + 1]

# Column i (i >= tau) holds the (i - tau + 1)-step-ahead predictions, for time steps i to i + T - tau - max_steps + 1
for i in range(tau, tau + max_steps):
    features[:, i] = net(features[:, i - tau:i].to(device)).reshape(-1)

steps = (1, 4, 16, 64)
d2l.plot([time[tau + i - 1:T - max_steps + i] for i in steps],
         [features[:, tau + i - 1].cpu().detach().numpy() for i in steps],
         'time', 'x', legend=[f'{i}-step preds' for i in steps],
         xlim=[5, 1000], figsize=(6, 4))
plt.show()
import collections
import re

def read_time_machine():
    """Load the time machine dataset into a list of text lines, replacing every non-letter character with a space"""
    with open('../data/timemachine.txt', 'r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
lines = read_time_machine()
print(f'total number of text lines: {len(lines)}')  # total number of text lines: 3221
print(lines[0])   # the time machine by h g wells
print(lines[10])  # twinkled and his usually pale face was flushed and animated the
def tokenize(lines, token='word'):
    """Split text lines into word or character tokens"""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        # list(str) splits a string into a list of its individual characters
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)
class Vocab:
    """Vocabulary for text"""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort by frequency, from most to least frequent
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # Build the index-to-token and token-to-index mappings; the unknown token gets index 0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                # Tokens that occur fewer than min_freq times are discarded
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)  # unknown tokens map to index 0
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        # Index of the unknown token, used by __getitem__ above
        return 0
def count_corpus(tokens):
    """Count token frequencies"""
    # Here tokens is either a 1D list of tokens or a 2D list of token lists
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a single list
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)
Each text line in the time machine dataset is not necessarily a sentence or a paragraph; it may even be a single word. The returned corpus is therefore flattened into a single list of token indices rather than a list of per-line token lists.
def load_corpus_time_machine(max_tokens=-1):
    """Return the token index list and the vocabulary of the time machine dataset"""
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    # Since each text line in the dataset is not necessarily a sentence or a paragraph,
    # flatten all text lines into a single list of indices
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab
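As a quick sanity check of the loader (this usage snippet is not in the original text; the exact corpus length depends on the dataset copy, so no output is shown), note that with character-level tokenization the vocabulary holds 28 tokens: the 26 lowercase letters, the space character, and <unk>:

corpus, vocab = load_corpus_time_machine()
print(len(corpus), len(vocab))          # corpus length and vocabulary size (28: a-z, space, <unk>)
print(vocab.to_tokens(corpus[:20]))     # the first few characters of the text, mapped back to tokens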
# Tail of the random-sampling data iterator; num_subseqs, initial_indices, and data(pos)
# are defined earlier in the same generator function
num_batches = num_subseqs // batch_size
for i in range(0, batch_size * num_batches, batch_size):
    # Here initial_indices contains the randomized starting indices of the subsequences
    initial_indices_per_batch = initial_indices[i:i + batch_size]
    X = [data(j) for j in initial_indices_per_batch]
    Y = [data(j + 1) for j in initial_indices_per_batch]
    yield torch.tensor(X), torch.tensor(Y)
import math
import torch
from torch.utils.tensorboard import SummaryWriter
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
from tqdm import tqdm
batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps, max_tokens=10000)
print(len(train_iter.corpus))  # 10000
for X, y in train_iter:
    print(X.shape, y.shape)  # torch.Size([32, 35]) torch.Size([32, 35])
    break
def predict(prefix, num_preds, net, vocab, device):
    """Generate new characters following the prefix"""
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1, 1))
    for y in prefix[1:]:  # warm-up period
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # predict num_preds steps
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])
Now we can test the predict function. We specify the prefix time traveller and generate 10 subsequent characters from it. Since we have not trained the network yet, it produces nonsensical predictions:
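A sketch of that test call (not shown in the original text; it assumes net is the untrained model built earlier and that the device comes from d2l.try_gpu(), and its output is random before training):

device = d2l.try_gpu()
print(predict('time traveller ', 10, net, vocab, device))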
Next we define a function to clip the gradients of a model, whether the model is implemented from scratch or built with the high-level API. Note that we compute the gradient norm over all of the model's parameters:
def grad_clipping(net, theta):
    """Clip the gradient"""
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm
Specifically, when sequential partitioning is used, we initialize the hidden state only at the beginning of each epoch. Since the i-th subsequence example in the next minibatch is adjacent to the i-th subsequence example in the current minibatch, the hidden state of the last example of the current minibatch is used to initialize the hidden state of the first example of the next minibatch. This way, the historical information of the sequence stored in the hidden state can flow through adjacent subsequences within an epoch. However, the hidden state at any point then depends on all the preceding minibatches of the same epoch, which complicates the gradient computation. To reduce the computational cost, we detach the gradient before processing each minibatch, so that the gradient computation for the hidden state is always limited to the time steps within one minibatch. A minimal sketch of this state handling follows.
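The sketch below illustrates the rule described above; it is not verbatim from the original text (the full version appears in train_epoch later), and it assumes the model exposes begin_state and that an LSTM-style model keeps its state in a tuple:

if state is None or use_random_iter:
    # Random sampling: adjacent minibatches are not adjacent in time,
    # so the hidden state is reinitialized for every minibatch
    state = net.begin_state(batch_size=X.shape[0], device=device)
else:
    # Sequential partitioning: keep the value of the hidden state but detach it
    # from the computational graph, so gradients stop at the minibatch boundary
    if isinstance(state, tuple):
        for s in state:
            s.detach_()
    else:
        state.detach_()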
num_epochs, lr = 500, 1
train(net, train_iter, vocab, lr, num_epochs, device)
# Perplexity: 1.0
# time travelleryou can show black is white by argument said filby
# travelleryou can show black is white by argument said filby
4.2 Concise Implementation of Recurrent Neural Networks
Although implementing a recurrent neural network from scratch is instructive for understanding how it works, it is not convenient. This section shows how to implement the same language model more efficiently using the functions provided by a deep learning framework's high-level API. We again start by reading the time machine dataset:
import math
import torch
from torch.utils.tensorboard import SummaryWriter
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
from tqdm import tqdm
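The data loading and the construction of net are not included in this excerpt; a minimal sketch consistent with the rest of the section, assuming the RNNModel wrapper from the d2l package and 256 hidden units, could look like this:

batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)
device = d2l.try_gpu()

# A single-layer RNN whose hidden states are mapped to vocabulary-sized
# outputs by the wrapper, which also exposes begin_state
num_hiddens = 256
rnn_layer = nn.RNN(len(vocab), num_hiddens)
net = d2l.RNNModel(rnn_layer, vocab_size=len(vocab))
net = net.to(device)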
def train_epoch(net, train_iter, loss_function, optimizer, device, use_random_iter):
    state = None
    train_loss = []
    for X, Y in tqdm(train_iter):
        if state is None or use_random_iter:
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                state.detach_()
            else:
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        loss_function.to(device)
        y_hat, state = net(X, state)
        loss = loss_function(y_hat, y.long()).mean()
        optimizer.zero_grad()
        loss.backward()
        d2l.grad_clipping(net, 1)  # same as the grad_clipping function in the previous section
        optimizer.step()
        train_loss.append(loss.item())  # the loss is already averaged over the minibatch
    # Perplexity is the exponential of the average per-token cross-entropy
    return math.exp(sum(train_loss) / len(train_loss))
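The driver function train called below is not part of this excerpt; a minimal sketch built on train_epoch above and the predict function from the previous section (both assumptions here, and the reporting interval is arbitrary) might look like:

def train(net, train_iter, vocab, lr, num_epochs, device, use_random_iter=False):
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        perplexity = train_epoch(net, train_iter, loss_function, optimizer,
                                 device, use_random_iter)
        if (epoch + 1) % 10 == 0:
            print(f'epoch {epoch + 1}, perplexity {perplexity:.1f}')
    # Generate some text from the trained model, as in the outputs shown below
    print(predict('time traveller', 50, net, vocab, device))
    print(predict('traveller', 50, net, vocab, device))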
num_epochs, lr = 500, 1
train(net, train_iter, vocab, lr, num_epochs, device)
# Perplexity: 1.3
# time traveller for so ig will aboca thoursugli gpseknop how stac
# travelleryou can space of the simestiok satt or al and wisc