1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
|
import hashlib import os import tarfile import zipfile import requests import pandas as pd import torch from torch import nn from torch.utils import data
DATA_HUB = dict() DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
def download(name, cache_dir=os.path.join('..', 'data')): """下载一个DATA_HUB中的文件,返回本地文件名""" assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}" url, sha1_hash = DATA_HUB[name] os.makedirs(cache_dir, exist_ok=True) fname = os.path.join(cache_dir, url.split('/')[-1]) if os.path.exists(fname): sha1 = hashlib.sha1() with open(fname, 'rb') as f: while True: data = f.read(1048576) if not data: break sha1.update(data) if sha1.hexdigest() == sha1_hash: return fname print(f'正在从{url}下载{fname}...') r = requests.get(url, stream=True, verify=True) with open(fname, 'wb') as f: f.write(r.content) return fname
def download_extract(name, folder=None): """下载并解压zip/tar文件""" fname = download(name) base_dir = os.path.dirname(fname) data_dir, ext = os.path.splitext(fname) if ext == '.zip': fp = zipfile.ZipFile(fname, 'r') elif ext in ('.tar', '.gz'): fp = tarfile.open(fname, 'r') else: assert False, '只有zip/tar文件可以被解压缩' fp.extractall(base_dir) return os.path.join(base_dir, folder) if folder else data_dir
def download_all(): """下载DATA_HUB中的所有文件""" for name in DATA_HUB: download(name)
DATA_HUB['kaggle_house_train'] = (DATA_URL + 'kaggle_house_pred_train.csv', '585e9cc93e70b39160e7921475f9bcd7d31219ce') DATA_HUB['kaggle_house_test'] = (DATA_URL + 'kaggle_house_pred_test.csv', 'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
train_data = pd.read_csv(download('kaggle_house_train')) test_data = pd.read_csv(download('kaggle_house_test'))
print(train_data.shape) print(test_data.shape)
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
all_features[numeric_features] = all_features[numeric_features].fillna(0)
all_features = pd.get_dummies(all_features, dummy_na=True) print(all_features.shape)
n_train = train_data.shape[0] train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32) test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32) train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
loss_function = nn.MSELoss() in_features = train_features.shape[1]
net = nn.Sequential(nn.Linear(in_features, 128), nn.ReLU(), nn.Dropout(0.1), nn.Linear(128, 1))
def log_rmse(net, features, labels): clipped_preds = torch.clamp(net(features), 1, float('inf')) rmse = torch.sqrt(loss_function(torch.log(clipped_preds), torch.log(labels))) return rmse.item()
def load_array(data_arrays, batch_size, is_train=True): dataset = data.TensorDataset(*data_arrays) return data.DataLoader(dataset, batch_size, shuffle=is_train)
def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learning_rate, weight_decay, batch_size): train_ls, test_ls = [], [] train_iter = load_array((train_features, train_labels), batch_size)
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
for epoch in range(num_epochs): for X, y in train_iter: optimizer.zero_grad() loss = loss_function(net(X), y) loss.backward() optimizer.step()
train_ls.append(log_rmse(net, train_features, train_labels)) if test_labels is not None: test_ls.append(log_rmse(net, test_features, test_labels))
return train_ls, test_ls
def get_k_fold_data(k, i, X, y): assert k > 1 fold_size = X.shape[0] // k X_train, y_train = None, None for j in range(k): idx = slice(j * fold_size, (j + 1) * fold_size) X_part, y_part = X[idx, :], y[idx] if j == i: X_valid, y_valid = X_part, y_part elif X_train is None: X_train, y_train = X_part, y_part else: X_train = torch.cat([X_train, X_part], 0) y_train = torch.cat([y_train, y_part], 0) return X_train, y_train, X_valid, y_valid
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size): train_l_sum, valid_l_sum = 0, 0 for i in range(k): data = get_k_fold_data(k, i, X_train, y_train) train_ls, valid_ls = train(net, *data, num_epochs, learning_rate, weight_decay, batch_size) train_l_sum += train_ls[-1] valid_l_sum += valid_ls[-1] print(f'折{i + 1},训练log rmse: {float(train_ls[-1]):f},验证log rmse: {float(valid_ls[-1]):f}') return train_l_sum / k, valid_l_sum / k
k, num_epochs, lr, weight_decay, batch_size = 5, 250, 0.5, 1e-3, 32
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size) print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f},'f'平均验证log rmse: {float(valid_l):f}')
def train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size): train_ls, _ = train(net, train_features, train_labels, None, None, num_epochs, lr, weight_decay, batch_size) print(f'训练log rmse: {float(train_ls[-1]):f}') preds = net(test_features).detach().numpy() test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0]) submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1) submission.to_csv('../data/submission.csv', index=False)
train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)
|