Hands-On Kaggle Projects


Notes on some classic Kaggle competitions, which also double as small practice projects of my own.

1. Hands-On Kaggle Competition: Predicting House Prices

# Competition link: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques

import hashlib
import os
import tarfile
import zipfile
import requests
import pandas as pd
import torch
from torch import nn
from torch.utils import data

DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

def download(name, cache_dir=os.path.join('..', 'data')):
    """Download a file registered in DATA_HUB and return the local filename."""
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'Downloading {fname} from {url}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname
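# Helper sketch (illustrative, not part of the original script): compute the SHA-1 of a
# local file so a new dataset could be registered in DATA_HUB; the function name here is
# hypothetical.
def sha1_of_file(path, chunk_size=1048576):
    h = hashlib.sha1()
    with open(path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            h.update(chunk)
    return h.hexdigest()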

def download_extract(name, folder=None):
    """Download and extract a zip/tar file."""
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, 'Only zip/tar files can be extracted'
    fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():
    """Download every file in DATA_HUB."""
    for name in DATA_HUB:
        download(name)

# Download and cache the Kaggle house price dataset
DATA_HUB['kaggle_house_train'] = (DATA_URL + 'kaggle_house_pred_train.csv', '585e9cc93e70b39160e7921475f9bcd7d31219ce')
DATA_HUB['kaggle_house_test'] = (DATA_URL + 'kaggle_house_pred_test.csv', 'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')

# Load the training and test data from the two CSV files with pandas
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))

# The training set contains 1460 examples, each with 80 features and 1 label;
# the test set contains 1459 examples, each with 80 features
print(train_data.shape)
print(test_data.shape)

# Look at the first four and the last two features, together with the label (the sale price)
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

# In each example the first feature is the ID, which helps identify each training example.
# Convenient as that is, it carries no information for prediction, so we drop it before
# feeding the data to the model.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))  # the last column of the training data is the label

# Before modeling, the data needs preprocessing. First, replace all missing values with
# the mean of the corresponding feature.
# If the test data were unavailable, the mean and standard deviation could be computed from the training data alone.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index  # column index of the numerical features
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))  # rescale every numerical feature to zero mean and unit variance
# After standardization all means are zero, so the missing values can simply be set to 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)
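# Illustrative mini-example (not part of the original pipeline): after standardizing a toy
# column, filling NaN with 0 is equivalent to imputing the mean, because the standardized
# mean is 0.
_toy = pd.Series([1.0, 2.0, 3.0, float('nan')])
_toy = (_toy - _toy.mean()) / _toy.std()
print(_toy.fillna(0).tolist())  # [-1.0, 0.0, 1.0, 0.0]; the missing entry becomes the mean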

# Next, handle the discrete values, such as the 'MSZoning' feature. Replace them with a
# one-hot encoding, which pandas does for us automatically.
# dummy_na=True treats 'na' (missing) as a valid feature value and creates an indicator column for it.
# dtype=float keeps the dummy columns numeric (recent pandas versions otherwise return bool
# columns, which would break the torch.tensor conversion below).
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
print(all_features.shape)  # (2919, 331)
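# Illustrative mini-example (not part of the original pipeline): one-hot encoding a small
# categorical column with dummy_na=True adds an indicator column for missing values as well.
_toy_cat = pd.DataFrame({'MSZoning': ['RL', 'RM', None]})
print(pd.get_dummies(_toy_cat, dummy_na=True, dtype=float))
# Expected columns: MSZoning_RL, MSZoning_RM, MSZoning_nan, with exactly one 1.0 per row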

# The values attribute extracts the NumPy array from the pandas DataFrame, which we convert into tensors for training
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
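# Quick shape check (illustrative, not part of the original script): with the feature count
# printed above (331 in the original run), the tensors should be roughly
# train_features (1460, 331), test_features (1459, 331), train_labels (1460, 1).
print(train_features.shape, test_features.shape, train_labels.shape)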

# Training
loss_function = nn.MSELoss()
in_features = train_features.shape[1]

def get_net():
    """Build a fresh MLP, so that every cross-validation fold and the final run start from scratch."""
    return nn.Sequential(nn.Linear(in_features, 128),
                         nn.ReLU(),
                         nn.Dropout(0.1),
                         nn.Linear(128, 1))

# House prices, like stock prices, are about relative rather than absolute amounts, so we care
# about the relative error (y - y_hat) / y. One way to capture this is to measure the
# discrepancy in the logarithm of the price predictions.
def log_rmse(net, features, labels):
    # To further stabilize the value when taking the logarithm, clip predictions below 1 up to 1
    clipped_preds = torch.clamp(net(features), 1, float('inf'))  # values smaller than 1 become 1
    rmse = torch.sqrt(loss_function(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()
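# The metric above corresponds to sqrt(mean((log y_hat - log y)^2)). Illustrative check
# (not part of the original script): with an identity "net", predicting 110 and 90 for
# true prices of 100 gives sqrt((log(1.1)^2 + log(0.9)^2) / 2) ≈ 0.10.
print(log_rmse(nn.Identity(), torch.tensor([[110.0], [90.0]]), torch.tensor([[100.0], [100.0]])))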

def load_array(data_arrays, batch_size, is_train=True):
    """Wrap tensors in a PyTorch DataLoader; shuffle only during training."""
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)
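# Illustrative usage (not part of the original script): draw a single minibatch to see how
# the DataLoader groups the tensors; with batch_size=32 the shapes on the house data
# would be torch.Size([32, 331]) and torch.Size([32, 1]) in the original run.
for X_batch, y_batch in load_array((train_features, train_labels), 32):
    print(X_batch.shape, y_batch.shape)
    break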

def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = load_array((train_features, train_labels), batch_size)

    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)  # the Adam optimization algorithm is used here

    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            loss = loss_function(net(X), y)
            loss.backward()
            optimizer.step()

        # Record the log-RMSE on the full training (and, if given, validation) set after each epoch
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))

    return train_ls, test_ls

# K-fold cross-validation helps with model selection and hyperparameter tuning.
# First define a function that returns the data of the i-th fold in a K-fold split.
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k  # size of each fold
    X_train, y_train = None, None
    for j in range(k):  # split into k parts
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:  # the i-th fold is used as the validation data
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid
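# Illustrative sanity check (not part of the original script): with k=5 and 1460 training
# rows, each fold holds 1460 // 5 = 292 rows, so fold 0 should yield a training split of
# 1168 rows and a validation split of 292 rows.
_X_tr, _y_tr, _X_va, _y_va = get_k_fold_data(5, 0, train_features, train_labels)
print(_X_tr.shape, _X_va.shape)  # e.g. torch.Size([1168, 331]) torch.Size([292, 331])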

def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        fold_data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()  # a fresh network for every fold, so folds do not leak into each other
        train_ls, valid_ls = train(net, *fold_data, num_epochs, learning_rate, weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        print(f'Fold {i + 1}, train log rmse: {float(train_ls[-1]):f}, valid log rmse: {float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k

k, num_epochs, lr, weight_decay, batch_size = 5, 250, 0.5, 1e-3, 32

train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print(f'{k}-fold validation: average train log rmse: {float(train_l):f}, average valid log rmse: {float(valid_l):f}')

# Finally, train on all of the training data and predict on the test set
def train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size):
    net = get_net()  # retrain from scratch on the full training set
    train_ls, _ = train(net, train_features, train_labels, None, None, num_epochs, lr, weight_decay, batch_size)
    print(f'train log rmse: {float(train_ls[-1]):f}')
    # Apply the network to the test set
    preds = net(test_features).detach().numpy()
    # Reformat the predictions for export to Kaggle; saving them in a CSV file makes uploading the results straightforward
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('../data/submission.csv', index=False)
train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)
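# Quick check of the exported file (illustrative, not part of the original script): the
# submission should contain one row per test example (1459) with the 'Id' and 'SalePrice'
# columns built above.
_submission_check = pd.read_csv('../data/submission.csv')
print(_submission_check.shape)             # expected: (1459, 2)
print(_submission_check.columns.tolist())  # expected: ['Id', 'SalePrice']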