import math
import numpy as np

max_degree = 20  # Maximum degree of the polynomial
n_train, n_test = 100, 100  # Training and test dataset sizes
true_w = np.zeros(max_degree)  # Allocate lots of empty space
true_w[0:4] = np.array([5, 1.2, -3.4, 5.6])

features = np.random.normal(size=(n_train + n_test, 1))
np.random.shuffle(features)
poly_features = np.power(features, np.arange(max_degree).reshape(1, -1))
for i in range(max_degree):
    poly_features[:, i] /= math.gamma(i + 1)  # gamma(n) = (n-1)!
# Shape of labels: (n_train + n_test,)
labels = np.dot(poly_features, true_w)
labels += np.random.normal(scale=0.1, size=labels.shape)
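In other words, reading the data-generating process off the code above, the labels follow
\(y = 5 + 1.2x - 3.4\frac{x^2}{2!} + 5.6\frac{x^3}{3!} + \epsilon\), where \(\epsilon \sim \mathcal{N}(0, 0.1^2)\),
and all remaining entries of true_w are zero.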
Again, the monomials stored in poly_features are rescaled by the gamma function, where \(\Gamma(n)=(n-1)!\). Take a look at the first two samples from the generated dataset. The first value is the constant feature corresponding to the bias.
features[:2], poly_features[:2, :], labels[:2]
(array([[-0.03716067],
[-1.1468065 ]]),
array([[ 1.0000000e+00, -3.7160669e-02, 6.9045764e-04, -8.5526226e-06,
7.9455290e-08, -5.9052235e-10, 3.6573678e-12, -1.9415747e-14,
9.0187767e-17, -3.7238198e-19, 1.3837963e-21, -4.6747996e-24,
1.4476556e-26, -4.1381425e-29, 1.0984010e-31, -2.7211542e-34,
6.3199942e-37, -1.3815009e-39, 2.8516424e-42, -5.6051939e-45],
[ 1.0000000e+00, -1.1468065e+00, 6.5758252e-01, -2.5137332e-01,
7.2069131e-02, -1.6529869e-02, 3.1594271e-03, -5.1760738e-04,
7.4199430e-05, -9.4547095e-06, 1.0842723e-06, -1.1304095e-07,
1.0803007e-08, -9.5299690e-10, 7.8064499e-11, -5.9683248e-12,
4.2778208e-13, -2.8857840e-14, 1.8385754e-15, -1.1097317e-16]]),
array([ 5.1432443 , -0.06415121]))
# Convert from NumPy ndarrays to PyTorch tensors
true_w, features, poly_features, labels = [
    torch.tensor(x, dtype=torch.float32)
    for x in [true_w, features, poly_features, labels]]
features[:2], poly_features[:2, :], labels[:2]
(tensor([[ 1.6580],
[-1.6392]]),
tensor([[ 1.0000e+00, 1.6580e+00, 1.3745e+00, 7.5967e-01, 3.1489e-01,
1.0442e-01, 2.8855e-02, 6.8346e-03, 1.4165e-03, 2.6096e-04,
4.3267e-05, 6.5217e-06, 9.0110e-07, 1.1493e-07, 1.3611e-08,
1.5045e-09, 1.5590e-10, 1.5206e-11, 1.4006e-12, 1.2223e-13],
[ 1.0000e+00, -1.6392e+00, 1.3435e+00, -7.3408e-01, 3.0082e-01,
-9.8622e-02, 2.6944e-02, -6.3094e-03, 1.2928e-03, -2.3546e-04,
3.8597e-05, -5.7516e-06, 7.8567e-07, -9.9066e-08, 1.1599e-08,
-1.2676e-09, 1.2986e-10, -1.2522e-11, 1.1403e-12, -9.8378e-14]]),
tensor([ 6.6262, -5.4505]))
# Convert from NumPy ndarrays to TensorFlow tensors
true_w, features, poly_features, labels = [
    tf.constant(x, dtype=tf.float32)
    for x in [true_w, features, poly_features, labels]]
features[:2], poly_features[:2, :], labels[:2]
(<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[ 0.6081675 ],
[-0.68940073]], dtype=float32)>,
<tf.Tensor: shape=(2, 20), dtype=float32, numpy=
array([[ 1.00000000e+00, 6.08167529e-01, 1.84933856e-01,
3.74902524e-02, 5.70008857e-03, 6.93321694e-04,
7.02759571e-05, 6.10565030e-06, 4.64157239e-07,
3.13650403e-08, 1.90751970e-09, 1.05462868e-10,
5.34492416e-12, 2.50046837e-13, 1.08621693e-14,
4.40401202e-16, 1.67398559e-17, 5.98860979e-19,
2.02337667e-20, 6.47658893e-22],
[ 1.00000000e+00, -6.89400733e-01, 2.37636685e-01,
-5.46089672e-02, 9.41186585e-03, -1.29770942e-03,
1.49106971e-04, -1.46849225e-05, 1.26547457e-06,
-9.69354517e-08, 6.68273703e-09, -4.18825807e-10,
2.40615687e-11, -1.27600480e-12, 6.28341915e-14,
-2.88786248e-15, 1.24430913e-16, -5.04604451e-18,
1.93263719e-19, -7.01242849e-21]], dtype=float32)>,
<tf.Tensor: shape=(2,), dtype=float32, numpy=array([5.3732677, 2.9457862], dtype=float32)>)
# Convert from NumPy ndarrays to Paddle tensors
true_w, features, poly_features, labels = [
    paddle.to_tensor(x, dtype=paddle.float32)
    for x in [true_w, features, poly_features, labels]]
features[:2], poly_features[:2, :], labels[:2]
(Tensor(shape=[2, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
[[-1.81616867],
[ 2.37323785]]),
Tensor(shape=[2, 20], dtype=float32, place=Place(cpu), stop_gradient=True,
[[ 1. , -1.81616867, 1.64923418, -0.99842918, 0.45332894,
-0.16466436, 0.04984304, -0.01293191, 0.00293582, -0.00059244,
0.00010760, -0.00001776, 0.00000269, -0.00000038, 0.00000005,
-0.00000001, 0.00000000, -0.00000000, 0.00000000, -0.00000000],
[ 1. , 2.37323785, 2.81612873, 2.22778106, 1.32176352,
0.62737185, 0.24815042, 0.08413142, 0.02495798, 0.00658125,
0.00156189, 0.00033698, 0.00006664, 0.00001217, 0.00000206,
0.00000033, 0.00000005, 0.00000001, 0.00000000, 0.00000000]]),
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[-8.40298176, 10.88386822]))
4.4.4.2. Training and Testing the Model
First, let us implement a function to evaluate the loss of a model on a given dataset.
# MXNet/Gluon version
def evaluate_loss(net, data_iter, loss):  #@save
    """Evaluate the loss of a model on the given dataset"""
    metric = d2l.Accumulator(2)  # Sum of losses, no. of examples
    for X, y in data_iter:
        l = loss(net(X), y)
        metric.add(l.sum(), d2l.size(l))
    return metric[0] / metric[1]
# PyTorch version
def evaluate_loss(net, data_iter, loss):  #@save
    """Evaluate the loss of a model on the given dataset"""
    metric = d2l.Accumulator(2)  # Sum of losses, no. of examples
    for X, y in data_iter:
        out = net(X)
        y = y.reshape(out.shape)
        l = loss(out, y)
        metric.add(l.sum(), l.numel())
    return metric[0] / metric[1]
# TensorFlow version
def evaluate_loss(net, data_iter, loss):  #@save
    """Evaluate the loss of a model on the given dataset"""
    metric = d2l.Accumulator(2)  # Sum of losses, no. of examples
    for X, y in data_iter:
        l = loss(net(X), y)
        metric.add(tf.reduce_sum(l), d2l.size(l))
    return metric[0] / metric[1]
# PaddlePaddle version
def evaluate_loss(net, data_iter, loss):  #@save
    """Evaluate the loss of a model on the given dataset"""
    metric = d2l.Accumulator(2)  # Sum of losses, no. of examples
    for X, y in data_iter:
        out = net(X)
        y = y.reshape(out.shape)
        l = loss(out, y)
        metric.add(l.sum(), l.numel())
    return metric[0] / metric[1]
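As a quick smoke test of the function above, here is a hypothetical usage sketch. It assumes the PyTorch variant of evaluate_loss and the d2l.load_array helper from earlier chapters; the names X, y, and net here are made up for illustration.

import torch
from torch import nn

X = torch.randn(20, 4)  # 20 random examples with 4 features
y = (X @ torch.tensor([5.0, 1.2, -3.4, 5.6])).reshape(-1, 1)
net = nn.Linear(4, 1, bias=False)  # an untrained linear model
data_iter = d2l.load_array((X, y), batch_size=10, is_train=False)
# Average squared error of the untrained model over the dataset
print(evaluate_loss(net, data_iter, nn.MSELoss(reduction='none')))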
Now let us define the training function.
# MXNet/Gluon version
def train(train_features, test_features, train_labels, test_labels,
          num_epochs=400):
    loss = gluon.loss.L2Loss()
    net = nn.Sequential()
    # Switch off the bias since we already implemented it in the
    # polynomial features
    net.add(nn.Dense(1, use_bias=False))
    net.initialize()
    batch_size = min(10, train_labels.shape[0])
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    test_iter = d2l.load_array((test_features, test_labels), batch_size,
                               is_train=False)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.01})
    animator = d2l.Animator(xlabel='epoch', ylabel='loss', yscale='log',
                            xlim=[1, num_epochs], ylim=[1e-3, 1e2],
                            legend=['train', 'test'])
    for epoch in range(num_epochs):
        d2l.train_epoch_ch3(net, train_iter, loss, trainer)
        if epoch == 0 or (epoch + 1) % 20 == 0:
            animator.add(epoch + 1, (evaluate_loss(net, train_iter, loss),
                                     evaluate_loss(net, test_iter, loss)))
    print('weight:', net[0].weight.data().asnumpy())
# PyTorch version
def train(train_features, test_features, train_labels, test_labels,
          num_epochs=400):
    loss = nn.MSELoss(reduction='none')
    input_shape = train_features.shape[-1]
    # Switch off the bias since we already implemented it in the
    # polynomial features
    net = nn.Sequential(nn.Linear(input_shape, 1, bias=False))
    batch_size = min(10, train_labels.shape[0])
    train_iter = d2l.load_array((train_features, train_labels.reshape(-1, 1)),
                                batch_size)
    test_iter = d2l.load_array((test_features, test_labels.reshape(-1, 1)),
                               batch_size, is_train=False)
    trainer = torch.optim.SGD(net.parameters(), lr=0.01)
    animator = d2l.Animator(xlabel='epoch', ylabel='loss', yscale='log',
                            xlim=[1, num_epochs], ylim=[1e-3, 1e2],
                            legend=['train', 'test'])
    for epoch in range(num_epochs):
        d2l.train_epoch_ch3(net, train_iter, loss, trainer)
        if epoch == 0 or (epoch + 1) % 20 == 0:
            animator.add(epoch + 1, (evaluate_loss(net, train_iter, loss),
                                     evaluate_loss(net, test_iter, loss)))
    print('weight:', net[0].weight.data.numpy())
# TensorFlow version
def train(train_features, test_features, train_labels, test_labels,
          num_epochs=400):
    loss = tf.losses.MeanSquaredError()
    input_shape = train_features.shape[-1]
    # Switch off the bias since we already implemented it in the
    # polynomial features
    net = tf.keras.Sequential()
    net.add(tf.keras.layers.Dense(1, use_bias=False))
    batch_size = min(10, train_labels.shape[0])
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    test_iter = d2l.load_array((test_features, test_labels), batch_size,
                               is_train=False)
    trainer = tf.keras.optimizers.SGD(learning_rate=.01)
    animator = d2l.Animator(xlabel='epoch', ylabel='loss', yscale='log',
                            xlim=[1, num_epochs], ylim=[1e-3, 1e2],
                            legend=['train', 'test'])
    for epoch in range(num_epochs):
        d2l.train_epoch_ch3(net, train_iter, loss, trainer)
        if epoch == 0 or (epoch + 1) % 20 == 0:
            animator.add(epoch + 1, (evaluate_loss(net, train_iter, loss),
                                     evaluate_loss(net, test_iter, loss)))
    print('weight:', net.get_weights()[0].T)
# PaddlePaddle version
def train(train_features, test_features, train_labels, test_labels,
          num_epochs=400):
    loss = nn.MSELoss()
    input_shape = train_features.shape[-1]
    # Switch off the bias since we already implemented it in the
    # polynomial features
    net = nn.Sequential(nn.Linear(input_shape, 1, bias_attr=False))
    batch_size = min(10, train_labels.shape[0])
    train_iter = d2l.load_array((train_features, train_labels.reshape([-1, 1])),
                                batch_size)
    test_iter = d2l.load_array((test_features, test_labels.reshape([-1, 1])),
                               batch_size, is_train=False)
    trainer = paddle.optimizer.SGD(parameters=net.parameters(),
                                   learning_rate=0.01)
    animator = d2l.Animator(xlabel='epoch', ylabel='loss', yscale='log',
                            xlim=[1, num_epochs], ylim=[1e-3, 1e2],
                            legend=['train', 'test'])
    for epoch in range(num_epochs):
        d2l.train_epoch_ch3(net, train_iter, loss, trainer)
        if epoch == 0 or (epoch + 1) % 20 == 0:
            animator.add(epoch + 1, (evaluate_loss(net, train_iter, loss),
                                     evaluate_loss(net, test_iter, loss)))
    print('weight:', net[0].weight.numpy())
We will begin by using a third-order polynomial function, which has the same order as the data-generating function. The results show that this model reduces both the training loss and the test loss effectively. The learned model parameters are also close to the true values \(w = [5, 1.2, -3.4, 5.6]\).
# Pick the first four dimensions from the polynomial features,
# i.e., 1, x, x^2/2!, x^3/3!
train(poly_features[:n_train, :4], poly_features[n_train:, :4],
labels[:n_train], labels[n_train:])
weight: [[ 5.019143 1.2220006 -3.4235666 5.571755 ]]
# Pick the first four dimensions from the polynomial features,
# i.e., 1, x, x^2/2!, x^3/3!
train(poly_features[:n_train, :4], poly_features[n_train:, :4],
labels[:n_train], labels[n_train:])
weight: [[ 5.010476 1.2354498 -3.4229028 5.503297 ]]
# Pick the first four dimensions from the polynomial features,
# i.e., 1, x, x^2/2!, x^3/3!
train(poly_features[:n_train, :4], poly_features[n_train:, :4],
labels[:n_train], labels[n_train:])
weight: [[ 5.0158176 1.2786117 -3.5013676 5.3130755]]
# Pick the first four dimensions from the polynomial features,
# i.e., 1, x, x^2/2!, x^3/3!
train(poly_features[:n_train, :4], poly_features[n_train:, :4],
labels[:n_train], labels[n_train:])
weight: [[ 4.9773173]
[ 1.2528117]
[-3.3558397]
[ 5.4857626]]
After the last epoch has completed, the training loss is still high. When used to fit nonlinear patterns (like the third-order polynomial function here), linear models are prone to underfitting.
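As a rough sanity check on the weights printed below (a back-of-the-envelope calculation added here, not part of the original text): since \(x \sim \mathcal{N}(0, 1)\), we have \(E[x^2] = 1\), \(E[x^3] = 0\), and \(E[x^4] = 3\), so the best linear predictor of \(y\) has slope \(\mathrm{Cov}(x, y)/\mathrm{Var}(x) = 1.2 + \frac{5.6}{3!} \cdot 3 = 4.0\) and intercept \(E[y] = 5 - \frac{3.4}{2!} \cdot 1 = 3.3\). The learned weights below are roughly consistent with \([3.3, 4.0]\).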
# Pick the first two dimensions from the polynomial features, i.e., 1 and x
train(poly_features[:n_train, :2], poly_features[n_train:, :2],
labels[:n_train], labels[n_train:])
weight: [[2.6972198 4.2307568]]
# Pick the first two dimensions from the polynomial features, i.e., 1 and x
train(poly_features[:n_train, :2], poly_features[n_train:, :2],
labels[:n_train], labels[n_train:])
weight: [[3.4049764 3.9939284]]
# Pick the first two dimensions from the polynomial features, i.e., 1 and x
train(poly_features[:n_train, :2], poly_features[n_train:, :2],
labels[:n_train], labels[n_train:])
weight: [[3.4558759 3.9041805]]
# Pick the first two dimensions from the polynomial features, i.e., 1 and x
train(poly_features[:n_train, :2], poly_features[n_train:, :2],
labels[:n_train], labels[n_train:])
weight: [[3.6002903]
[2.922718 ]]
Although the training loss can be effectively reduced, the test loss is still high. This shows that the complex model overfits the data.
# Pick all the dimensions from the polynomial features
train(poly_features[:n_train, :], poly_features[n_train:, :],
labels[:n_train], labels[n_train:], num_epochs=1500)
weight: [[ 4.992059 1.3058902 -3.3531423 5.1164446 -0.11141003 1.3032119
0.12679203 0.16655836 0.05129979 -0.02275036 0.0080622 -0.05167824
-0.02426303 -0.01502204 -0.04941358 0.06389866 -0.04761846 -0.04380165
-0.05188227 0.05655775]]
# Pick all the dimensions from the polynomial features
train(poly_features[:n_train, :], poly_features[n_train:, :],
labels[:n_train], labels[n_train:], num_epochs=1500)
weight: [[ 4.9849787 1.2896876 -3.2996354 5.145749 -0.34205326 1.2237961
0.20393135 0.3027379 -0.20079008 -0.16337848 0.11026663 0.21135856
-0.00940325 0.11873583 -0.15114897 -0.05347819 0.17096086 0.1863975
-0.09107699 -0.02123026]]
# Pick all the dimensions from the polynomial features
train(poly_features[:n_train, :], poly_features[n_train:, :],
labels[:n_train], labels[n_train:], num_epochs=1500)
weight: [[ 4.945069 1.3180096 -3.1279085 4.986947 -1.1656193 0.46508536
-0.6674344 -0.135499 0.3463548 -0.14836456 0.02399637 -0.5209355
-0.13330926 0.49879497 0.3075923 0.4957083 -0.3921882 -0.1127753
-0.16941413 0.34570968]]
# Pick all the dimensions from the polynomial features
train(poly_features[:n_train, :], poly_features[n_train:, :],
labels[:n_train], labels[n_train:], num_epochs=1500)
weight: [[ 4.9901624 ]
[ 1.2774394 ]
[-3.3785357 ]
[ 5.205178 ]
[-0.06694006]
[ 1.3129296 ]
[-0.24143514]
[ 0.40482703]
[ 0.18090816]
[-0.05139386]
[ 0.4786992 ]
[ 0.27528012]
[ 0.23897053]
[ 0.17378598]
[ 0.3427784 ]
[-0.03536515]
[ 0.36817062]
[-0.1378502 ]
[ 0.11173433]
[-0.13340428]]
4.4.5. Summary

Underfitting means that a model is unable to reduce the training error further. Overfitting means that the training error is much lower than the validation error.
Since the generalization error cannot be estimated from the training error, simply minimizing the training error does not necessarily mean a reduction in the generalization error. Machine learning models need to guard against overfitting, i.e., against a large generalization error.
A validation set can be used for model selection, provided that it is not used too liberally.
We should choose a model of appropriate complexity and avoid using an insufficient number of training examples.
4.4.6. Exercises
1. Can this polynomial regression problem be solved exactly? Hint: use linear algebra. (A least-squares sketch follows this list.)
2. Consider model selection for polynomials:
   1. Plot the training loss vs. model complexity (the degree of the polynomial). What do you observe? What degree of polynomial do you need to reduce the training loss to 0?
   2. Plot the test loss in this case.
   3. Generate the same plot as a function of the amount of data.
3. What happens if you drop the standardization of the polynomial features \(x^i\) by \(1/i!\)? Can you solve this in some other way?
4. Can the generalization error ever be zero?
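For the first exercise, here is a minimal self-contained sketch, an illustrative starting point rather than the book's official solution; the names P, y, and w_hat are made up for illustration. Since the model is linear in the rescaled polynomial features, ordinary least squares recovers the coefficients in closed form, up to the label noise.

import math
import numpy as np

max_degree, n = 20, 200
true_w = np.zeros(max_degree)
true_w[0:4] = np.array([5, 1.2, -3.4, 5.6])
x = np.random.normal(size=(n, 1))
# Same rescaled polynomial features as in the data-generation code above
P = np.power(x, np.arange(max_degree).reshape(1, -1))
for i in range(max_degree):
    P[:, i] /= math.gamma(i + 1)
y = P @ true_w + np.random.normal(scale=0.1, size=n)
# Minimum-norm least-squares solution: no gradient descent needed
w_hat, *_ = np.linalg.lstsq(P, y, rcond=None)
print(w_hat[:4])  # close to [5, 1.2, -3.4, 5.6]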