Theory and Code: Three Types of Gradient Descent

Suppose there are $n$ training examples $(x^1, y^1), (x^2, y^2), \dots, (x^n, y^n)$, where $x^i=[x_{1}^i, x_{2}^i, \dots, x_{k}^i]$, and the parameters to be solved for are $\theta=[\theta_{1}, \theta_{2},\dots, \theta_{k}]$. Given a loss function $L=L(x,y;\theta)$, the optimization problem becomes $\theta^{\star}=\arg\min_{\theta} L(x,y;\theta)$.
For notational convenience, let $g=\frac{\partial L(x,y;\theta)}{\partial{\theta}}$, and let $\theta^t$ denote the parameter value at iteration $t$ (generally speaking, each iteration trains on all the data); other superscripted quantities such as $g^t$ are defined analogously.

Vanilla Gradient Descent

Taking linear regression as an example, the loss function can be $L(x,y;\theta)=\sum_{i=1}^{n}\left(y^i-(b+\theta\cdot x^{i})\right)^2$
$\theta^{t+1}\leftarrow \theta^{t}-\frac{\eta}{\sqrt{t+1}}\cdot g^{t}$
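
As a concrete check of what $g$ looks like here (this is exactly the gradient the code at the end of this post computes, with a single scalar feature and $\theta$ written as $w$), differentiating the squared-error loss of the model $y=wx+b$ gives
$\frac{\partial L}{\partial w}=\sum_{i=1}^{n}2\left(y^i-(wx^i+b)\right)\cdot(-x^i)$, $\qquad \frac{\partial L}{\partial b}=\sum_{i=1}^{n}2\left(y^i-(wx^i+b)\right)\cdot(-1)$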

Adaptive Gradient Descent (AdaGrad)

As before, taking linear regression as an example, the loss function can be $L(x,y;\theta)=\sum_{i=1}^{n}\left(y^i-(b+\theta\cdot x^{i})\right)^2$
$\theta^{t+1}\leftarrow \theta^{t}-\frac{\eta}{\sqrt{\sum_{i=0}^{t}(g^{i})^{2}}}\cdot g^{t}$
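
A minimal sketch of this per-parameter scaling in isolation (the names adagrad_step and accum, and the small eps term, are illustrative additions rather than part of the full code below; eps simply guards against dividing by zero on the first step):

import numpy as np

def adagrad_step(theta, grad, accum, eta=0.01, eps=1e-8):
    # accumulate the squared gradient, then scale the step by its square root
    accum = accum + grad ** 2
    theta = theta - eta / np.sqrt(accum + eps) * grad
    return theta, accum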

Stochastic Gradient Descent (SGD)

The difference from vanilla gradient descent is that the loss does not need to cover all of the training data (there is no $\Sigma$ over examples in the loss); instead, one example is picked at each step by some selection rule.
Again taking linear regression as an example, if the example $(x^i, y^i)$ is selected, the loss function can be $L(x,y;\theta)=\left(y^i-(b+\theta\cdot x^{i})\right)^2$
$\theta^{t+1}\leftarrow \theta^{t}-\eta\cdot g^{t}$
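
A minimal sketch of one SGD step that picks a single example uniformly at random (the helper name sgd_step and the use of np.random.randint are illustrative assumptions; the full code below instead sweeps through the examples in order, updating after each one, rather than sampling randomly):

import numpy as np

def sgd_step(weight, bias, x_data, y_data, learning_rate):
    i = np.random.randint(len(x_data))               # pick one training example at random
    err = y_data[i] - (weight * x_data[i] + bias)    # residual for that example
    weight -= learning_rate * (-2 * err * x_data[i])
    bias -= learning_rate * (-2 * err)
    return weight, bias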

Taking simple linear regression as an example, the code is as follows; it generally reaches the target after about 3000 iterations:

#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np

# use y = w*x + b as demo model

def get_grad(x_data, y_data, weight, bias, learning_rate, t, sum_grad_weight, sum_grad_bias, grad_type):
    if 2 == grad_type:
        return stochastic_gradient(x_data, y_data, weight, bias, learning_rate)
    elif 1 == grad_type:
        return adaptive_gradient(x_data, y_data, weight, bias, learning_rate, sum_grad_weight, sum_grad_bias)
    else:
        return normal_gradient(x_data, y_data, weight, bias, learning_rate, t)

def stochastic_gradient(x_data, y_data, weight, bias, learning_rate):
    # update the parameters after every single example (one pass over the data)
    for i in range(len(x_data)):
        delt_weight = 2 * (y_data[i] - (weight * x_data[i] + bias)) * -1 * x_data[i]
        delt_bias = 2 * (y_data[i] - (weight * x_data[i] + bias)) * -1
        weight -= learning_rate * delt_weight
        bias -= learning_rate * delt_bias

    return weight, bias

def adaptive_gradient(x_data, y_data, weight, bias, learning_rate, sum_grad_weight, sum_grad_bias):
    # full-batch gradient, scaled by the accumulated squared gradients (AdaGrad)
    delt_weight = 0.0
    delt_bias = 0.0
    for i in range(len(x_data)):
        delt_weight += 2 * (y_data[i] - (weight * x_data[i] + bias)) * -1 * x_data[i]
        delt_bias += 2 * (y_data[i] - (weight * x_data[i] + bias)) * -1

    sum_grad_weight += delt_weight ** 2
    sum_grad_bias += delt_bias ** 2

    delt_weight *= learning_rate / np.sqrt(sum_grad_weight)
    delt_bias *= learning_rate / np.sqrt(sum_grad_bias)

    return delt_weight, delt_bias, sum_grad_weight, sum_grad_bias

def normal_gradient(x_data, y_data, weight, bias, learning_rate, t):
    # full-batch gradient with a learning rate that decays as 1/sqrt(t + 1)
    delt_weight = 0.0
    delt_bias = 0.0
    for i in range(len(x_data)):
        delt_weight += 2 * (y_data[i] - (weight * x_data[i] + bias)) * -1 * x_data[i]
        delt_bias += 2 * (y_data[i] - (weight * x_data[i] + bias)) * -1

    delt_weight *= learning_rate / np.sqrt(1 + t)
    delt_bias *= learning_rate / np.sqrt(1 + t)

    return delt_weight, delt_bias

# train data
x_data = np.random.rand(100).astype(np.float32)
y_data = x_data * 0.3 + 0.2

# init
weight = 0.2
bias = 0.0
learning_rate = 0.01
iteration = 10000

# flags for the 3 kinds of gradient descent
NORMAL = 0
ADAPTIVE = 1
STOCHASTIC = 2

# accumulated squared gradients, used by adaptive grad
sum_grad_weight = 0.0
sum_grad_bias = 0.0


print(weight)
print(bias)
for t in range(iteration):
    # normal: uncomment to use vanilla gradient descent
    '''
    delt_weight, delt_bias = get_grad(x_data, y_data, weight, bias, learning_rate, t, sum_grad_weight, sum_grad_bias, NORMAL)
    weight -= delt_weight
    bias -= delt_bias
    '''

    # adaptive: currently active
    #'''
    delt_weight, delt_bias, sum_grad_weight, sum_grad_bias = get_grad(x_data, y_data, weight, bias, learning_rate, t, sum_grad_weight, sum_grad_bias, ADAPTIVE)
    weight -= delt_weight
    bias -= delt_bias
    #'''

    # stochastic: uncomment to use stochastic gradient descent
    '''
    weight, bias = get_grad(x_data, y_data, weight, bias, learning_rate, t, sum_grad_weight, sum_grad_bias, STOCHASTIC)
    '''

    if t % 1000 == 0:
        print(weight)
        print(bias)
