gru_layer
What is a GRU layer?
A GRU (Gated Recurrent Unit) layer is a type of recurrent neural network (RNN) layer used to process long sequences.
GRU is based on essentially the same idea as LSTM (Long Short-Term Memory). Like LSTM, it is an RNN-family layer that processes sequence data. Unlike LSTM, however, GRU simplifies the gating mechanism that protects memory and controls how much information is carried over from the previous state, so it can maintain long-term state with less computation.
GRU has a simpler structure than LSTM and needs fewer parameters. As a result it tends to train faster than LSTM and to generalize better on small datasets.
A GRU layer regulates its memory with two gates. The first, the update gate, decides how much of the previous state is kept and how much is replaced by the candidate state computed from the current input. The second, the reset gate, decides how much of the previous state to forget when computing that candidate state. Using these gates, the GRU layer combines the input sequence with the previous state and emits a new state at every time step.
GRU layers are used mainly in natural language processing (NLP), where sequence data is common. Models built with GRU layers perform well on tasks such as text generation, machine translation, and sentiment analysis.
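In equation form, what forward_gru_layer below computes is the standard GRU, written here in the convention this implementation uses, where z weights the previous state (biases are folded into the connected layers and omitted; x_t is the current input, h_{t-1} the previous state):

z_t = \sigma(U_z x_t + W_z h_{t-1})
r_t = \sigma(U_r x_t + W_r h_{t-1})
\tilde{h}_t = \tanh(U_h x_t + W_h (r_t \odot h_{t-1}))
h_t = z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t

Many references write the last line with z_t and (1 - z_t) swapped; the form above matches the weighted_sum_cpu call in the forward pass. When l.tanh is 0, the candidate \tilde{h}_t uses the logistic function instead of \tanh.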
increment_layer
static void increment_layer(layer *l, int steps)
{
    int num = l->outputs*l->batch*steps;
    l->output += num;
    l->delta += num;
    l->x += num;
    l->x_norm += num;

#ifdef GPU
    l->output_gpu += num;
    l->delta_gpu += num;
    l->x_gpu += num;
    l->x_norm_gpu += num;
#endif
}
Function name: increment_layer
Input:
layer *l: the layer to update
int steps: the number of time steps to advance
Operation:
Advances the output, delta, x, and x_norm pointers of the layer pointed to by l by steps time steps.
When built with GPU support, the output_gpu, delta_gpu, x_gpu, and x_norm_gpu pointers are advanced by the same amount.
Description:
This function shifts the layer's buffer pointers forward by steps time steps.
Moving the pointers makes subsequent computations read and write the buffer slice for the current time step rather than the one before it.
Under GPU, the corresponding pointers into GPU memory are moved in the same way.
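As a concrete illustration of the pointer arithmetic, the standalone snippet below reproduces only the fields that the CPU path of increment_layer touches (mini_layer is a hypothetical stand-in, not the darknet layer struct): with outputs = 4 and batch = 2, one time step occupies 8 floats, so advancing by one step moves each pointer 8 elements forward.

    #include <assert.h>
    #include <stdlib.h>

    /* Hypothetical stand-in for a darknet sub-layer; only the fields that the
     * CPU path of increment_layer touches are reproduced here. */
    typedef struct { int outputs, batch; float *output, *delta; } mini_layer;

    static void mini_increment(mini_layer *l, int steps)
    {
        int num = l->outputs * l->batch * steps;   /* elements to skip per advanced step */
        l->output += num;
        l->delta  += num;
    }

    int main(void)
    {
        mini_layer sub = { .outputs = 4, .batch = 2 };
        float *out0 = calloc(4 * 2 * 3, sizeof(float));   /* storage for 3 time steps */
        float *del0 = calloc(4 * 2 * 3, sizeof(float));
        sub.output = out0;
        sub.delta  = del0;
        mini_increment(&sub, 1);                          /* move to time step 1 */
        assert(sub.output == out0 + 4 * 2);               /* advanced outputs*batch elements */
        assert(sub.delta  == del0 + 4 * 2);
        free(out0);
        free(del0);
        return 0;
    }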
forward_gru_layer
void forward_gru_layer(layer l, network net)
{
    network s = net;
    s.train = net.train;
    int i;
    layer uz = *(l.uz);
    layer ur = *(l.ur);
    layer uh = *(l.uh);

    layer wz = *(l.wz);
    layer wr = *(l.wr);
    layer wh = *(l.wh);

    fill_cpu(l.outputs * l.batch * l.steps, 0, uz.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, ur.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, uh.delta, 1);

    fill_cpu(l.outputs * l.batch * l.steps, 0, wz.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wr.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wh.delta, 1);
    if(net.train) {
        fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
        copy_cpu(l.outputs*l.batch, l.state, 1, l.prev_state, 1);
    }

    for (i = 0; i < l.steps; ++i) {
        s.input = l.state;
        forward_connected_layer(wz, s);
        forward_connected_layer(wr, s);

        s.input = net.input;
        forward_connected_layer(uz, s);
        forward_connected_layer(ur, s);
        forward_connected_layer(uh, s);

        copy_cpu(l.outputs*l.batch, uz.output, 1, l.z_cpu, 1);
        axpy_cpu(l.outputs*l.batch, 1, wz.output, 1, l.z_cpu, 1);

        copy_cpu(l.outputs*l.batch, ur.output, 1, l.r_cpu, 1);
        axpy_cpu(l.outputs*l.batch, 1, wr.output, 1, l.r_cpu, 1);

        activate_array(l.z_cpu, l.outputs*l.batch, LOGISTIC);
        activate_array(l.r_cpu, l.outputs*l.batch, LOGISTIC);

        copy_cpu(l.outputs*l.batch, l.state, 1, l.forgot_state, 1);
        mul_cpu(l.outputs*l.batch, l.r_cpu, 1, l.forgot_state, 1);

        s.input = l.forgot_state;
        forward_connected_layer(wh, s);

        copy_cpu(l.outputs*l.batch, uh.output, 1, l.h_cpu, 1);
        axpy_cpu(l.outputs*l.batch, 1, wh.output, 1, l.h_cpu, 1);

        if(l.tanh){
            activate_array(l.h_cpu, l.outputs*l.batch, TANH);
        } else {
            activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC);
        }

        weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs*l.batch, l.output);

        copy_cpu(l.outputs*l.batch, l.output, 1, l.state, 1);

        net.input += l.inputs*l.batch;
        l.output += l.outputs*l.batch;
        increment_layer(&uz, 1);
        increment_layer(&ur, 1);
        increment_layer(&uh, 1);

        increment_layer(&wz, 1);
        increment_layer(&wr, 1);
        increment_layer(&wh, 1);
    }
}
Function name: forward_gru_layer
Input:
layer l: layer structure holding the GRU layer's parameters and buffers
network net: network structure holding the network's parameters and input
Operation:
Performs forward propagation of the input data through the GRU layer: the input is processed through the GRU gates, the output for each time step is computed, and that output becomes the input of the next layer.
The intermediate values needed for backward propagation are stored along the way.
Description:
Among the GRU layer's sub-layers, uz, ur, and uh are the connected (weight) layers that process the current input (net.input), while wz, wr, and wh are the connected layers that process the previous state (l.state).
A GRU layer is a kind of RNN for sequence (time-series) data; it reuses the state from the previous time step.
forward_connected_layer computes weights * input + bias for each sub-layer (the sub-layers are created with a LINEAR activation); the logistic and tanh activations are applied afterwards inside forward_gru_layer.
The z and r values are computed by adding the outputs of the uz and ur sub-layers (current input) to the outputs of the wz and wr sub-layers (previous state).
z, the update gate, is the logistic function applied to that combined sum.
r, the reset gate, is computed the same way, also with the logistic function.
The candidate state h is computed from uh (current input) plus wh applied to the reset-gated previous state, i.e. r multiplied elementwise with the previous state (l.forgot_state).
Tanh (or logistic, depending on l.tanh) is applied to h, and the new output/state is the z-weighted combination of the previous state and h, computed by weighted_sum_cpu.
Because the GRU layer spans several time steps, this sequence of forward_connected_layer calls is repeated steps times, with the per-step pointers advanced at the end of every iteration.
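To make the per-element arithmetic of one time step explicit, the hypothetical helper below (not part of gru_layer.c) mirrors the calls above for a single element. uz_o, ur_o, and uh_o stand for the connected-layer outputs computed from the current input, and wz_o, wr_o for those computed from the previous state; in the real code wh runs on the full vector r * state, which is collapsed here into a single scalar weight to keep the sketch self-contained. The last line is exactly what weighted_sum_cpu computes.

    #include <math.h>
    #include <stdio.h>

    static float logistic(float x) { return 1.f / (1.f + expf(-x)); }

    /* One GRU element, mirroring forward_gru_layer: z and r come from the summed
     * current-input and previous-state contributions, the candidate h uses the
     * reset-gated previous state, and the return value is the z-weighted mix
     * that weighted_sum_cpu produces. */
    static float gru_step_element(float state, float uz_o, float wz_o,
                                  float ur_o, float wr_o,
                                  float uh_o, float wh_weight, int use_tanh)
    {
        float z = logistic(uz_o + wz_o);              /* update gate */
        float r = logistic(ur_o + wr_o);              /* reset gate */
        float h = uh_o + wh_weight * (r * state);     /* candidate state (pre-activation) */
        h = use_tanh ? tanhf(h) : logistic(h);
        return z * state + (1.f - z) * h;             /* new state / output */
    }

    int main(void)
    {
        /* hypothetical numbers, just to show one step of the recurrence */
        float state = 0.5f;
        float next = gru_step_element(state, 0.1f, -0.2f, 0.3f, 0.0f, 0.4f, 0.7f, 1);
        printf("next state = %f\n", next);
        return 0;
    }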
backward_gru_layer
void backward_gru_layer(layer l, network net)
{
}
Function name: backward_gru_layer
Input:
layer l
network net (both structures)
Operation:
Intended to perform the backward pass (backpropagation) of the GRU (Gated Recurrent Unit) layer and pass the error signal to the previous layer, which requires computing the gradients with respect to the inputs and weights.
Description:
l: the GRU layer structure, holding the inputs, weights, outputs, and other state.
net: the network structure, used to deliver the error signal to the previous layer during the backward pass.
In this CPU implementation the function body is empty, so calling it performs no computation; in the darknet source the actual GRU backward pass is provided only by the GPU version (backward_gru_layer_gpu).
update_gru_layer
void update_gru_layer(layer l, update_args a)
{
    update_connected_layer(*(l.ur), a);
    update_connected_layer(*(l.uz), a);
    update_connected_layer(*(l.uh), a);
    update_connected_layer(*(l.wr), a);
    update_connected_layer(*(l.wz), a);
    update_connected_layer(*(l.wh), a);
}
Function name: update_gru_layer
Input:
layer l: GRU layer structure
update_args a: update-arguments structure (learning rate, momentum, decay, batch size, and so on)
Operation:
Updates the weights and biases of each connected sub-layer of the GRU layer (ur, uz, uh, wr, wz, wh).
Description:
For the GRU layer structure l given as input, the weights and biases of its connected sub-layers (ur, uz, uh, wr, wz, wh) are updated.
This is done by calling update_connected_layer() once for each sub-layer, as illustrated below.
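Each of those update_connected_layer calls amounts, in essence, to a momentum SGD step with weight decay on that sub-layer's weights and biases. The snippet below is a simplified, self-contained illustration of that kind of update rule; the function name and arguments are placeholders, not the darknet API.

    /* Simplified momentum SGD with weight decay, shown only to illustrate the
     * kind of update applied per connected sub-layer; not the darknet source. */
    void sgd_momentum_update(float *weights, float *weight_updates, int n,
                             float learning_rate, float momentum,
                             float decay, int batch)
    {
        for (int i = 0; i < n; ++i) {
            weight_updates[i] -= decay * batch * weights[i];           /* L2 weight decay */
            weights[i]        += (learning_rate / batch) * weight_updates[i];
            weight_updates[i] *= momentum;                             /* carry momentum forward */
        }
    }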
make_gru_layer
layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
{
    fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs);
    batch = batch / steps;
    layer l = {0};
    l.batch = batch;
    l.type = GRU;
    l.steps = steps;
    l.inputs = inputs;

    l.uz = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
    l.uz->batch = batch;

    l.wz = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
    l.wz->batch = batch;

    l.ur = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
    l.ur->batch = batch;

    l.wr = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
    l.wr->batch = batch;

    l.uh = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
    l.uh->batch = batch;

    l.wh = malloc(sizeof(layer));
    fprintf(stderr, "\t\t");
    *(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
    l.wh->batch = batch;

    l.batch_normalize = batch_normalize;

    l.outputs = outputs;
    l.output = calloc(outputs*batch*steps, sizeof(float));
    l.delta = calloc(outputs*batch*steps, sizeof(float));
    l.state = calloc(outputs*batch, sizeof(float));
    l.prev_state = calloc(outputs*batch, sizeof(float));
    l.forgot_state = calloc(outputs*batch, sizeof(float));
    l.forgot_delta = calloc(outputs*batch, sizeof(float));

    l.r_cpu = calloc(outputs*batch, sizeof(float));
    l.z_cpu = calloc(outputs*batch, sizeof(float));
    l.h_cpu = calloc(outputs*batch, sizeof(float));

    l.forward = forward_gru_layer;
    l.backward = backward_gru_layer;
    l.update = update_gru_layer;

    return l;
}
Function name: make_gru_layer
Input:
int batch: batch size summed over all time steps (see below)
int inputs: size of the input
int outputs: size of the output
int steps: number of time steps
int batch_normalize: whether to use batch normalization
int adam: whether to use the Adam optimizer
Operation:
Creates and initializes a GRU layer. The GRU layer is composed of the connected sub-layers uz, wz, ur, wr, uh, and wh.
Description:
The batch value received as input is divided by steps, so the caller passes the batch across all time steps and the layer stores the per-step batch.
The layer type is set to GRU.
The connected sub-layers uz, wz, ur, wr, uh, and wh are created and initialized: uz, ur, and uh map inputs to outputs, wz, wr, and wh map outputs to outputs, and all of them use a LINEAR activation.
The buffers output, delta, state, prev_state, forgot_state, forgot_delta, r_cpu, z_cpu, and h_cpu are allocated and zero-initialized.
The forward, backward, and update function pointers are set to forward_gru_layer, backward_gru_layer, and update_gru_layer.
The initialized GRU layer is returned.
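A minimal usage sketch with hypothetical sizes (in darknet these values normally come from the .cfg parser): the first argument is the batch summed over all time steps, so with 4 steps and a per-step batch of 16 the caller passes 64 and the layer stores 64 / 4 = 16 in l.batch.

    /* Hypothetical construction: 256 inputs, 128 outputs, 4 time steps,
     * batch normalization on, adam off. */
    layer l = make_gru_layer(64, 256, 128, 4, 1, 0);
    /* afterwards: l.batch == 16, l.outputs == 128,
     * and l.output holds 128 * 16 * 4 floats (one slice per time step) */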