finetune : zero the loraB initial vectors #4082

Merged 3 commits on Nov 17, 2023

Changes from 2 commits
common/train.cpp (44 additions, 0 deletions)

@@ -68,6 +68,50 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
     free(rnd);
 }
 
+struct ggml_tensor * zero_tensor(struct ggml_tensor * tensor) {
+    switch (tensor->n_dims) {
+        case 1:
+            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
+                *dst = 0.0f;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+                for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                    float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+                    *dst = 0.0f;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                        float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
+                        *dst = 0.0f;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
+                for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+                    for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+                        for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                            float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
+                            *dst = 0.0f;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            die("Unsupported tensor->n_dims");
+    }
+    return tensor;
+}
+
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
     float scale = 1.0f; // xavier
     switch (tensor->n_dims) {
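For context: `zero_tensor` addresses each element through the byte strides in `nb[]` rather than assuming a flat contiguous layout, writing `0.0f` through `data` directly, so it applies to F32 tensors whose data is allocated. A minimal usage sketch follows; the context setup, shape, and `main` scaffolding are illustrative assumptions, not part of this PR:

// Sketch only: zero a freshly created F32 tensor with the new helper.
// Assumes linking against common/train.cpp and ggml from llama.cpp.
#include "ggml.h"
#include "train.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // illustrative shape: a rank-4 loraB factor for a 4096-wide weight
    struct ggml_tensor * lora_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4096);

    zero_tensor(lora_b); // every element of lora_b is now 0.0f

    ggml_free(ctx);
    return 0;
}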
common/train.h (1 addition, 0 deletions)

@@ -127,6 +127,7 @@ struct random_uniform_distribution * init_random_uniform_distribution(int seed,
 void free_random_normal_distribution (struct random_normal_distribution * rnd);
 void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
 
+struct ggml_tensor * zero_tensor             (struct ggml_tensor * tensor);
 struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
examples/finetune/finetune.cpp (12 additions, 12 deletions)

@@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
     struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
 
     randomize_tensor_normal(lora->tok_embeddings_a, rnd);
-    randomize_tensor_normal(lora->tok_embeddings_b, rnd);
+    zero_tensor(lora->tok_embeddings_b);
     randomize_tensor_normal(lora->norm_a, rnd);
-    randomize_tensor_normal(lora->norm_b, rnd);
+    zero_tensor(lora->norm_b);
     randomize_tensor_normal(lora->output_a, rnd);
-    randomize_tensor_normal(lora->output_b, rnd);
+    zero_tensor(lora->output_b);
 
     for (uint32_t i = 0; i < n_layer; ++i) {
         auto & layer = lora->layers[i];
         randomize_tensor_normal(layer.attention_norm_a, rnd);
-        randomize_tensor_normal(layer.attention_norm_b, rnd);
+        zero_tensor(layer.attention_norm_b);
 
         randomize_tensor_normal(layer.wq_a, rnd);
-        randomize_tensor_normal(layer.wq_b, rnd);
+        zero_tensor(layer.wq_b);
         randomize_tensor_normal(layer.wk_a, rnd);
-        randomize_tensor_normal(layer.wk_b, rnd);
+        zero_tensor(layer.wk_b);
         randomize_tensor_normal(layer.wv_a, rnd);
-        randomize_tensor_normal(layer.wv_b, rnd);
+        zero_tensor(layer.wv_b);
         randomize_tensor_normal(layer.wo_a, rnd);
-        randomize_tensor_normal(layer.wo_b, rnd);
+        zero_tensor(layer.wo_b);
 
         randomize_tensor_normal(layer.ffn_norm_a, rnd);
-        randomize_tensor_normal(layer.ffn_norm_b, rnd);
+        zero_tensor(layer.ffn_norm_b);
 
         randomize_tensor_normal(layer.w1_a, rnd);
-        randomize_tensor_normal(layer.w1_b, rnd);
+        zero_tensor(layer.w1_b);
         randomize_tensor_normal(layer.w2_a, rnd);
-        randomize_tensor_normal(layer.w2_b, rnd);
+        zero_tensor(layer.w2_b);
         randomize_tensor_normal(layer.w3_a, rnd);
-        randomize_tensor_normal(layer.w3_b, rnd);
+        zero_tensor(layer.w3_b);
     }
 
     free_random_normal_distribution(rnd);
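Why zero only the `_b` factors: each adapter pair enters the model as W' = W + B·A (times LoRA's usual alpha/r scaling; the exact scaling used by finetune.cpp is outside this diff). The standard LoRA initialization (Hu et al., 2021) draws A from a Gaussian and sets B = 0, so the initial weight delta is exactly zero and finetuning starts from the unmodified base model, whereas randomizing both factors perturbs the base weights before any training step. Zeroing only B does not stall learning: the gradient with respect to B involves the random, nonzero A, so B moves off zero on the first update. As a sketch in LaTeX notation:

% LoRA initialization: B = 0 makes the adapter a no-op at step 0
W' = W + \frac{\alpha}{r}\, B A,
\qquad A_{ij} \sim \mathcal{N}(0, \sigma^2),
\qquad B = 0
\quad\Longrightarrow\quad
\left.\Delta W\right|_{t=0} = \frac{\alpha}{r}\, B A = 0 .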