Skip to content

Commit 63fd76f

Browse files
Reduce model loading time (#43)
* Use buffering
* Use vector
* Minor

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 2a20f48 commit 63fd76f

File tree

1 file changed

+4
-0
lines changed

1 file changed

+4
-0
lines changed

main.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,10 @@ struct llama_model {
8787
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
8888
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
8989

90+
std::vector<char> f_buf(1024*1024);
91+
9092
auto fin = std::ifstream(fname, std::ios::binary);
93+
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
9194
if (!fin) {
9295
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
9396
return false;
@@ -325,6 +328,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
325328
printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
326329

327330
fin = std::ifstream(fname_part, std::ios::binary);
331+
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
328332
fin.seekg(file_offset);
329333

330334
// load weights

0 commit comments

Comments
 (0)