
Commit 72542ff

convert : automatically fall back to HfVocab if tokenizer.model doesn't exist (ggml-org#5821)

cebtenzzre authored and hodlen committed on Apr 1, 2024

1 parent 5f9c520 commit 72542ff
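In short: the vocab type is no longer a single value but a comma-separated priority list (`spm,hfft` by default), and the converter picks the first type whose tokenizer file actually exists on disk. Below is a minimal standalone sketch of that selection logic — an illustrative re-implementation, not the committed code itself, which lives in `VocabFactory` in the convert.py diff further down:

```python
from pathlib import Path

# Mapping from vocab type to the file that identifies it (taken from the diff below).
FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}

def select_vocab(model_dir: Path, vocab_types: list[str]) -> tuple[str, Path]:
    """Return the first (vocab_type, file_path) whose marker file exists in model_dir."""
    for vtype in vocab_types:
        if vtype not in FILES:
            raise ValueError(f"Unsupported vocabulary type {vtype}")
        path = model_dir / FILES[vtype]
        if path.exists():
            return vtype, path
    raise FileNotFoundError(f"Could not find any of {[FILES[vt] for vt in vocab_types]}")

# With the new default "spm,hfft": if tokenizer.model is missing but tokenizer.json
# exists, selection falls back to the HF fast tokenizer. ("models/7B" is a
# hypothetical directory; uncomment to try against a real one.)
# print(select_vocab(Path("models/7B"), "spm,hfft".split(",")))
```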

File tree: 4 files changed, +43 −47 lines

README.md (+2 −2)

````diff
@@ -786,7 +786,7 @@ And after 4.45 hours, you will have the final perplexity.
 ### Interactive mode
 
 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
 
 Here is an example of a few-shot interaction, invoked with the command
 
@@ -850,7 +850,7 @@ Sample run:
 ```
 == Running in interactive mode. ==
  - Press Ctrl+C to interject at any time.
- - Press Return to return control to LLaMa.
+ - Press Return to return control to LLaMA.
  - If you want to submit another line, end your input in '\'.
 
 Below is an instruction that describes a task. Write a response that appropriately completes the request.
````

convert-llama-ggml-to-gguf.py (+3 −3)

```diff
@@ -373,7 +373,7 @@ def handle_metadata(cfg, hp):
         raise ValueError('Unable to load metadata')
     vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
     vocab_factory = convert.VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
     convert.check_vocab_size(params, vocab)
     return params, vocab, special_vocab
@@ -398,8 +398,8 @@ def handle_args():
                         help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
     parser.add_argument("--vocab-dir", type=Path,
                         help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
-                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
+    parser.add_argument("--vocabtype", default="spm,hfft",
+                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
     return parser.parse_args()
 
 
```
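The GGML-to-GGUF script follows the same pattern: `--vocabtype` is now a free-form comma-separated list that gets split before being handed to `VocabFactory.load_vocab`. One consequence worth noting: since `choices=["spm", "bpe"]` was dropped, argparse no longer validates the value up front. A tiny illustration of how the flag parses (hypothetical snippet, not from the diff):

```python
import argparse

parser = argparse.ArgumentParser()
# Mirrors the new handle_args() definition; an unknown type is only rejected
# later, when VocabFactory._select_file() raises ValueError.
parser.add_argument("--vocabtype", default="spm,hfft")

args = parser.parse_args([])        # no flag given -> default applies
print(args.vocabtype.split(","))    # ['spm', 'hfft'], tried in this order
```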

convert.py (+36 −40)

```diff
@@ -1282,35 +1282,32 @@ def load_some_model(path: Path) -> ModelPlus:
 
 
 class VocabFactory:
+    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
+
     def __init__(self, path: Path):
         self.path = path
-        self.files: dict[str, Path | None] = {
-            "tokenizer.model": None,
-            "vocab.json": None,
-            "tokenizer.json": None,
-        }
-        self._detect_files()
-
-    def _detect_files(self):
-        for file in self.files.keys():
-            file_path = self.path / file
-            parent_file_path = self.path.parent / file
-            if file_path.exists():
-                self.files[file] = file_path
-            elif parent_file_path.exists():
-                self.files[file] = parent_file_path
-        print(f"Found vocab files: {self.files}")
-
-    def _select_file(self, vocabtype: str | None) -> Path:
-        if vocabtype in ["spm", "bpe"]:
-            for file_key in self.files.keys():
-                if (file := self.files[file_key]) is not None:
-                    return file
-            raise FileNotFoundError(f"{vocabtype} vocab not found.")
-        if vocabtype == "hfft":
-            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
-            return self.path
-        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+        self.file_paths = self._detect_files()
+        print(f"Found vocab files: {self.file_paths}")
+
+    def _detect_files(self) -> dict[str, Path | None]:
+        def locate(file: str) -> Path | None:
+            if (path := self.path / file).exists():
+                return path
+            if (path := self.path.parent / file).exists():
+                return path
+            return None
+
+        return {vt: locate(f) for vt, f in self._FILES.items()}
+
+    def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
+        for vtype in vocab_types:
+            try:
+                path = self.file_paths[vtype]
+            except KeyError:
+                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
+            if path is not None:
+                return vtype, path
+        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
 
     def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
         load_merges = vocabtype == "bpe"
@@ -1322,30 +1319,30 @@ def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path:
             n_vocab=n_vocab,
         )
 
-    def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
-        path = self._select_file(vocabtype)
-        print(f"Loading vocab file '{path}', type '{vocabtype}'")
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab_type, path = self._select_file(vocab_types)
+        print(f"Loading vocab file {path!r}, type {vocab_type!r}")
 
         added_tokens_path = path.parent / "added_tokens.json"
         vocab: Vocab
-        if vocabtype == "bpe":
+        if vocab_type == "bpe":
             vocab = BpeVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocabtype == "spm":
+        elif vocab_type == "spm":
             vocab = SentencePieceVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocabtype == "hfft":
+        elif vocab_type == "hfft":
             vocab = HfVocab(
-                path, added_tokens_path if added_tokens_path.exists() else None
+                path.parent, added_tokens_path if added_tokens_path.exists() else None
             )
         else:
-            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+            raise ValueError(vocab_type)
         # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
-            vocabtype,
+            vocab_type,
             model_parent_path,
         )
         return vocab, special_vocab
@@ -1379,15 +1376,14 @@ def main(args_in: list[str] | None = None) -> None:
     if np.uint32(1) == np.uint32(1).newbyteorder("<"):
         # We currently only support Q8_0 output on little endian systems.
         output_choices.append("q8_0")
-    vocab_types = ["spm", "bpe", "hfft"]
-    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
     parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
+    parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
@@ -1448,7 +1444,7 @@ def main(args_in: list[str] | None = None) -> None:
     model_parent_path = model_plus.paths[0].parent
     vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
     vocab_factory = VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path)
+    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
 
     if args.vocab_only:
         if not args.outfile:
```
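Putting the pieces together, the fallback can be exercised end to end through `VocabFactory`. A hedged usage sketch: it assumes you run it from a llama.cpp checkout at this commit, that `transformers` is installed (needed by `HfVocab`), and that `models/phi-2` is a hypothetical directory containing `tokenizer.json` but no `tokenizer.model`:

```python
from pathlib import Path

import convert  # llama.cpp's convert.py, which defines VocabFactory and HfVocab

model_dir = Path("models/phi-2")  # hypothetical model directory
factory = convert.VocabFactory(model_dir)
# _detect_files() maps each vocab type to a located file (or None), e.g.
# {"spm": None, "bpe": None, "hfft": model_dir / "tokenizer.json"}

# "spm" resolves to None, so _select_file() falls through to
# ("hfft", model_dir / "tokenizer.json"); HfVocab is then constructed from
# path.parent, i.e. the model directory itself.
vocab, special_vocab = factory.load_vocab("spm,hfft".split(","), model_dir)
print(type(vocab).__name__)  # HfVocab
```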

examples/infill/infill.cpp (+2 −2)

```diff
@@ -378,10 +378,10 @@ int main(int argc, char ** argv) {
     if (params.interactive) {
         const char *control_message;
         if (params.multiline_input) {
-            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+            control_message = " - To return control to LLaMA, end your input with '\\'.\n"
                               " - To return control without starting a new line, end your input with '/'.\n";
         } else {
-            control_message = " - Press Return to return control to LLaMa.\n"
+            control_message = " - Press Return to return control to LLaMA.\n"
                               " - To return control without starting a new line, end your input with '/'.\n"
                               " - If you want to submit another line, end your input with '\\'.\n";
         }
```
