prompt.py
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle


def get_option_prompt(candidates, version="default"):
    # Format candidate answers as lettered options, e.g. "(A) candidate".
    option_prompt = ""
    options = []
    for idx, candidate in enumerate(candidates):
        choice = chr(ord("A") + idx)
        if version == "v4":
            option_prompt += f"({choice}) {candidate}\n"
        else:
            option_prompt += f"({choice}):{candidate} "
        options.append(choice)
    options = "(" + ",".join(options) + ")"
    return option_prompt, options
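
# Illustrative example (hypothetical input, not from the original file): for
# candidates=["cat", "dog"], version="v4" returns option_prompt "(A) cat\n(B) dog\n"
# and options "(A,B)"; the default version returns "(A):cat (B):dog ".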


def get_multiple_choice_prompt(model, conv_mode, question, candidates):
    # Build a multiple-choice prompt that asks the model to complete "Best option:(".
    if conv_mode == "multiple_choice_allvideo_v4":
        # v4: the visual input is a sequence of key frames.
        prompt = "You are a helpful expert in video analysis. Select the best option to answer the question. USER: <image>\nThe input consists of a sequence of key frames from a video.\nQuestion: %s\nOptions:\n%sOnly give the best option. \nASSISTANT:\nAnswer: Best option:("
        option_prompt, options = get_option_prompt(candidates, version="v4")
        prompt = prompt % (question, option_prompt)
    elif conv_mode == "multiple_choice_allvideo_34b_v4":
        # 34b variants use the <|im_start|>/<|im_end|> chat format.
        prompt = "<|im_start|>system\n You are a helpful expert in video analysis. Select the best option to answer the question. <|im_end|>\n<|im_start|>user\n <image>\nThe input consists of a sequence of key frames from a video. Question: %s\nOptions:\n%sOnly give the best option. <|im_end|>\n<|im_start|>assistant\nAnswer: Best option:("
        option_prompt, options = get_option_prompt(candidates, version="v4")
        prompt = prompt % (question, option_prompt)
    elif conv_mode == "multiple_choice_allvideo_v5":
        # v5: the visual input is a grid-view image plus the sequence of key frames.
        prompt = "You are a helpful expert in video analysis. Select the best option to answer the question. USER: <image>\nThe input consists of an image arranging key frames from a video in a grid view, along with a sequence of key frames from the video.\nQuestion: %s\nOptions:\n%sOnly give the best option. \nASSISTANT:\nAnswer: Best option:("
        option_prompt, options = get_option_prompt(candidates, version="v4")
        prompt = prompt % (question, option_prompt)
    elif conv_mode == "multiple_choice_allvideo_34b_v5":
        prompt = "<|im_start|>system\n You are a helpful expert in video analysis. Select the best option to answer the question. <|im_end|>\n<|im_start|>user\n <image>\nThe input consists of an image arranging key frames from a video in a grid view, along with a sequence of key frames from the video. Question: %s\nOptions:\n%sOnly give the best option. <|im_end|>\n<|im_start|>assistant\nAnswer: Best option:("
        option_prompt, options = get_option_prompt(candidates, version="v4")
        prompt = prompt % (question, option_prompt)
    else:
        raise ValueError(f"Unknown conv_mode: {conv_mode}")
    return prompt


def get_prompt(model, conv_mode, question):
    # Build an open-ended video QA prompt; unrecognized conv_modes fall back to
    # llava's conversation templates.
    if conv_mode == "image_seq_v3":
        prompt = "USER: <image>\nThe input consists of a sequence of key frames from a video. Answer concisely with overall content and context of the video, highlighting any significant events, characters, or objects that appear throughout the video. Question: %s \nASSISTANT:\nAnswer: In the video,"
        prompt = prompt % question
    elif conv_mode == "image_seq_34b_v3":
        prompt = "<|im_start|>system\n Answer the question. <|im_end|>\n<|im_start|>user\n <image>\nThe input consists of a sequence of key frames from a video. Answer concisely with overall content and context of the video, highlighting any significant events, characters, or objects that appear throughout the video. Question: %s <|im_end|>\n<|im_start|>assistant\nAnswer: In the video,"
        prompt = prompt % question
    elif conv_mode == "image_seq_v2":
        prompt = "You are a helpful expert in video analysis. USER: <image>\nThe input consists of a sequence of key frames from a video. Answer concisely with overall content and context of the video, highlighting any significant events, characters, or objects that appear throughout the video. Question: %s \nASSISTANT:\nAnswer: In the video,"
        prompt = prompt % question
    elif conv_mode == "image_seq_34b_v2":
        prompt = "<|im_start|>system\n You are a helpful expert in video analysis. Answer the question. <|im_end|>\n<|im_start|>user\n <image>\nThe input consists of a sequence of key frames from a video. Answer concisely with overall content and context of the video, highlighting any significant events, characters, or objects that appear throughout the video. Question: %s <|im_end|>\n<|im_start|>assistant\nAnswer: In the video,"
        prompt = prompt % question
    else:
        # Fall back to the conversation templates shipped with llava.
        if model.config.mm_use_im_start_end:
            ques = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + question
        else:
            ques = DEFAULT_IMAGE_TOKEN + "\n" + question
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], ques)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
    return prompt
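
# Note on the fallback branch above: any conv_mode not handled explicitly (e.g. a
# template key such as "llava_v1", named here only as an illustration) is resolved
# through llava's conv_templates and requires a loaded model whose config exposes
# mm_use_im_start_end.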


def get_option_prompt_onlychoice(candidates, version="default"):
    option_prompt = ""
    options = []
    for idx, candidate in enumerate(candidates):
        choice = chr(ord("A") + idx)
        if version == "v4":
            option_prompt += f"{choice}. {candidate}\n"
        else:
            option_prompt += f"{choice}:{candidate} "
        options.append(choice)
    options = "(" + ",".join(options) + ")"
    return option_prompt, options


def get_multiple_choice_prompt_onlychoice(model, conv_mode, question, candidates):
    # Ask the model to answer with a single option letter only, e.g.:
    #   A chat between a curious user and an artificial intelligence assistant. The assistant
    #   gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n
    #   Select the best answer to the following multiple-choice question based on the video.
    #   Respond with only the letter (A,B,C,D,E) of the correct option. \nQuestion: how do the
    #   two man play the instrument\nOptions:\nA. roll the handle\nB. tap their feet\nC. strum
    #   the string\nD. hit with sticks\nE. pat with hand\nOnly give the best option. \nAnswer
    #   with the option's letter from the given choices directly. ASSISTANT:
    if conv_mode == "multiple_choice_allvideo_v4":
        # vicuna-v1 system message: "A chat between a curious user and an artificial
        # intelligence assistant. The assistant gives helpful, detailed, and polite
        # answers to the user's questions."
        option_prompt, options = get_option_prompt_onlychoice(candidates, version="v4")
        prompt = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nSelect the best answer to the following multiple-choice question based on the video. Respond with only the letter {options} of the correct option. \nQuestion: {question}\nOptions:\n{option_prompt}Only give the best option. \nAnswer with the option's letter from the given choices directly. ASSISTANT:"
    elif conv_mode == "multiple_choice_allvideo_34b_v4":
        option_prompt, options = get_option_prompt_onlychoice(candidates, version="v4")
        prompt = f"<|im_start|>system\n A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <|im_end|>\n<|im_start|>user\n <image>\nSelect the best answer to the following multiple-choice question based on the video. Respond with only the letter {options} of the correct option.\nQuestion: {question}\nOptions:\n{option_prompt}Only give the best option. <|im_end|>\n<|im_start|>assistant\nAnswer with the option's letter from the given choices directly. ASSISTANT:"
    else:
        raise ValueError(f"Unknown conv_mode: {conv_mode}")
    return prompt
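

# Minimal usage sketch (not part of the original file): it reuses the example question
# and candidate answers quoted in the comment above. The `model` argument is unused by
# this helper, so None is passed and no model weights are needed to build the prompt.
if __name__ == "__main__":
    question = "how do the two man play the instrument"
    candidates = [
        "roll the handle",
        "tap their feet",
        "strum the string",
        "hit with sticks",
        "pat with hand",
    ]
    print(get_multiple_choice_prompt_onlychoice(
        None, "multiple_choice_allvideo_v4", question, candidates
    ))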