
Commit c69f1d7

ggerganov authored and olexiyb committed
scripts : add server-llm.sh (ggml-org#3868)
* scripts : add deploy-server.sh
* scripts : rename to server-llm.sh
* scripts : working curl pipe
1 parent 8ffcd4e commit c69f1d7
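
The "working curl pipe" mentioned in the commit message is the one-liner documented in the script header below, which fetches the script and runs it in a single step:

    bash -c "$(curl -s https://ggml.ai/server-llm.sh)"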

File tree

1 file changed: +391 -0 lines changed

Diff for: scripts/server-llm.sh (+391 lines)

@@ -0,0 +1,391 @@
#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal, OpenCL
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
#
#   --port:       port number, default is 8888
#   --repo:       path to a repo containing GGUF model files
#   --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input
#   --backend:    cpu, cuda, metal, opencl, depends on the OS
#   --gpu-id:     gpu id, default is 0
#   --n-parallel: number of parallel requests, default is 8
#   --n-kv:       KV cache size, default is 4096
#   --verbose:    verbose output
#
# Example:
#
#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
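# For illustration only (not part of the committed script): a hypothetical direct
# invocation with explicit options, using one of the sample repos listed further
# down, could look like:
#
#   ./server-llm.sh --port 8888 --repo https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF --backend cpu --n-parallel 4
#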
set -e

# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
    printf "[-] curl not found\n"
    exit 1
fi
if ! command -v git &> /dev/null; then
    printf "[-] git not found\n"
    exit 1
fi
if ! command -v make &> /dev/null; then
    printf "[-] make not found\n"
    exit 1
fi

# parse arguments
port=8888
repo=""
wtype=""
backend="cpu"

# if macOS, use metal backend by default
if [[ "$OSTYPE" == "darwin"* ]]; then
    backend="metal"
elif command -v nvcc &> /dev/null; then
    backend="cuda"
fi

gpu_id=0
n_parallel=8
n_kv=4096
verbose=0

function print_usage {
    printf "Usage:\n"
    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
    printf "  --port: port number, default is 8888\n"
    printf "  --repo: path to a repo containing GGUF model files\n"
    printf "  --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
    printf "  --backend: cpu, cuda, metal, opencl, depends on the OS\n"
    printf "  --gpu-id: gpu id, default is 0\n"
    printf "  --n-parallel: number of parallel requests, default is 8\n"
    printf "  --n-kv: KV cache size, default is 4096\n"
    printf "  --verbose: verbose output\n\n"
    printf "Example:\n\n"
    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}

while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --port)
            port="$2"
            shift
            shift
            ;;
        --repo)
            repo="$2"
            shift
            shift
            ;;
        --wtype)
            wtype="$2"
            shift
            shift
            ;;
        --backend)
            backend="$2"
            shift
            shift
            ;;
        --gpu-id)
            gpu_id="$2"
            shift
            shift
            ;;
        --n-parallel)
            n_parallel="$2"
            shift
            shift
            ;;
        --n-kv)
            n_kv="$2"
            shift
            shift
            ;;
        --verbose)
            verbose=1
            shift
            ;;
        --help)
            print_usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $key"
            print_usage
            exit 1
            ;;
    esac
done

# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")

wfiles=()
for wt in "${wtypes[@]}"; do
    wfiles+=("")
done

# sample repos
repos=(
    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)

printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf "    Based on the options that follow, the script might download a model file\n"
printf "    from the internet, which can be a few GBs in size. The script will also\n"
printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
printf "    model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf "    Please note:\n"
printf "\n"
printf "    - All new data will be stored in the current folder\n"
printf "    - The server will be listening on all network interfaces\n"
printf "    - The server will run with default settings which are not always optimal\n"
printf "    - Do not judge the quality of a model based on the results from this script\n"
printf "    - Do not use this script to benchmark llama.cpp\n"
printf "    - Do not use this script in production\n"
printf "    - This script is only for demonstration purposes\n"
printf "\n"
printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf "    Press Enter to continue ...\n\n"

read

if [[ -z "$repo" ]]; then
    printf "[+] No repo provided from the command line\n"
    printf "    Please select a number from the list below or enter a URL:\n\n"

    is=0
    for r in "${repos[@]}"; do
        printf "    %2d) %s\n" $is "$r"
        is=$((is+1))
    done

    # ask for repo until an index of a sample repo or a URL is provided
    while [[ -z "$repo" ]]; do
        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
        read -p "[+] Select repo: " repo

        # check if the input is a number
        if [[ "$repo" =~ ^[0-9]+$ ]]; then
            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
                repo="${repos[$repo]}"
            else
                printf "[-] Invalid repo index: %s\n" "$repo"
                repo=""
            fi
        elif [[ "$repo" =~ ^https?:// ]]; then
            repo="$repo"
        else
            printf "[-] Invalid repo URL: %s\n" "$repo"
            repo=""
        fi
    done
fi

# remove suffix
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')

printf "[+] Checking for GGUF model files in %s\n" "$repo"

# find GGUF files in the source
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
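# Note: the HTML scraping above is brittle (hence the TODO). A hypothetical
# alternative sketch, assuming the public Hugging Face Hub API and its
# "rfilename" field (untested, not part of the committed script):
#
#   curl -s "https://huggingface.co/api/models/${repo#https://huggingface.co/}" | grep -oE '"rfilename":"[^"]*\.gguf"'
#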
# list all files in the provided git repo
printf "[+] Model files:\n\n"
for file in $model_files; do
    # determine iw by grepping the filename with wtypes
    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        # uppercase
        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
        if [[ "$ufile" =~ "$wt" ]]; then
            iw=$is
            break
        fi
        is=$((is+1))
    done

    if [[ $iw -eq -1 ]]; then
        continue
    fi

    wfiles[$iw]="$file"

    have=" "
    if [[ -f "$file" ]]; then
        have="*"
    fi

    printf "    %2d) %s %s\n" $iw "$have" "$file"
done

# ask for weights type until provided and available
while [[ -z "$wtype" ]]; do
    printf "\n"
    read -p "[+] Select weight type: " wtype
    wfile="${wfiles[$wtype]}"

    if [[ -z "$wfile" ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        wtype=""
    fi
done

printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"

url="${repo%/}/resolve/main/$wfile"

# check file to see if the model has been downloaded before
chk="$wfile.chk"

# check if we should download the file
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info

do_download=0

if [[ ! -f "$wfile" ]]; then
    do_download=1
elif [[ ! -f "$chk" ]]; then
    do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
    do_download=1
fi

if [[ $do_download -eq 1 ]]; then
    printf "[+] Downloading weights from %s\n" "$url"

    # download the weights file
    curl -o "$wfile" -# -L "$url"

    # create a check file if successful
    if [[ $? -eq 0 ]]; then
        printf "[+] Creating check file %s\n" "$chk"
        touch "$chk"
    fi
else
    printf "[+] Using cached weights %s\n" "$wfile"
fi
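# Note (not part of the committed script): the weight files can be several GBs and
# an interrupted transfer currently restarts from zero; curl's "-C -" option could
# be added to the command above to resume partial downloads.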

# get latest llama.cpp and build

printf "[+] Downloading latest llama.cpp\n"

llama_cpp_dir="__llama_cpp_port_${port}__"

if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[-] Please remove it and try again\n"
    exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[+] Using cached llama.cpp\n"

    cd "$llama_cpp_dir"
    git reset --hard
    git fetch
    git checkout origin/master

    cd ..
else
    printf "[+] Cloning llama.cpp\n"

    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi

# mark that the directory was made by this script
touch "$llama_cpp_dir/__ggml_script__"

if [[ $verbose -eq 1 ]]; then
    set -x
fi

# build
cd "$llama_cpp_dir"

make clean

log="--silent"
if [[ $verbose -eq 1 ]]; then
    log=""
fi

if [[ "$backend" == "cuda" ]]; then
    printf "[+] Building with CUDA backend\n"
    LLAMA_CUBLAS=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
    printf "[+] Building with CPU backend\n"
    make -j server $log
elif [[ "$backend" == "metal" ]]; then
    printf "[+] Building with Metal backend\n"
    make -j server $log
elif [[ "$backend" == "opencl" ]]; then
    printf "[+] Building with OpenCL backend\n"
    LLAMA_CLBLAST=1 make -j server $log
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

# run the server

printf "[+] Running server\n"

args=""
if [[ "$backend" == "cuda" ]]; then
    export CUDA_VISIBLE_DEVICES=$gpu_id
    args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
    args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
    args="-ngl 999"
elif [[ "$backend" == "opencl" ]]; then
    args="-ngl 999"
else
    printf "[-] Unknown backend: %s\n" "$backend"
    exit 1
fi

if [[ $verbose -eq 1 ]]; then
    args="$args --verbose"
fi

./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args

exit 0
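
Once the script finishes, the server listens on all interfaces at the selected port (8888 by default). A minimal smoke test might look like the following sketch; the /completion endpoint and its JSON fields follow llama.cpp's server example and are not part of this commit:

    curl -s http://127.0.0.1:8888/completion \
        -H "Content-Type: application/json" \
        -d '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 32}'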
