services:
  check:
    image: ittia/check:remote
    container_name: check
    volumes:
      - /data/cache:/data/cache
    env_file:
      - ./infra/env.d/check
    ports:
      - 8000:8000
    # Remove the GPU section below if not running inference locally
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: always
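  # Quick sanity check once the service is up (a sketch; the exact API
  # paths of ittia/check are an assumption, adjust to the image's docs):
  #   curl http://localhost:8000/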
  # Uses vLLM for LLM inference on Google Cloud TPU.
  # Tested on a TPU v4-8 slice.
  vllm:
    image: ittia/vllm:0.6.0-tpu
    privileged: true
    ports:
      - "8010:8010"
    shm_size: 128G
    volumes:
      - /mnt/cache:/root/.cache
    env_file:
      - ./env.d/huggingface
    command: vllm serve mistralai/Mistral-Nemo-Instruct-2407 --tensor-parallel-size 4 --port 8010 --trust-remote-code --max-model-len 12288
    restart: always
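  # vllm serve exposes an OpenAI-compatible API, so the deployment can be
  # smoke-tested like this (a sketch; prompt and max_tokens are placeholders):
  #   curl http://localhost:8010/v1/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "mistralai/Mistral-Nemo-Instruct-2407",
  #          "prompt": "Hello", "max_tokens": 16}'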
  # Infinity serves embedding and rerank models;
  # its v2 engine supports serving multiple models at once.
  infinity:
    image: michaelf34/infinity:latest
    container_name: infinity
    ports:
      - 7997:7997
    volumes:
      - /data/cache/huggingface:/cache/huggingface
    env_file:
      - ./infra/env.d/infinity
      - ./infra/env.d/huggingface
    command: ["v2"]
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: always
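  # Example embedding request against Infinity's /embeddings endpoint
  # (a sketch; the served model id is set via ./infra/env.d/infinity
  # and shown here as a placeholder):
  #   curl http://localhost:7997/embeddings \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "<served-model-id>", "input": ["hello world"]}'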
  # The service below is not actively in use;
  # kept here for reference.
  ollama:
    image: ollama/ollama
    container_name: ollama
    ports:
      - "11434:11434"
    volumes:
      - /data/volumes/ollama:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: always
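  # If re-enabled, a model can be pulled and run through the container,
  # e.g. (model name is a placeholder):
  #   docker exec -it ollama ollama run <model-name>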