1
0

utils.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. import functools
  2. import os
  3. import signal
  4. import subprocess
  5. import sys
  6. import time
  7. import warnings
  8. from contextlib import contextmanager
  9. from pathlib import Path
  10. from typing import Any, Callable, Dict, List, Optional
  11. import openai
  12. import requests
  13. from transformers import AutoTokenizer
  14. from typing_extensions import ParamSpec
  15. from aphrodite.common.utils import (FlexibleArgumentParser, get_open_port,
  16. is_hip)
  17. from aphrodite.distributed import (ensure_model_parallel_initialized,
  18. init_distributed_environment)
  19. from aphrodite.endpoints.openai.args import make_arg_parser
  20. from aphrodite.platforms import current_platform
# Platform-specific GPU telemetry setup: `_nvml` is a context manager that
# initializes (and tears down) the vendor GPU-management library so that
# the VRAM queries in `wait_for_gpu_memory_to_clear` below can run. On
# platforms that are neither ROCm nor CUDA it is a no-op.
if current_platform.is_rocm():
    from amdsmi import (amdsmi_get_gpu_vram_usage,
                        amdsmi_get_processor_handles, amdsmi_init,
                        amdsmi_shut_down)

    @contextmanager
    def _nvml():
        # Bring up AMD SMI for the duration of the context and always
        # shut it down, even if the body raises.
        try:
            amdsmi_init()
            yield
        finally:
            amdsmi_shut_down()
elif current_platform.is_cuda():
    from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
                        nvmlInit, nvmlShutdown)

    @contextmanager
    def _nvml():
        # Bring up NVML for the duration of the context and always shut
        # it down, even if the body raises.
        try:
            nvmlInit()
            yield
        finally:
            nvmlShutdown()
else:

    @contextmanager
    def _nvml():
        # No supported GPU-management library on this platform: no-op.
        yield
# Repository root, derived from this file's location (two levels up).
APHRODITE_PATH = Path(__file__).parent.parent
"""Path to root of the Aphrodite repository."""
  48. class RemoteOpenAIServer:
  49. DUMMY_API_KEY = "token-abc123" # Aphrodite's OpenAI server does not need API key
  50. MAX_START_WAIT_S = 240 # wait for server to start for 240 seconds
  51. def __init__(
  52. self,
  53. model: str,
  54. cli_args: List[str],
  55. *,
  56. env_dict: Optional[Dict[str, str]] = None,
  57. auto_port: bool = True,
  58. ) -> None:
  59. if auto_port:
  60. if "-p" in cli_args or "--port" in cli_args:
  61. raise ValueError("You have manually specified the port"
  62. "when `auto_port=True`.")
  63. cli_args = cli_args + ["--port", str(get_open_port())]
  64. parser = FlexibleArgumentParser(
  65. description="Aphrodite's remote OpenAI server.")
  66. parser = make_arg_parser(parser)
  67. args = parser.parse_args(cli_args)
  68. self.host = str(args.host or 'localhost')
  69. self.port = int(args.port)
  70. env = os.environ.copy()
  71. # the current process might initialize cuda,
  72. # to be safe, we should use spawn method
  73. env['APHRODITE_WORKER_MULTIPROC_METHOD'] = 'spawn'
  74. if env_dict is not None:
  75. env.update(env_dict)
  76. self.proc = subprocess.Popen(["aphrodite", "run"] + [model] + cli_args,
  77. env=env,
  78. stdout=sys.stdout,
  79. stderr=sys.stderr)
  80. self._wait_for_server(url=self.url_for("health"),
  81. timeout=self.MAX_START_WAIT_S)
  82. def __enter__(self):
  83. return self
  84. def __exit__(self, exc_type, exc_value, traceback):
  85. self.proc.terminate()
  86. try:
  87. self.proc.wait(3)
  88. except subprocess.TimeoutExpired:
  89. # force kill if needed
  90. self.proc.kill()
  91. def _wait_for_server(self, *, url: str, timeout: float):
  92. # run health check
  93. start = time.time()
  94. while True:
  95. try:
  96. if requests.get(url).status_code == 200:
  97. break
  98. except Exception as err:
  99. result = self.proc.poll()
  100. if result is not None and result != 0:
  101. raise RuntimeError("Server exited unexpectedly.") from err
  102. time.sleep(0.5)
  103. if time.time() - start > timeout:
  104. raise RuntimeError(
  105. "Server failed to start in time.") from err
  106. @property
  107. def url_root(self) -> str:
  108. return f"http://{self.host}:{self.port}"
  109. def url_for(self, *parts: str) -> str:
  110. return self.url_root + "/" + "/".join(parts)
  111. def get_client(self):
  112. return openai.OpenAI(
  113. base_url=self.url_for("v1"),
  114. api_key=self.DUMMY_API_KEY,
  115. )
  116. def get_async_client(self):
  117. return openai.AsyncOpenAI(
  118. base_url=self.url_for("v1"),
  119. api_key=self.DUMMY_API_KEY,
  120. )
  121. def compare_two_settings(model: str,
  122. arg1: List[str],
  123. arg2: List[str],
  124. env1: Optional[Dict[str, str]] = None,
  125. env2: Optional[Dict[str, str]] = None):
  126. """
  127. Launch API server with two different sets of arguments/environments
  128. and compare the results of the API calls.
  129. Args:
  130. model: The model to test.
  131. arg1: The first set of arguments to pass to the API server.
  132. arg2: The second set of arguments to pass to the API server.
  133. env1: The first set of environment variables to pass to the API server.
  134. env2: The second set of environment variables to pass to the API server.
  135. """
  136. tokenizer = AutoTokenizer.from_pretrained(model)
  137. prompt = "Hello, my name is"
  138. token_ids = tokenizer(prompt)["input_ids"]
  139. results = []
  140. for args, env in ((arg1, env1), (arg2, env2)):
  141. with RemoteOpenAIServer(model, args, env_dict=env) as server:
  142. client = server.get_client()
  143. # test models list
  144. models = client.models.list()
  145. models = models.data
  146. served_model = models[0]
  147. results.append({
  148. "test": "models_list",
  149. "id": served_model.id,
  150. "root": served_model.root,
  151. })
  152. # test with text prompt
  153. completion = client.completions.create(model=model,
  154. prompt=prompt,
  155. max_tokens=5,
  156. temperature=0.0)
  157. results.append({
  158. "test": "single_completion",
  159. "text": completion.choices[0].text,
  160. "finish_reason": completion.choices[0].finish_reason,
  161. "usage": completion.usage,
  162. })
  163. # test using token IDs
  164. completion = client.completions.create(
  165. model=model,
  166. prompt=token_ids,
  167. max_tokens=5,
  168. temperature=0.0,
  169. )
  170. results.append({
  171. "test": "token_ids",
  172. "text": completion.choices[0].text,
  173. "finish_reason": completion.choices[0].finish_reason,
  174. "usage": completion.usage,
  175. })
  176. # test seeded random sampling
  177. completion = client.completions.create(model=model,
  178. prompt=prompt,
  179. max_tokens=5,
  180. seed=33,
  181. temperature=1.0)
  182. results.append({
  183. "test": "seeded_sampling",
  184. "text": completion.choices[0].text,
  185. "finish_reason": completion.choices[0].finish_reason,
  186. "usage": completion.usage,
  187. })
  188. # test seeded random sampling with multiple prompts
  189. completion = client.completions.create(model=model,
  190. prompt=[prompt, prompt],
  191. max_tokens=5,
  192. seed=33,
  193. temperature=1.0)
  194. results.append({
  195. "test":
  196. "seeded_sampling",
  197. "text": [choice.text for choice in completion.choices],
  198. "finish_reason":
  199. [choice.finish_reason for choice in completion.choices],
  200. "usage":
  201. completion.usage,
  202. })
  203. # test simple list
  204. batch = client.completions.create(
  205. model=model,
  206. prompt=[prompt, prompt],
  207. max_tokens=5,
  208. temperature=0.0,
  209. )
  210. results.append({
  211. "test": "simple_list",
  212. "text0": batch.choices[0].text,
  213. "text1": batch.choices[1].text,
  214. })
  215. # test streaming
  216. batch = client.completions.create(
  217. model=model,
  218. prompt=[prompt, prompt],
  219. max_tokens=5,
  220. temperature=0.0,
  221. stream=True,
  222. )
  223. texts = [""] * 2
  224. for chunk in batch:
  225. assert len(chunk.choices) == 1
  226. choice = chunk.choices[0]
  227. texts[choice.index] += choice.text
  228. results.append({
  229. "test": "streaming",
  230. "texts": texts,
  231. })
  232. n = len(results) // 2
  233. arg1_results = results[:n]
  234. arg2_results = results[n:]
  235. for arg1_result, arg2_result in zip(arg1_results, arg2_results):
  236. assert arg1_result == arg2_result, (
  237. f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
  238. f"{arg1_result=} != {arg2_result=}")
  239. def init_test_distributed_environment(
  240. tp_size: int,
  241. pp_size: int,
  242. rank: int,
  243. distributed_init_port: str,
  244. local_rank: int = -1,
  245. ) -> None:
  246. distributed_init_method = f"tcp://localhost:{distributed_init_port}"
  247. init_distributed_environment(
  248. world_size=pp_size * tp_size,
  249. rank=rank,
  250. distributed_init_method=distributed_init_method,
  251. local_rank=local_rank)
  252. ensure_model_parallel_initialized(tp_size, pp_size)
  253. def multi_process_parallel(
  254. tp_size: int,
  255. pp_size: int,
  256. test_target: Any,
  257. ) -> None:
  258. import ray
  259. # Using ray helps debugging the error when it failed
  260. # as compared to multiprocessing.
  261. # NOTE: We need to set working_dir for distributed tests,
  262. # otherwise we may get import errors on ray workers
  263. ray.init(runtime_env={"working_dir": APHRODITE_PATH})
  264. distributed_init_port = get_open_port()
  265. refs = []
  266. for rank in range(tp_size * pp_size):
  267. refs.append(
  268. test_target.remote(tp_size, pp_size, rank, distributed_init_port))
  269. ray.get(refs)
  270. ray.shutdown()
  271. @contextmanager
  272. def error_on_warning():
  273. """
  274. Within the scope of this context manager, tests will fail if any warning
  275. is emitted.
  276. """
  277. with warnings.catch_warnings():
  278. warnings.simplefilter("error")
  279. yield
@_nvml()
def wait_for_gpu_memory_to_clear(devices: List[int],
                                 threshold_bytes: int,
                                 timeout_s: float = 120) -> None:
    """Block until VRAM used on every device in `devices` is at most
    `threshold_bytes`, polling every 5 seconds.

    Raises:
        ValueError: if the memory has not cleared within `timeout_s`.
    """
    # Use nvml instead of pytorch to reduce measurement error from torch cuda
    # context.
    start_time = time.time()
    while True:
        output: Dict[int, str] = {}        # device -> formatted GiB string
        output_raw: Dict[int, float] = {}  # device -> GiB as float
        for device in devices:
            if is_hip():
                dev_handle = amdsmi_get_processor_handles()[device]
                mem_info = amdsmi_get_gpu_vram_usage(dev_handle)
                # NOTE(review): assumes amdsmi reports "vram_used" in MiB
                # so that /2**10 yields GiB — confirm against amdsmi docs.
                gb_used = mem_info["vram_used"] / 2**10
            else:
                dev_handle = nvmlDeviceGetHandleByIndex(device)
                mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
                # NVML reports bytes; convert to GiB.
                gb_used = mem_info.used / 2**30
            output_raw[device] = gb_used
            output[device] = f'{gb_used:.02f}'

        print('gpu memory used (GB): ', end='')
        for k, v in output.items():
            print(f'{k}={v}; ', end='')
        print('')

        dur_s = time.time() - start_time
        # Done once every device is at or below the threshold (in GiB).
        if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()):
            print(f'Done waiting for free GPU memory on devices {devices=} '
                  f'({threshold_bytes/2**30=}) {dur_s=:.02f}')
            break

        if dur_s >= timeout_s:
            raise ValueError(f'Memory of devices {devices=} not free after '
                             f'{dur_s=:.02f} ({threshold_bytes/2**30=})')

        time.sleep(5)
# ParamSpec so the decorator preserves the wrapped function's signature.
_P = ParamSpec("_P")


def fork_new_process_for_each_test(
        f: Callable[_P, None]) -> Callable[_P, None]:
    """Decorator to fork a new process for each test function.

    The test body runs in a forked child; the parent waits for it, then
    SIGTERMs anything left in the child's process group and asserts the
    child exited cleanly.
    """

    @functools.wraps(f)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
        # Make the process the leader of its own process group
        # to avoid sending SIGTERM to the parent process
        os.setpgrp()
        from _pytest.outcomes import Skipped
        pid = os.fork()
        print(f"Fork a new process to run a test {pid}")
        if pid == 0:
            # Child: run the test, then _exit so we never return into
            # the parent's pytest machinery.
            try:
                f(*args, **kwargs)
            except Skipped as e:
                # convert Skipped to exit code 0
                print(str(e))
                os._exit(0)
            except Exception:
                import traceback
                traceback.print_exc()
                os._exit(1)
            else:
                os._exit(0)
        else:
            # Parent: wait for the child, then clean up any grandchild
            # processes it may have left in its process group.
            pgid = os.getpgid(pid)
            # NOTE(review): waitpid returns a raw wait status, not an exit
            # code; a clean child exit yields status 0, so the `== 0`
            # check below still works.
            _pid, _exitcode = os.waitpid(pid, 0)
            # ignore SIGTERM signal itself
            old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
            # kill all child processes
            os.killpg(pgid, signal.SIGTERM)
            # restore the signal handler
            signal.signal(signal.SIGTERM, old_signal_handler)
            assert _exitcode == 0, (f"function {f} failed when called with"
                                    f" args {args} and kwargs {kwargs}")

    return wrapper