utils.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529
  1. import functools
  2. import os
  3. import signal
  4. import subprocess
  5. import sys
  6. import time
  7. import warnings
  8. from contextlib import contextmanager
  9. from pathlib import Path
  10. from typing import Any, Callable, Dict, List, Optional
  11. import openai
  12. import pytest
  13. import requests
  14. from openai.types.completion import Completion
  15. from transformers import AutoTokenizer
  16. from typing_extensions import ParamSpec
  17. from aphrodite.common.utils import (FlexibleArgumentParser,
  18. cuda_device_count_stateless, get_open_port,
  19. is_hip)
  20. from aphrodite.distributed import (ensure_model_parallel_initialized,
  21. init_distributed_environment)
  22. from aphrodite.endpoints.openai.args import make_arg_parser
  23. from aphrodite.engine.args_tools import AsyncEngineArgs
  24. from aphrodite.modeling.model_loader.loader import get_model_loader
  25. from aphrodite.platforms import current_platform
  26. from tests.models.utils import TextTextLogprobs
# Select a GPU-management context manager for the current platform.
# ``_nvml()`` brings up and tears down the vendor GPU monitoring library
# (amdsmi on ROCm, NVML via pynvml on CUDA) and is a no-op elsewhere.
# ``wait_for_gpu_memory_to_clear`` below relies on this decorator.
if current_platform.is_rocm():
    from amdsmi import (amdsmi_get_gpu_vram_usage,
                        amdsmi_get_processor_handles, amdsmi_init,
                        amdsmi_shut_down)

    @contextmanager
    def _nvml():
        """Initialize amdsmi for the duration of the context."""
        try:
            amdsmi_init()
            yield
        finally:
            amdsmi_shut_down()
elif current_platform.is_cuda():
    from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
                        nvmlInit, nvmlShutdown)

    @contextmanager
    def _nvml():
        """Initialize NVML for the duration of the context."""
        try:
            nvmlInit()
            yield
        finally:
            nvmlShutdown()
else:

    @contextmanager
    def _nvml():
        """No-op on platforms without a supported GPU monitoring library."""
        yield
# Used e.g. as the ray ``working_dir`` in ``multi_process_parallel`` below.
APHRODITE_PATH = Path(__file__).parent.parent
"""Path to root of the Aphrodite repository."""
  54. class RemoteOpenAIServer:
  55. DUMMY_API_KEY = "token-abc123"
  56. def __init__(self,
  57. model: str,
  58. aphrodite_serve_args: List[str],
  59. *,
  60. env_dict: Optional[Dict[str, str]] = None,
  61. auto_port: bool = True,
  62. max_wait_seconds: Optional[float] = None) -> None:
  63. if auto_port:
  64. if "-p" in aphrodite_serve_args or "--port" in aphrodite_serve_args:
  65. raise ValueError("You have manually specified the port "
  66. "when `auto_port=True`.")
  67. # Don't mutate the input args
  68. aphrodite_serve_args = aphrodite_serve_args + [
  69. "--port", str(get_open_port())
  70. ]
  71. parser = FlexibleArgumentParser(
  72. description="Aphrodite's remote OpenAI server.")
  73. parser = make_arg_parser(parser)
  74. args = parser.parse_args(["--model", model, *aphrodite_serve_args])
  75. self.host = str(args.host or 'localhost')
  76. self.port = int(args.port)
  77. # download the model before starting the server to avoid timeout
  78. is_local = os.path.isdir(model)
  79. if not is_local:
  80. engine_args = AsyncEngineArgs.from_cli_args(args)
  81. model_config = engine_args.create_model_config()
  82. load_config = engine_args.create_load_config()
  83. model_loader = get_model_loader(load_config)
  84. model_loader.download_model(model_config)
  85. env = os.environ.copy()
  86. # the current process might initialize cuda,
  87. # to be safe, we should use spawn method
  88. env['APHRODITE_WORKER_MULTIPROC_METHOD'] = 'spawn'
  89. if env_dict is not None:
  90. env.update(env_dict)
  91. self.proc = subprocess.Popen(
  92. ["aphrodite", "serve", model, *aphrodite_serve_args],
  93. env=env,
  94. stdout=sys.stdout,
  95. stderr=sys.stderr,
  96. )
  97. max_wait_seconds = max_wait_seconds or 240
  98. self._wait_for_server(url=self.url_for("health"),
  99. timeout=max_wait_seconds)
  100. def __enter__(self):
  101. return self
  102. def __exit__(self, exc_type, exc_value, traceback):
  103. self.proc.terminate()
  104. try:
  105. self.proc.wait(8)
  106. except subprocess.TimeoutExpired:
  107. # force kill if needed
  108. self.proc.kill()
  109. def _wait_for_server(self, *, url: str, timeout: float):
  110. # run health check
  111. start = time.time()
  112. while True:
  113. try:
  114. if requests.get(url).status_code == 200:
  115. break
  116. except Exception as err:
  117. result = self.proc.poll()
  118. if result is not None and result != 0:
  119. raise RuntimeError("Server exited unexpectedly.") from err
  120. time.sleep(0.5)
  121. if time.time() - start > timeout:
  122. raise RuntimeError(
  123. "Server failed to start in time.") from err
  124. @property
  125. def url_root(self) -> str:
  126. return f"http://{self.host}:{self.port}"
  127. def url_for(self, *parts: str) -> str:
  128. return self.url_root + "/" + "/".join(parts)
  129. def get_client(self):
  130. return openai.OpenAI(
  131. base_url=self.url_for("v1"),
  132. api_key=self.DUMMY_API_KEY,
  133. )
  134. def get_async_client(self):
  135. return openai.AsyncOpenAI(
  136. base_url=self.url_for("v1"),
  137. api_key=self.DUMMY_API_KEY,
  138. max_retries=0,
  139. )
def compare_two_settings(model: str,
                         arg1: List[str],
                         arg2: List[str],
                         env1: Optional[Dict[str, str]] = None,
                         env2: Optional[Dict[str, str]] = None,
                         max_wait_seconds: Optional[float] = None) -> None:
    """
    Launch API server with two different sets of arguments/environments
    and compare the results of the API calls.

    Args:
        model: The model to test.
        arg1: The first set of arguments to pass to the API server.
        arg2: The second set of arguments to pass to the API server.
        env1: The first set of environment variables to pass to the API server.
        env2: The second set of environment variables to pass to the API server.
        max_wait_seconds: Startup timeout forwarded to RemoteOpenAIServer.
    """
    trust_remote_code = "--trust-remote-code"
    if trust_remote_code in arg1 or trust_remote_code in arg2:
        tokenizer = AutoTokenizer.from_pretrained(model,
                                                  trust_remote_code=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model)

    prompt = "Hello, my name is"
    token_ids = tokenizer(prompt)["input_ids"]
    # Results from both runs are appended in the same order so they can be
    # compared pairwise afterwards.
    results = []
    for args, env in ((arg1, env1), (arg2, env2)):
        with RemoteOpenAIServer(model,
                                args,
                                env_dict=env,
                                max_wait_seconds=max_wait_seconds) as server:
            client = server.get_client()

            # test models list
            models = client.models.list()
            models = models.data
            served_model = models[0]
            results.append({
                "test": "models_list",
                "id": served_model.id,
                "root": served_model.root,
            })

            # test with text prompt
            completion = client.completions.create(model=model,
                                                   prompt=prompt,
                                                   max_tokens=5,
                                                   temperature=0.0)

            results.append({
                "test": "single_completion",
                "text": completion.choices[0].text,
                "finish_reason": completion.choices[0].finish_reason,
                "usage": completion.usage,
            })

            # test using token IDs
            completion = client.completions.create(
                model=model,
                prompt=token_ids,
                max_tokens=5,
                temperature=0.0,
            )

            results.append({
                "test": "token_ids",
                "text": completion.choices[0].text,
                "finish_reason": completion.choices[0].finish_reason,
                "usage": completion.usage,
            })

            # test seeded random sampling
            completion = client.completions.create(model=model,
                                                   prompt=prompt,
                                                   max_tokens=5,
                                                   seed=33,
                                                   temperature=1.0)

            results.append({
                "test": "seeded_sampling",
                "text": completion.choices[0].text,
                "finish_reason": completion.choices[0].finish_reason,
                "usage": completion.usage,
            })

            # test seeded random sampling with multiple prompts
            # NOTE(review): this entry reuses the "seeded_sampling" label
            # of the single-prompt test above.
            completion = client.completions.create(model=model,
                                                   prompt=[prompt, prompt],
                                                   max_tokens=5,
                                                   seed=33,
                                                   temperature=1.0)

            results.append({
                "test":
                "seeded_sampling",
                "text": [choice.text for choice in completion.choices],
                "finish_reason":
                [choice.finish_reason for choice in completion.choices],
                "usage":
                completion.usage,
            })

            # test simple list
            batch = client.completions.create(
                model=model,
                prompt=[prompt, prompt],
                max_tokens=5,
                temperature=0.0,
            )
            results.append({
                "test": "simple_list",
                "text0": batch.choices[0].text,
                "text1": batch.choices[1].text,
            })

            # test streaming
            batch = client.completions.create(
                model=model,
                prompt=[prompt, prompt],
                max_tokens=5,
                temperature=0.0,
                stream=True,
            )
            # Each streamed chunk carries exactly one choice; accumulate
            # text per prompt index.
            texts = [""] * 2
            for chunk in batch:
                assert len(chunk.choices) == 1
                choice = chunk.choices[0]
                texts[choice.index] += choice.text
            results.append({
                "test": "streaming",
                "texts": texts,
            })

    # First half of `results` came from (arg1, env1), second half from
    # (arg2, env2); compare them entry by entry.
    n = len(results) // 2
    arg1_results = results[:n]
    arg2_results = results[n:]
    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
        assert arg1_result == arg2_result, (
            f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
            f"{arg1_result=} != {arg2_result=}")
  267. def init_test_distributed_environment(
  268. tp_size: int,
  269. pp_size: int,
  270. rank: int,
  271. distributed_init_port: str,
  272. local_rank: int = -1,
  273. ) -> None:
  274. distributed_init_method = f"tcp://localhost:{distributed_init_port}"
  275. init_distributed_environment(
  276. world_size=pp_size * tp_size,
  277. rank=rank,
  278. distributed_init_method=distributed_init_method,
  279. local_rank=local_rank)
  280. ensure_model_parallel_initialized(tp_size, pp_size)
  281. def multi_process_parallel(
  282. tp_size: int,
  283. pp_size: int,
  284. test_target: Any,
  285. ) -> None:
  286. import ray
  287. # Using ray helps debugging the error when it failed
  288. # as compared to multiprocessing.
  289. # NOTE: We need to set working_dir for distributed tests,
  290. # otherwise we may get import errors on ray workers
  291. ray.init(runtime_env={"working_dir": APHRODITE_PATH})
  292. distributed_init_port = get_open_port()
  293. refs = []
  294. for rank in range(tp_size * pp_size):
  295. refs.append(
  296. test_target.remote(tp_size, pp_size, rank, distributed_init_port))
  297. ray.get(refs)
  298. ray.shutdown()
  299. @contextmanager
  300. def error_on_warning():
  301. """
  302. Within the scope of this context manager, tests will fail if any warning
  303. is emitted.
  304. """
  305. with warnings.catch_warnings():
  306. warnings.simplefilter("error")
  307. yield
  308. def get_physical_device_indices(devices):
  309. visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
  310. if visible_devices is None:
  311. return devices
  312. visible_indices = [int(x) for x in visible_devices.split(",")]
  313. index_mapping = {i: physical for i, physical in enumerate(visible_indices)}
  314. return [index_mapping[i] for i in devices if i in index_mapping]
@_nvml()
def wait_for_gpu_memory_to_clear(devices: List[int],
                                 threshold_bytes: int,
                                 timeout_s: float = 120) -> None:
    """Block until every device in ``devices`` uses at most
    ``threshold_bytes`` of GPU memory, polling every 5 seconds.

    Raises:
        ValueError: if memory has not dropped below the threshold within
            ``timeout_s`` seconds.
    """
    # Use nvml instead of pytorch to reduce measurement error from torch cuda
    # context.
    devices = get_physical_device_indices(devices)
    start_time = time.time()
    while True:
        output: Dict[int, str] = {}
        output_raw: Dict[int, float] = {}
        for device in devices:
            if is_hip():
                dev_handle = amdsmi_get_processor_handles()[device]
                mem_info = amdsmi_get_gpu_vram_usage(dev_handle)
                # NOTE(review): dividing by 2**10 assumes ``vram_used`` is
                # reported in MiB -- confirm against the amdsmi docs.
                gb_used = mem_info["vram_used"] / 2**10
            else:
                dev_handle = nvmlDeviceGetHandleByIndex(device)
                mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
                # NVML reports used memory in bytes; convert to GiB.
                gb_used = mem_info.used / 2**30
            output_raw[device] = gb_used
            output[device] = f'{gb_used:.02f}'

        print('gpu memory used (GB): ', end='')
        for k, v in output.items():
            print(f'{k}={v}; ', end='')
        print('')

        dur_s = time.time() - start_time
        # Done once every device is at or below the threshold (in GiB).
        if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()):
            print(f'Done waiting for free GPU memory on devices {devices=} '
                  f'({threshold_bytes/2**30=}) {dur_s=:.02f}')
            break

        if dur_s >= timeout_s:
            raise ValueError(f'Memory of devices {devices=} not free after '
                             f'{dur_s=:.02f} ({threshold_bytes/2**30=})')

        time.sleep(5)
_P = ParamSpec("_P")


def fork_new_process_for_each_test(
        f: Callable[_P, None]) -> Callable[_P, None]:
    """Decorator to fork a new process for each test function.

    The test body runs in a forked child; the parent waits for it and
    fails the test if the child exits with a nonzero code.
    """

    @functools.wraps(f)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
        # Make the process the leader of its own process group
        # to avoid sending SIGTERM to the parent process
        os.setpgrp()
        from _pytest.outcomes import Skipped
        pid = os.fork()
        print(f"Fork a new process to run a test {pid}")
        if pid == 0:
            # Child: run the test, then always leave via os._exit so the
            # child never unwinds back into the parent's call stack.
            try:
                f(*args, **kwargs)
            except Skipped as e:
                # convert Skipped to exit code 0
                print(str(e))
                os._exit(0)
            except Exception:
                import traceback
                traceback.print_exc()
                os._exit(1)
            else:
                os._exit(0)
        else:
            # Parent: reap the child, then SIGTERM its whole process group
            # to clean up anything the test spawned.
            pgid = os.getpgid(pid)
            _pid, _exitcode = os.waitpid(pid, 0)
            # ignore SIGTERM signal itself
            old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
            # kill all child processes
            os.killpg(pgid, signal.SIGTERM)
            # restore the signal handler
            signal.signal(signal.SIGTERM, old_signal_handler)
            assert _exitcode == 0, (f"function {f} failed when called with"
                                    f" args {args} and kwargs {kwargs}")

    return wrapper
  388. def multi_gpu_test(*, num_gpus: int):
  389. """
  390. Decorate a test to be run only when multiple GPUs are available.
  391. """
  392. test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
  393. test_skipif = pytest.mark.skipif(
  394. cuda_device_count_stateless() < num_gpus,
  395. reason=f"Need at least {num_gpus} GPUs to run the test.",
  396. )
  397. def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
  398. return test_selector(test_skipif(fork_new_process_for_each_test(f)))
  399. return wrapper
  400. async def completions_with_server_args(
  401. prompts: List[str],
  402. model_name: str,
  403. server_cli_args: List[str],
  404. num_logprobs: Optional[int],
  405. max_wait_seconds: int = 240,
  406. ) -> Completion:
  407. '''Construct a remote OpenAI server, obtain an async client to the
  408. server & invoke the completions API to obtain completions.
  409. Args:
  410. prompts: test prompts
  411. model_name: model to spin up on the Aphrodite server
  412. server_cli_args: CLI args for starting the server
  413. num_logprobs: Number of logprobs to report (or `None`)
  414. max_wait_seconds: timeout interval for bringing up server.
  415. Default: 240sec
  416. Returns:
  417. OpenAI Completion instance
  418. '''
  419. outputs = None
  420. max_wait_seconds = 240 * 3 # 240 is default
  421. with RemoteOpenAIServer(model_name,
  422. server_cli_args,
  423. max_wait_seconds=max_wait_seconds) as server:
  424. client = server.get_async_client()
  425. outputs = await client.completions.create(model=model_name,
  426. prompt=prompts,
  427. temperature=0,
  428. stream=False,
  429. max_tokens=5,
  430. logprobs=num_logprobs)
  431. assert outputs is not None, "Completion API call failed."
  432. return outputs
  433. def get_client_text_generations(completions: Completion) -> List[str]:
  434. '''Extract generated tokens from the output of a
  435. request made to an Open-AI-protocol completions endpoint.
  436. '''
  437. return [x.text for x in completions.choices]
  438. def get_client_text_logprob_generations(
  439. completions: Completion) -> List[TextTextLogprobs]:
  440. '''Operates on the output of a request made to an Open-AI-protocol
  441. completions endpoint; obtains top-rank logprobs for each token in
  442. each :class:`SequenceGroup`
  443. '''
  444. text_generations = get_client_text_generations(completions)
  445. text = ''.join(text_generations)
  446. return [(text_generations, text,
  447. (None if x.logprobs is None else x.logprobs.top_logprobs))
  448. for x in completions.choices]