import datetime
import locale
import os
import re
import subprocess
import sys
from collections import namedtuple

try:
    import torch
    TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False


# System Environment Information
SystemEnv = namedtuple(
    'SystemEnv',
    [
        'torch_version',
        'is_debug_build',
        'cuda_compiled_version',
        'gcc_version',
        'clang_version',
        'cmake_version',
        'os',
        'libc_version',
        'python_version',
        'python_platform',
        'is_cuda_available',
        'cuda_runtime_version',
        'cuda_module_loading',
        'nvidia_driver_version',
        'nvidia_gpu_models',
        'cudnn_version',
        'pip_version',  # 'pip' or 'pip3'
        'pip_packages',
        'conda_packages',
        'hip_compiled_version',
        'hip_runtime_version',
        'miopen_runtime_version',
        'caching_allocator_config',
        'is_xnnpack_available',
        'cpu_info',
        'rocm_version',
        'aphrodite_version',
        'aphrodite_build_flags',
    ])
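
# A SystemEnv tuple is built by get_env_info() below and rendered by
# pretty_str(); each field name above must match a placeholder in the
# env_info_fmt template near the bottom of this file.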


DEFAULT_CONDA_PATTERNS = {
    "torch",
    "numpy",
    "cudatoolkit",
    "soumith",
    "mkl",
    "magma",
    "triton",
    "optree",
}

DEFAULT_PIP_PATTERNS = {
    "torch",
    "numpy",
    "mypy",
    "flake8",
    "triton",
    "optree",
    "onnx",
}


def run(command):
    """Return (return-code, stdout, stderr)."""
    shell = isinstance(command, str)
    p = subprocess.Popen(command,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         shell=shell)
    raw_output, raw_err = p.communicate()
    rc = p.returncode
    enc = 'oem' if get_platform() == 'win32' else locale.getpreferredencoding()
    output = raw_output.decode(enc)
    err = raw_err.decode(enc)
    return rc, output.strip(), err.strip()
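
# Illustrative usage (not executed): a string command is run through the
# shell, while a list is executed directly.
#   rc, out, err = run('nvcc --version')               # shell=True
#   rc, out, err = run([sys.executable, '--version'])  # shell=False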


def run_and_read_all(run_lambda, command):
    """Run command using run_lambda; return the entire output if rc is 0."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    return out


def run_and_parse_first_match(run_lambda, command, regex):
    """Run command using run_lambda; return the first regex match, if any."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    match = re.search(regex, out)
    if match is None:
        return None
    return match.group(1)


def run_and_return_first_line(run_lambda, command):
    """Run command using run_lambda; return the first output line if rc is 0."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    return out.split('\n')[0]


def get_conda_packages(run_lambda, patterns=None):
    if patterns is None:
        patterns = DEFAULT_CONDA_PATTERNS
    conda = os.environ.get('CONDA_EXE', 'conda')
    out = run_and_read_all(run_lambda, "{} list".format(conda))
    if out is None:
        return out
    return "\n".join(line for line in out.splitlines()
                     if not line.startswith("#") and any(
                         name in line for name in patterns))
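
# Illustrative: `conda list` prints '#'-prefixed header lines followed by
# 'name  version  build  channel' rows; the filter above drops the header and
# keeps only rows containing one of the patterns (e.g. 'torch').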


def get_gcc_version(run_lambda):
    return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)')


def get_clang_version(run_lambda):
    return run_and_parse_first_match(run_lambda, 'clang --version',
                                     r'clang version (.*)')


def get_cmake_version(run_lambda):
    return run_and_parse_first_match(run_lambda, 'cmake --version',
                                     r'cmake (.*)')


def get_nvidia_driver_version(run_lambda):
    if get_platform() == 'darwin':
        cmd = 'kextstat | grep -i cuda'
        return run_and_parse_first_match(run_lambda, cmd,
                                         r'com[.]nvidia[.]CUDA [(](.*?)[)]')
    smi = get_nvidia_smi()
    return run_and_parse_first_match(run_lambda, smi,
                                     r'Driver Version: (.*?) ')


def get_gpu_info(run_lambda):
    if get_platform() == 'darwin' or (TORCH_AVAILABLE and hasattr(
            torch.version, 'hip') and torch.version.hip is not None):
        if TORCH_AVAILABLE and torch.cuda.is_available():
            if torch.version.hip is not None:
                prop = torch.cuda.get_device_properties(0)
                if hasattr(prop, "gcnArchName"):
                    gcnArch = " ({})".format(prop.gcnArchName)
                else:
                    gcnArch = "NoGCNArchNameOnOldPyTorch"
            else:
                gcnArch = ""
            return torch.cuda.get_device_name(None) + gcnArch
        return None
    smi = get_nvidia_smi()
    uuid_regex = re.compile(r' \(UUID: .+?\)')
    rc, out, _ = run_lambda(smi + ' -L')
    if rc != 0:
        return None
    # Anonymize GPUs by removing their UUID
    return re.sub(uuid_regex, '', out)
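
# Illustrative: `nvidia-smi -L` prints lines like
#   GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-...)
# which the substitution above reduces to 'GPU 0: NVIDIA A100-SXM4-40GB'.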


def get_running_cuda_version(run_lambda):
    return run_and_parse_first_match(run_lambda, 'nvcc --version',
                                     r'release .+ V(.*)')


def get_cudnn_version(run_lambda):
    """Return a list of libcudnn.so; it's hard to tell which one is being used."""
    if get_platform() == 'win32':
        system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
        cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%")
        where_cmd = os.path.join(system_root, 'System32', 'where')
        cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
    elif get_platform() == 'darwin':
        # CUDA libraries and drivers can be found in /usr/local/cuda/. See
        # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
        # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
        # Use CUDNN_LIBRARY when the cudnn library is installed elsewhere.
        cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*'
    else:
        cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
    rc, out, _ = run_lambda(cudnn_cmd)
    # find will return 1 if there are permission errors or if not found
    if len(out) == 0 or (rc != 1 and rc != 0):
        lib = os.environ.get('CUDNN_LIBRARY')
        if lib is not None and os.path.isfile(lib):
            return os.path.realpath(lib)
        return None
    files_set = set()
    for fn in out.split('\n'):
        fn = os.path.realpath(fn)  # eliminate symbolic links
        if os.path.isfile(fn):
            files_set.add(fn)
    if not files_set:
        return None
    # Alphabetize the result because the order is non-deterministic otherwise
    files = sorted(files_set)
    if len(files) == 1:
        return files[0]
    result = '\n'.join(files)
    return 'Probably one of the following:\n{}'.format(result)


def get_nvidia_smi():
    # Note: nvidia-smi is currently available only on Windows and Linux
    smi = 'nvidia-smi'
    if get_platform() == 'win32':
        system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
        program_files_root = os.environ.get('PROGRAMFILES',
                                            'C:\\Program Files')
        legacy_path = os.path.join(program_files_root, 'NVIDIA Corporation',
                                   'NVSMI', smi)
        new_path = os.path.join(system_root, 'System32', smi)
        smis = [new_path, legacy_path]
        for candidate_smi in smis:
            if os.path.exists(candidate_smi):
                smi = '"{}"'.format(candidate_smi)
                break
    return smi


def get_rocm_version(run_lambda):
    """Return the ROCm (HIP) version if available, otherwise None."""
    return run_and_parse_first_match(run_lambda, 'hipcc --version',
                                     r'HIP version: (\S+)')


def get_aphrodite_version():
    try:
        import aphrodite
        return aphrodite.__version__
    except ImportError:
        return 'N/A'


def summarize_aphrodite_build_flags():
    # This could be a static method if the flags are constant, or dynamic if
    # you need to check environment variables, etc.
    return 'CUDA Archs: {}; ROCm: {}'.format(
        os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'),
        'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled',
    )
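
# Illustrative: with TORCH_CUDA_ARCH_LIST='8.0;8.6' and ROCM_HOME unset, this
# returns 'CUDA Archs: 8.0;8.6; ROCm: Disabled'.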


def get_cpu_info(run_lambda):
    rc, out, err = 0, '', ''
    if get_platform() == 'linux':
        rc, out, err = run_lambda('lscpu')
    elif get_platform() == 'win32':
        rc, out, err = run_lambda(
            'wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,'
            'DeviceID,CurrentClockSpeed,MaxClockSpeed,L2CacheSize,'
            'L2CacheSpeed,Revision /VALUE')
    elif get_platform() == 'darwin':
        rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
    cpu_info = out if rc == 0 else err
    return cpu_info


def get_platform():
    if sys.platform.startswith('linux'):
        return 'linux'
    elif sys.platform.startswith('win32'):
        return 'win32'
    elif sys.platform.startswith('cygwin'):
        return 'cygwin'
    elif sys.platform.startswith('darwin'):
        return 'darwin'
    else:
        return sys.platform


def get_mac_version(run_lambda):
    return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion',
                                     r'(.*)')


def get_windows_version(run_lambda):
    system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
    wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic')
    findstr_cmd = os.path.join(system_root, 'System32', 'findstr')
    return run_and_read_all(
        run_lambda,
        '{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd))


def get_lsb_version(run_lambda):
    return run_and_parse_first_match(run_lambda, 'lsb_release -a',
                                     r'Description:\t(.*)')


def check_release_file(run_lambda):
    return run_and_parse_first_match(run_lambda, 'cat /etc/*-release',
                                     r'PRETTY_NAME="(.*)"')


def get_os(run_lambda):
    from platform import machine
    platform = get_platform()

    if platform == 'win32' or platform == 'cygwin':
        return get_windows_version(run_lambda)

    if platform == 'darwin':
        version = get_mac_version(run_lambda)
        if version is None:
            return None
        return 'macOS {} ({})'.format(version, machine())

    if platform == 'linux':
        # Ubuntu/Debian based
        desc = get_lsb_version(run_lambda)
        if desc is not None:
            return '{} ({})'.format(desc, machine())
        # Try reading /etc/*-release
        desc = check_release_file(run_lambda)
        if desc is not None:
            return '{} ({})'.format(desc, machine())
        return '{} ({})'.format(platform, machine())

    # Unknown platform
    return platform


def get_python_platform():
    import platform
    return platform.platform()


def get_libc_version():
    import platform
    if get_platform() != 'linux':
        return 'N/A'
    return '-'.join(platform.libc_ver())


def get_pip_packages(run_lambda, patterns=None):
    """Return `pip list` output.

    Note: will also find conda-installed pytorch and numpy packages.
    """
    if patterns is None:
        patterns = DEFAULT_PIP_PATTERNS

    # People generally have `pip` as `pip` or `pip3`
    # But here it is invoked as `python -mpip`
    def run_with_pip(pip):
        out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
        if out is None:  # guard against a failed pip invocation
            return None
        return "\n".join(line for line in out.splitlines()
                         if any(name in line for name in patterns))

    pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
    out = run_with_pip([sys.executable, '-mpip'])
    return pip_version, out
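
# Illustrative: `pip list --format=freeze` emits 'name==version' lines, so the
# filtered output looks like (versions are just examples):
#   torch==2.1.2
#   numpy==1.26.3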


def get_cachingallocator_config():
    ca_config = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', '')
    return ca_config


def get_cuda_module_loading_config():
    if TORCH_AVAILABLE and torch.cuda.is_available():
        torch.cuda.init()
        config = os.environ.get('CUDA_MODULE_LOADING', '')
        return config
    else:
        return "N/A"


def is_xnnpack_available():
    if TORCH_AVAILABLE:
        import torch.backends.xnnpack
        return str(
            torch.backends.xnnpack.enabled)  # type: ignore[attr-defined]
    else:
        return "N/A"


def get_env_info():
    run_lambda = run
    pip_version, pip_list_output = get_pip_packages(run_lambda)

    if TORCH_AVAILABLE:
        version_str = torch.__version__
        debug_mode_str = str(torch.version.debug)
        cuda_available_str = str(torch.cuda.is_available())
        cuda_version_str = torch.version.cuda
        if not hasattr(torch.version,
                       'hip') or torch.version.hip is None:  # cuda version
            hip_compiled_version = 'N/A'
            hip_runtime_version = 'N/A'
            miopen_runtime_version = 'N/A'
        else:  # HIP version

            def get_version_or_na(cfg, prefix):
                _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
                return _lst[0] if _lst else 'N/A'

            cfg = torch._C._show_config().split('\n')
            hip_runtime_version = get_version_or_na(cfg, 'HIP Runtime')
            miopen_runtime_version = get_version_or_na(cfg, 'MIOpen')
            cuda_version_str = 'N/A'
            hip_compiled_version = torch.version.hip
    else:
        version_str = 'N/A'
        debug_mode_str = 'N/A'
        cuda_available_str = 'N/A'
        cuda_version_str = 'N/A'
        hip_compiled_version = 'N/A'
        hip_runtime_version = 'N/A'
        miopen_runtime_version = 'N/A'

    sys_version = sys.version.replace("\n", " ")

    conda_packages = get_conda_packages(run_lambda)
    rocm_version = get_rocm_version(run_lambda)
    aphrodite_version = get_aphrodite_version()
    aphrodite_build_flags = summarize_aphrodite_build_flags()

    return SystemEnv(
        torch_version=version_str,
        is_debug_build=debug_mode_str,
        python_version='{} ({}-bit runtime)'.format(
            sys_version, sys.maxsize.bit_length() + 1),
        python_platform=get_python_platform(),
        is_cuda_available=cuda_available_str,
        cuda_compiled_version=cuda_version_str,
        cuda_runtime_version=get_running_cuda_version(run_lambda),
        cuda_module_loading=get_cuda_module_loading_config(),
        nvidia_gpu_models=get_gpu_info(run_lambda),
        nvidia_driver_version=get_nvidia_driver_version(run_lambda),
        cudnn_version=get_cudnn_version(run_lambda),
        hip_compiled_version=hip_compiled_version,
        hip_runtime_version=hip_runtime_version,
        miopen_runtime_version=miopen_runtime_version,
        pip_version=pip_version,
        pip_packages=pip_list_output,
        conda_packages=conda_packages,
        os=get_os(run_lambda),
        libc_version=get_libc_version(),
        gcc_version=get_gcc_version(run_lambda),
        clang_version=get_clang_version(run_lambda),
        cmake_version=get_cmake_version(run_lambda),
        caching_allocator_config=get_cachingallocator_config(),
        is_xnnpack_available=is_xnnpack_available(),
        cpu_info=get_cpu_info(run_lambda),
        rocm_version=rocm_version,
        aphrodite_version=aphrodite_version,
        aphrodite_build_flags=aphrodite_build_flags,
    )


env_info_fmt = """
PyTorch version: {torch_version}
Is debug build: {is_debug_build}
CUDA used to build PyTorch: {cuda_compiled_version}
ROCM used to build PyTorch: {hip_compiled_version}
OS: {os}
GCC version: {gcc_version}
Clang version: {clang_version}
CMake version: {cmake_version}
Libc version: {libc_version}
Python version: {python_version}
Python platform: {python_platform}
Is CUDA available: {is_cuda_available}
CUDA runtime version: {cuda_runtime_version}
CUDA_MODULE_LOADING set to: {cuda_module_loading}
GPU models and configuration: {nvidia_gpu_models}
Nvidia driver version: {nvidia_driver_version}
cuDNN version: {cudnn_version}
HIP runtime version: {hip_runtime_version}
MIOpen runtime version: {miopen_runtime_version}
Is XNNPACK available: {is_xnnpack_available}
CPU:
{cpu_info}
Versions of relevant libraries:
{pip_packages}
{conda_packages}
""".strip()

# Both halves of the template are `strip()`ed, so without this manual newline
# the ROCM line would be glued onto the `{conda_packages}` line above.
env_info_fmt += "\n"

env_info_fmt += """
ROCM Version: {rocm_version}
Aphrodite Version: {aphrodite_version}
Aphrodite Build Flags:
{aphrodite_build_flags}
""".strip()


def pretty_str(envinfo):

    def replace_nones(dct, replacement='Could not collect'):
        for key in dct:
            if dct[key] is not None:
                continue
            dct[key] = replacement
        return dct

    def replace_bools(dct, true='Yes', false='No'):
        for key in dct:
            if dct[key] is True:
                dct[key] = true
            elif dct[key] is False:
                dct[key] = false
        return dct

    def prepend(text, tag='[prepend]'):
        lines = text.split('\n')
        updated_lines = [tag + line for line in lines]
        return '\n'.join(updated_lines)

    def replace_if_empty(text, replacement='No relevant packages'):
        if text is not None and len(text) == 0:
            return replacement
        return text

    def maybe_start_on_next_line(string):
        # If `string` is multiline, prepend a \n to it.
        if string is not None and len(string.split('\n')) > 1:
            return '\n{}\n'.format(string)
        return string

    mutable_dict = envinfo._asdict()

    # If nvidia_gpu_models is multiline, start on the next line
    mutable_dict['nvidia_gpu_models'] = \
        maybe_start_on_next_line(envinfo.nvidia_gpu_models)

    # If the machine doesn't have CUDA, report some fields as 'No CUDA'
    dynamic_cuda_fields = [
        'cuda_runtime_version',
        'nvidia_gpu_models',
        'nvidia_driver_version',
    ]
    all_cuda_fields = dynamic_cuda_fields + ['cudnn_version']
    all_dynamic_cuda_fields_missing = all(
        mutable_dict[field] is None for field in dynamic_cuda_fields)
    if TORCH_AVAILABLE and not torch.cuda.is_available(
    ) and all_dynamic_cuda_fields_missing:
        for field in all_cuda_fields:
            mutable_dict[field] = 'No CUDA'
        if envinfo.cuda_compiled_version is None:
            mutable_dict['cuda_compiled_version'] = 'None'

    # Replace True with Yes, False with No
    mutable_dict = replace_bools(mutable_dict)

    # Replace all None objects with 'Could not collect'
    mutable_dict = replace_nones(mutable_dict)

    # If either of these are '', replace with 'No relevant packages'
    mutable_dict['pip_packages'] = replace_if_empty(
        mutable_dict['pip_packages'])
    mutable_dict['conda_packages'] = replace_if_empty(
        mutable_dict['conda_packages'])

    # Tag conda and pip packages with a prefix.
    # If they were previously None, they'll show up as e.g. '[conda] Could
    # not collect'.
    if mutable_dict['pip_packages']:
        mutable_dict['pip_packages'] = prepend(
            mutable_dict['pip_packages'], '[{}] '.format(envinfo.pip_version))
    if mutable_dict['conda_packages']:
        mutable_dict['conda_packages'] = prepend(
            mutable_dict['conda_packages'], '[conda] ')
    mutable_dict['cpu_info'] = envinfo.cpu_info
    return env_info_fmt.format(**mutable_dict)


def get_pretty_env_info():
    return pretty_str(get_env_info())
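
# Illustrative usage (not executed):
#   from env import get_pretty_env_info
#   print(get_pretty_env_info())
# or run this file directly: `python env.py`.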


def main():
    print("Collecting environment information...")
    output = get_pretty_env_info()
    print(output)

    if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(
            torch.utils, '_crash_handler'):
        minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
        if sys.platform == "linux" and os.path.exists(minidump_dir):
            dumps = [
                os.path.join(minidump_dir, dump)
                for dump in os.listdir(minidump_dir)
            ]
            if dumps:  # the directory may exist but be empty
                latest = max(dumps, key=os.path.getctime)
                ctime = os.path.getctime(latest)
                creation_time = datetime.datetime.fromtimestamp(
                    ctime).strftime('%Y-%m-%d %H:%M:%S')
                msg = "\n*** Detected a minidump at {} created on {}, ".format(
                    latest, creation_time) + \
                    "if this is related to your bug please include it when " \
                    "you file a report ***"
                print(msg, file=sys.stderr)


if __name__ == '__main__':
    main()