-
Bug
-
Resolution: Unresolved
-
Undefined
-
None
-
None
-
False
-
-
False
-
Impediment
-
-
-
Moderate
-
Approved
To Reproduce — steps to reproduce the behavior:
- Use machine with MI210 and bootc-amd-rhel9:1.3-1732815020
- run 'ilab model serve'
- See traceback in console
ERROR 11-28 22:11:59 multiproc_worker_utils.py:120] Worker VllmWorkerProcess pid 113 died, exit code: -15 INFO 11-28 22:11:59 multiproc_worker_utils.py:124] Killing local vLLM worker processes Process SpawnProcess-1: Traceback (most recent call last): File "/usr/lib64/python3.11/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/usr/lib64/python3.11/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/opt/app-root/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 388, in run_mp_engine engine = MQLLMEngine.from_engine_args(engine_args=engine_args, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 138, in from_engine_args return cls( ^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 78, in __init__ self.engine = LLMEngine(*args, ^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/engine/llm_engine.py", line 339, in __init__ self._initialize_kv_caches() File "/opt/app-root/lib64/python3.11/site-packages/vllm/engine/llm_engine.py", line 487, in _initialize_kv_caches self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) File "/opt/app-root/lib64/python3.11/site-packages/vllm/executor/distributed_gpu_executor.py", line 63, in initialize_cache self._run_workers("initialize_cache", File "/opt/app-root/lib64/python3.11/site-packages/vllm/executor/multiproc_gpu_executor.py", line 185, in _run_workers driver_worker_output = driver_worker_method(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/worker/worker.py", line 294, in initialize_cache self._warm_up_model() File "/opt/app-root/lib64/python3.11/site-packages/vllm/worker/worker.py", line 310, in _warm_up_model self.model_runner.capture_model(self.gpu_cache) File 
"/opt/app-root/lib64/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/worker/model_runner.py", line 1448, in capture_model graph_runner.capture(**capture_inputs) File "/opt/app-root/lib64/python3.11/site-packages/vllm/worker/model_runner.py", line 1711, in capture self.model( File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/models/granite.py", line 422, in forward model_output = self.model(input_ids, positions, kv_caches, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/models/granite.py", line 323, in forward hidden_states = layer( ^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/models/granite.py", line 242, in forward hidden_states = self.self_attn( 
^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/models/granite.py", line 172, in forward qkv, _ = self.qkv_proj(hidden_states) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/layers/linear.py", line 366, in forward output_parallel = self.quant_method.apply(self, input_, bias) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/layers/linear.py", line 134, in apply return tgemm.mm(x, layer.weight, bias) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/layers/tuned_gemm.py", line 105, in mm out = self.apply_skinny(m, n, k, inp_view, weights) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/layers/tuned_gemm.py", line 70, in apply_skinny ops.wvSpltK(weights, inp_view, out, n, self.cu_count) File "/opt/app-root/lib64/python3.11/site-packages/vllm/_custom_ops.py", line 38, in wrapper return fn(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^ File "/opt/app-root/lib64/python3.11/site-packages/vllm/_custom_ops.py", line 973, in wvSpltK torch.ops._rocm_C.wvSpltK(a, b, out, N, cu_count) File 
"/opt/app-root/lib64/python3.11/site-packages/torch/_ops.py", line 1061, in __call__ return self_._op(*args, **(kwargs or {})) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ RuntimeError: CUDA kernel failed : 98
Expected behavior
- 'ilab model serve' does not error out
Device Info (please complete the following information):
- Hardware Specs: x86_64 bare metal, 4x MI210
- OS Version: registry.stage.redhat.io/rhelai1/bootc-amd-rhel9:1.3-1732815020
- Python Version: 3.11.7
- InstructLab Version: 0.21.0
Additional context
- This only happens with MI210, it does not happen with MI300X accelerators
- clones
-
RHELAI-2438 RuntimeError: CUDA kernel failed : 98 with MI210 accelerators
- Closed
- mentioned on