-
Bug
-
Resolution: Done
-
Undefined
-
None
-
None
-
False
-
-
False
-
Impediment
-
-
-
Moderate
-
Approved
To Reproduce
Steps to reproduce the behavior:
- Use machine with MI210 and bootc-amd-rhel9:1.3-1732815020
- run 'ilab model serve'
- See traceback in console
ERROR 11-28 22:11:59 multiproc_worker_utils.py:120] Worker VllmWorkerProcess pid 113 died, exit code: -15
INFO 11-28 22:11:59 multiproc_worker_utils.py:124] Killing local vLLM worker processes
Process SpawnProcess-1:
Traceback (most recent call last):
File "/usr/lib64/python3.11/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/lib64/python3.11/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/opt/app-root/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 388, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 138, in from_engine_args
return cls(
^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 78, in __init__
self.engine = LLMEngine(*args,
^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/engine/llm_engine.py", line 339, in __init__
self._initialize_kv_caches()
File "/opt/app-root/lib64/python3.11/site-packages/vllm/engine/llm_engine.py", line 487, in _initialize_kv_caches
self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
File "/opt/app-root/lib64/python3.11/site-packages/vllm/executor/distributed_gpu_executor.py", line 63, in initialize_cache
self._run_workers("initialize_cache",
File "/opt/app-root/lib64/python3.11/site-packages/vllm/executor/multiproc_gpu_executor.py", line 185, in _run_workers
driver_worker_output = driver_worker_method(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/worker/worker.py", line 294, in initialize_cache
self._warm_up_model()
File "/opt/app-root/lib64/python3.11/site-packages/vllm/worker/worker.py", line 310, in _warm_up_model
self.model_runner.capture_model(self.gpu_cache)
File "/opt/app-root/lib64/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/worker/model_runner.py", line 1448, in capture_model
graph_runner.capture(**capture_inputs)
File "/opt/app-root/lib64/python3.11/site-packages/vllm/worker/model_runner.py", line 1711, in capture
self.model(
File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/models/granite.py", line 422, in forward
model_output = self.model(input_ids, positions, kv_caches,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/models/granite.py", line 323, in forward
hidden_states = layer(
^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/models/granite.py", line 242, in forward
hidden_states = self.self_attn(
^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/models/granite.py", line 172, in forward
qkv, _ = self.qkv_proj(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/layers/linear.py", line 366, in forward
output_parallel = self.quant_method.apply(self, input_, bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/layers/linear.py", line 134, in apply
return tgemm.mm(x, layer.weight, bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/layers/tuned_gemm.py", line 105, in mm
out = self.apply_skinny(m, n, k, inp_view, weights)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/model_executor/layers/tuned_gemm.py", line 70, in apply_skinny
ops.wvSpltK(weights, inp_view, out, n, self.cu_count)
File "/opt/app-root/lib64/python3.11/site-packages/vllm/_custom_ops.py", line 38, in wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/opt/app-root/lib64/python3.11/site-packages/vllm/_custom_ops.py", line 973, in wvSpltK
torch.ops._rocm_C.wvSpltK(a, b, out, N, cu_count)
File "/opt/app-root/lib64/python3.11/site-packages/torch/_ops.py", line 1061, in __call__
return self_._op(*args, **(kwargs or {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA kernel failed : 98
Expected behavior
- 'ilab model serve' does not error out
Device Info (please complete the following information):
- Hardware Specs: x86_64 bare metal, 4x MI210
- OS Version: registry.stage.redhat.io/rhelai1/bootc-amd-rhel9:1.3-1732815020
- Python Version: 3.11.7
- InstructLab Version: 0.21.0
Additional context
- This only happens with MI210, it does not happen with MI300X accelerators
- clones
-
RHELAI-2438 RuntimeError: CUDA kernel failed : 98 with MI210 accelerators
-
- Closed
-
- mentioned on