[rank3]: Traceback (most recent call last):
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/instructlab/training/main_ds.py", line 981, in <module>
[rank3]:     main(args)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/instructlab/training/main_ds.py", line 642, in main
[rank3]:     train(
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/instructlab/training/main_ds.py", line 385, in train
[rank3]:     output = model(
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 864, in forward
[rank3]:     output = self._fsdp_wrapped_module(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/instructlab/dolomite/hf_models/mixins/dense/main.py", line 140, in forward
[rank3]:     transformer_outputs: BaseModelOutputWithPast = self.transformer(
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/instructlab/dolomite/hf_models/mixins/dense/base.py", line 215, in forward
[rank3]:     hidden_states = block(
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py", line 165, in forward
[rank3]:     return self.checkpoint_fn(  # type: ignore[misc]
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/_compile.py", line 32, in inner
[rank3]:     return disable_fn(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 632, in _fn
[rank3]:     return fn(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/utils/checkpoint.py", line 489, in checkpoint
[rank3]:     return CheckpointFunction.apply(function, preserve, *args)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/autograd/function.py", line 575, in apply
[rank3]:     return super().apply(*args, **kwargs)  # type: ignore[misc]
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/utils/checkpoint.py", line 264, in forward
[rank3]:     outputs = run_function(*args)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py", line 159, in my_function
[rank3]:     return self._checkpoint_wrapped_module(
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 864, in forward
[rank3]:     output = self._fsdp_wrapped_module(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/instructlab/dolomite/hf_models/models/gpt_dolomite/layer.py", line 79, in forward
[rank3]:     hidden_states = self.mlp(hidden_states)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/instructlab/dolomite/hf_models/models/gpt_dolomite/mlp.py", line 47, in forward
[rank3]:     hidden_states = self.act(hidden_states)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:   File "/opt/app-root/lib64/python3.11/site-packages/instructlab/dolomite/hf_models/modeling_utils/activations/glu.py", line 28, in forward
[rank3]:     return x[0] * self.base_activation(x[1])
[rank3]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.33 GiB. GPU 3 has a total capacity of 39.38 GiB of which 571.38 MiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 34.53 GiB is allocated by PyTorch, and 2.65 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
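The innermost frame is dolomite's GLU activation: the MLP's up-projection is split into two halves, one half gates the other, and the elementwise product materializes a brand-new activation tensor, which is the 1.33 GiB request that fails. A minimal sketch of the pattern (hypothetical names, not the actual dolomite implementation; it assumes the input is the concatenated up-projection):

```python
import torch
import torch.nn as nn

class GLUActivation(nn.Module):
    """Gated linear unit: one half of the features gates the other half."""

    def __init__(self, base_activation: nn.Module) -> None:
        super().__init__()
        self.base_activation = base_activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Split the concatenated up-projection into (gate, value) halves.
        x = hidden_states.chunk(2, dim=-1)
        # The product allocates a fresh tensor of half the input width;
        # this is the line that raises OutOfMemoryError in the log above.
        return x[0] * self.base_activation(x[1])

glu = GLUActivation(nn.SiLU())
out = glu(torch.randn(4, 8))  # shape (4, 4)
```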
Ranks 0, 1, 2, 4, 5, and 6 failed with the identical traceback; only their final OutOfMemoryError reports differ:

[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.33 GiB. GPU 0 has a total capacity of 39.38 GiB of which 715.38 MiB is free. Of the allocated memory 34.53 GiB is allocated by PyTorch, and 2.65 GiB is reserved by PyTorch but unallocated.
[rank1]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.33 GiB. GPU 1 has a total capacity of 39.38 GiB of which 571.38 MiB is free. Of the allocated memory 34.54 GiB is allocated by PyTorch, and 2.65 GiB is reserved by PyTorch but unallocated.
[rank2]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.33 GiB. GPU 2 has a total capacity of 39.38 GiB of which 571.38 MiB is free. Of the allocated memory 34.53 GiB is allocated by PyTorch, and 2.65 GiB is reserved by PyTorch but unallocated.
[rank4]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.33 GiB. GPU 4 has a total capacity of 39.38 GiB of which 571.38 MiB is free. Of the allocated memory 34.53 GiB is allocated by PyTorch, and 2.66 GiB is reserved by PyTorch but unallocated.
[rank5]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.33 GiB. GPU 5 has a total capacity of 39.38 GiB of which 571.38 MiB is free. Of the allocated memory 34.54 GiB is allocated by PyTorch, and 2.65 GiB is reserved by PyTorch but unallocated.
[rank6]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.33 GiB. GPU 6 has a total capacity of 39.38 GiB of which 571.38 MiB is free. Of the allocated memory 34.54 GiB is allocated by PyTorch, and 2.65 GiB is reserved by PyTorch but unallocated.
Epoch 0:   0%| | 0/39 [00:07<?, ?it/s]
Traceback (most recent call last):
  File "/opt/app-root/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/opt/app-root/lib64/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
    return f(*args, **kwargs)
  File "/opt/app-root/lib64/python3.11/site-packages/torch/distributed/run.py", line 919, in main
    run(args)
  File "/opt/app-root/lib64/python3.11/site-packages/torch/distributed/run.py", line 910, in run
    elastic_launch(
  File "/opt/app-root/lib64/python3.11/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/opt/app-root/lib64/python3.11/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/opt/app-root/lib64/python3.11/site-packages/instructlab/training/main_ds.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-02-20_12:34:30
  host      : host.containers.internal
  rank      : 3 (local_rank: 3)
  exitcode  : 1 (pid: 10364)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
Training subprocess has not exited yet. Sending SIGTERM.
Waiting for process to exit, 60s...
--- Logging error ---
Traceback (most recent call last):
  File "/opt/app-root/lib64/python3.11/site-packages/instructlab/model/accelerated_train.py", line 261, in _run_phase
    _training_phase(
  File "/opt/app-root/lib64/python3.11/site-packages/instructlab/model/accelerated_train.py", line 563, in _training_phase
    run_training(train_args=train_args, torch_args=torch_args)
  File "/opt/app-root/lib64/python3.11/site-packages/instructlab/training/__init__.py", line 36, in run_training
    return run_training(torch_args=torch_args, train_args=train_args)
  File "/opt/app-root/lib64/python3.11/site-packages/instructlab/training/main_ds.py", line 837, in run_training
    raise RuntimeError(
RuntimeError: Suffered a failure during distributed training. Please see the training logs for more context.
During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib64/python3.11/logging/__init__.py", line 1110, in emit
    msg = self.format(record)
  File "/usr/lib64/python3.11/logging/__init__.py", line 953, in format
    return fmt.format(record)
  File "/opt/app-root/lib64/python3.11/site-packages/instructlab/log.py", line 19, in format
    return super().format(record)
  File "/usr/lib64/python3.11/logging/__init__.py", line 687, in format
    record.message = record.getMessage()
  File "/usr/lib64/python3.11/logging/__init__.py", line 377, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/opt/app-root/bin/ilab", line 8, in <module>
    sys.exit(ilab())
  File "/opt/app-root/lib64/python3.11/site-packages/click/core.py", line 1161, in __call__
    return self.main(*args, **kwargs)
  File "/opt/app-root/lib64/python3.11/site-packages/click/core.py", line 1082, in main
    rv = self.invoke(ctx)
  File "/opt/app-root/lib64/python3.11/site-packages/click/core.py", line 1697, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/opt/app-root/lib64/python3.11/site-packages/click/core.py", line 1697, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/opt/app-root/lib64/python3.11/site-packages/click/core.py", line 1443, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/opt/app-root/lib64/python3.11/site-packages/click/core.py", line 788, in invoke
    return __callback(*args, **kwargs)
  File "/opt/app-root/lib64/python3.11/site-packages/click/decorators.py", line 33, in new_func
    return f(get_current_context(), *args, **kwargs)
  File "/opt/app-root/lib64/python3.11/site-packages/instructlab/clickext.py", line 356, in wrapper
    return f(*args, **kwargs)
  File "/opt/app-root/lib64/python3.11/site-packages/instructlab/cli/model/train.py", line 469, in train
    accelerated_train.accelerated_train(
  File "/opt/app-root/lib64/python3.11/site-packages/instructlab/model/accelerated_train.py", line 202, in accelerated_train
    _run_phased_training(
  File "/opt/app-root/lib64/python3.11/site-packages/instructlab/model/accelerated_train.py", line 432, in _run_phased_training
    _run_phase(
  File "/opt/app-root/lib64/python3.11/site-packages/instructlab/model/accelerated_train.py", line 276, in _run_phase
    logger.error("Failed during training loop: ", e)
Message: 'Failed during training loop: '
Arguments: (RuntimeError('Suffered a failure during distributed training. Please see the training logs for more context.'),)
Accelerated Training failed with 1
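Separately from the OOM itself, the trailing "--- Logging error ---" is a small bug in the error handler: logger.error("Failed during training loop: ", e) passes the exception as a %-formatting argument without a %s placeholder, so the logging machinery raises TypeError: not all arguments converted during string formatting and garbles the message. A minimal sketch of the corrected call (standalone example, not the actual accelerated_train.py):

```python
import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

try:
    raise RuntimeError("Suffered a failure during distributed training.")
except RuntimeError as e:
    # Buggy: logger.error("Failed during training loop: ", e)
    # -> the extra argument has no %s placeholder to land in.

    # Correct: let logging interpolate the exception lazily...
    logger.error("Failed during training loop: %s", e)
    # ...or keep the full traceback as well:
    logger.exception("Failed during training loop")
```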