/usr/lib64/python3.11/inspect.py:389: FutureWarning: `torch.distributed.reduce_op` is deprecated, please use `torch.distributed.ReduceOp` instead
  return isinstance(object, types.FunctionType)
Detected capabilities: [-cpu -gaudi -gaudi2 +gaudi3 +index_reduce]
INFO 01-31 03:20:49 api_server.py:592] vLLM API server version 0.6.4.post2
INFO 01-31 03:20:49 api_server.py:593] args: Namespace(subparser='serve', model_tag='instructlab/granite-7b-lab', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='instructlab/granite-7b-lab', task='auto', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, allowed_local_media_path=None, download_dir=None, load_format='auto', weights_load_device=None, config_format=, dtype='bfloat16', kv_cache_dtype='auto', quantization_param_path=None, max_model_len=None, guided_decoding_backend='outlines', distributed_executor_backend=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=2, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=128, enable_prefix_caching=False, disable_sliding_window=False, use_v2_block_manager=False, use_padding_aware_scheduling=False, num_lookahead_slots=0, seed=0, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.9, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_seqs=256, max_num_prefill_seqs=None, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, hf_overrides=None, enforce_eager=False, max_seq_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_model=None, speculative_model_quantization=None, num_speculative_tokens=None, speculative_disable_mqa_scorer=False, speculative_draft_tensor_parallel_size=None, speculative_max_model_len=None, speculative_disable_by_batch_size=None, ngram_prompt_lookup_max=None, ngram_prompt_lookup_min=None, spec_decoding_acceptance_method='rejection_sampler', typical_acceptance_sampler_posterior_threshold=None, typical_acceptance_sampler_posterior_alpha=None, disable_logprobs_during_spec_decoding=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, scheduling_policy='fcfs', override_neuron_config=None, override_pooler_config=None, disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False, enable_prompt_tokens_details=False, dispatch_function=)
INFO 01-31 03:20:49 __init__.py:31] No plugins found.
INFO 01-31 03:20:49 api_server.py:176] Multiprocessing frontend to use ipc:///tmp/abc65108-fbe2-4099-b87b-69149b58d335 for IPC Path.
INFO 01-31 03:20:49 api_server.py:195] Started engine process with PID 19558
INFO 01-31 03:20:50 config.py:1874] Downcasting torch.float32 to torch.bfloat16.
/usr/lib64/python3.11/inspect.py:389: FutureWarning: `torch.distributed.reduce_op` is deprecated, please use `torch.distributed.ReduceOp` instead
  return isinstance(object, types.FunctionType)
Detected capabilities: [-cpu -gaudi -gaudi2 +gaudi3 +index_reduce]
INFO 01-31 03:20:55 __init__.py:31] No plugins found.
INFO 01-31 03:20:56 config.py:1874] Downcasting torch.float32 to torch.bfloat16.
INFO 01-31 03:20:57 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 01-31 03:20:57 config.py:1017] Defaulting to use mp for distributed inference
WARNING 01-31 03:20:57 arg_utils.py:1092] [DEPRECATED] Block manager v1 has been removed, and setting --use-v2-block-manager to True or False has no effect on vLLM behavior. Please remove --use-v2-block-manager in your engine argument. If your use case is not supported by SelfAttnBlockSpaceManager (i.e. block manager v2), please file an issue with detailed information.
You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
INFO 01-31 03:21:04 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 01-31 03:21:04 config.py:1017] Defaulting to use mp for distributed inference
WARNING 01-31 03:21:04 arg_utils.py:1092] [DEPRECATED] Block manager v1 has been removed, and setting --use-v2-block-manager to True or False has no effect on vLLM behavior. Please remove --use-v2-block-manager in your engine argument. If your use case is not supported by SelfAttnBlockSpaceManager (i.e. block manager v2), please file an issue with detailed information.
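[Editor's note: the argument namespace above corresponds roughly to a launch command like the one below. This is a reconstruction from the non-default values shown (model, dtype, tensor parallelism); the literal invocation may have used additional flags.]

    # Reconstructed launch command (assumption: only the non-default args shown above were passed explicitly)
    vllm serve instructlab/granite-7b-lab \
        --dtype bfloat16 \
        --tensor-parallel-size 2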
INFO 01-31 03:21:04 llm_engine.py:250] Initializing an LLM engine (v0.6.4.post2) with config: model='instructlab/granite-7b-lab', speculative_config=None, tokenizer='instructlab/granite-7b-lab', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, weights_load_device=hpu, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=hpu, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=instructlab/granite-7b-lab, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=True, mm_processor_kwargs=None, pooler_config=None)
You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
WARNING 01-31 03:21:04 multiproc_gpu_executor.py:56] Reducing Torch parallelism from 144 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
INFO 01-31 03:21:04 custom_cache_manager.py:17] Setting Triton cache manager to: vllm.triton_utils.custom_cache_manager:CustomCacheManager
INFO 01-31 03:21:04 __init__.py:31] No plugins found.
WARNING 01-31 03:21:04 utils.py:754] Pin memory is not supported on HPU.
INFO 01-31 03:21:04 selector.py:174] Using HPUAttention backend.
============================= HABANA PT BRIDGE CONFIGURATION ===========================
 PT_HPU_LAZY_MODE = 1
 PT_RECIPE_CACHE_PATH =
 PT_CACHE_FOLDER_DELETE = 0
 PT_HPU_RECIPE_CACHE_CONFIG =
 PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
 PT_HPU_LAZY_ACC_PAR_MODE = 1
 PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
 PT_HPU_EAGER_PIPELINE_ENABLE = 1
 PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE = 1
---------------------------: System Configuration :---------------------------
Num CPU Cores : 288
CPU RAM : 1979158020 KB
------------------------------------------------------------------------------
/usr/lib64/python3.11/inspect.py:389: FutureWarning: `torch.distributed.reduce_op` is deprecated, please use `torch.distributed.ReduceOp` instead
  return isinstance(object, types.FunctionType)
Detected capabilities: [-cpu -gaudi -gaudi2 +gaudi3 +index_reduce]
(VllmWorkerProcess pid=19699) INFO 01-31 03:21:10 __init__.py:31] No plugins found.
(VllmWorkerProcess pid=19699) WARNING 01-31 03:21:11 utils.py:754] Pin memory is not supported on HPU.
(VllmWorkerProcess pid=19699) INFO 01-31 03:21:11 selector.py:174] Using HPUAttention backend.
(VllmWorkerProcess pid=19699) VLLM_PROMPT_BS_BUCKET_MIN=1 (default:1)
(VllmWorkerProcess pid=19699) VLLM_PROMPT_BS_BUCKET_STEP=32 (default:32)
(VllmWorkerProcess pid=19699) VLLM_PROMPT_BS_BUCKET_MAX=256 (default:256)
(VllmWorkerProcess pid=19699) VLLM_DECODE_BS_BUCKET_MIN=1 (default:1)
(VllmWorkerProcess pid=19699) VLLM_DECODE_BS_BUCKET_STEP=32 (default:32)
(VllmWorkerProcess pid=19699) VLLM_DECODE_BS_BUCKET_MAX=256 (default:256)
(VllmWorkerProcess pid=19699) VLLM_PROMPT_SEQ_BUCKET_MIN=128 (default:128)
(VllmWorkerProcess pid=19699) VLLM_PROMPT_SEQ_BUCKET_STEP=128 (default:128)
(VllmWorkerProcess pid=19699) VLLM_PROMPT_SEQ_BUCKET_MAX=1024 (default:1024)
(VllmWorkerProcess pid=19699) VLLM_DECODE_BLOCK_BUCKET_MIN=128 (default:128)
(VllmWorkerProcess pid=19699) VLLM_DECODE_BLOCK_BUCKET_STEP=128 (default:128)
(VllmWorkerProcess pid=19699) VLLM_DECODE_BLOCK_BUCKET_MAX=4096 (default:4096)
(VllmWorkerProcess pid=19699) Prompt bucket config (min, step, max_warmup) bs:[1, 32, 256], seq:[128, 128, 1024]
(VllmWorkerProcess pid=19699) Decode bucket config (min, step, max_warmup) bs:[1, 32, 256], block:[128, 128, 4096]
(VllmWorkerProcess pid=19699) INFO 01-31 03:21:11 multiproc_worker_utils.py:215] Worker ready; awaiting tasks
============================= HABANA PT BRIDGE CONFIGURATION ===========================
 PT_HPU_LAZY_MODE = 1
 PT_RECIPE_CACHE_PATH =
 PT_CACHE_FOLDER_DELETE = 0
 PT_HPU_RECIPE_CACHE_CONFIG =
 PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
 PT_HPU_LAZY_ACC_PAR_MODE = 1
 PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
 PT_HPU_EAGER_PIPELINE_ENABLE = 1
 PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE = 1
---------------------------: System Configuration :---------------------------
Num CPU Cores : 288
CPU RAM : 1979158020 KB
------------------------------------------------------------------------------
VLLM_PROMPT_BS_BUCKET_MIN=1 (default:1)
VLLM_PROMPT_BS_BUCKET_STEP=32 (default:32)
VLLM_PROMPT_BS_BUCKET_MAX=256 (default:256)
VLLM_DECODE_BS_BUCKET_MIN=1 (default:1)
VLLM_DECODE_BS_BUCKET_STEP=32 (default:32)
VLLM_DECODE_BS_BUCKET_MAX=256 (default:256)
VLLM_PROMPT_SEQ_BUCKET_MIN=128 (default:128)
VLLM_PROMPT_SEQ_BUCKET_STEP=128 (default:128)
VLLM_PROMPT_SEQ_BUCKET_MAX=1024 (default:1024)
VLLM_DECODE_BLOCK_BUCKET_MIN=128 (default:128)
VLLM_DECODE_BLOCK_BUCKET_STEP=128 (default:128)
VLLM_DECODE_BLOCK_BUCKET_MAX=4096 (default:4096)
Prompt bucket config (min, step, max_warmup) bs:[1, 32, 256], seq:[128, 128, 1024]
Decode bucket config (min, step, max_warmup) bs:[1, 32, 256], block:[128, 128, 4096]
INFO 01-31 03:21:12 shm_broadcast.py:236] vLLM message queue communication handle: Handle(connect_ip='127.0.0.1', local_reader_ranks=[1], buffer=, local_subscribe_port=37221, remote_subscribe_port=None)
INFO 01-31 03:21:12 loader.py:340] Loading weights on hpu...
INFO 01-31 03:21:13 weight_utils.py:243] Using model weights format ['*.safetensors']
(VllmWorkerProcess pid=19699) INFO 01-31 03:21:13 loader.py:340] Loading weights on hpu...
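[Editor's note: the VLLM_*_BUCKET_* values reported above are all at their defaults. They are ordinary environment variables, so different warmup bucketing could be requested by exporting them before launch; the values below are purely illustrative, not a recommendation.]

    # Hypothetical override of the HPU bucketing shown above (illustrative values only)
    export VLLM_PROMPT_SEQ_BUCKET_MAX=2048
    export VLLM_DECODE_BLOCK_BUCKET_MAX=8192
    vllm serve instructlab/granite-7b-lab --dtype bfloat16 --tensor-parallel-size 2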
Loading safetensors checkpoint shards: 0% Completed | 0/3 [00:00
thread '<unnamed>' panicked at /mount/work-dir/tokenizers-0.21.0/tokenizers-0.21.0/vendor/rayon-core/src/registry.rs:168:10:
The global thread pool has not been initialized.: ThreadPoolBuildError { kind: IOError(Os { code: 11, kind: WouldBlock, message: "Resource temporarily unavailable" }) }
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
ERROR: Exception in ASGI application
Traceback (most recent call last):
  File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 179, in __call__
    response = await self.dispatch_func(request, call_next)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 491, in add_request_id
    response = await call_next(request)
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 155, in call_next
    raise RuntimeError("No response returned.")
RuntimeError: No response returned.

During handling of the above exception, another exception occurred:

  + Exception Group Traceback (most recent call last):
  |   File "/opt/app-root/lib64/python3.11/site-packages/uvicorn/protocols/http/httptools_impl.py", line 409, in run_asgi
  |     result = await app( # type: ignore[func-returns-value]
  |              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/opt/app-root/lib64/python3.11/site-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
  |     return await self.app(scope, receive, send)
  |            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/opt/app-root/lib64/python3.11/site-packages/fastapi/applications.py", line 1054, in __call__
  |     await super().__call__(scope, receive, send)
  |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/applications.py", line 112, in __call__
  |     await self.middleware_stack(scope, receive, send)
  |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/errors.py", line 165, in __call__
  |     await self.app(scope, receive, _send)
  |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 178, in __call__
  |     async with anyio.create_task_group() as task_group:
  |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/_backends/_asyncio.py", line 767, in __aexit__
  |     raise BaseExceptionGroup(
  | BaseExceptionGroup: unhandled errors in a TaskGroup (2 sub-exceptions)
  +-+---------------- 1 ----------------
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 111, in receive
    |     return self.receive_nowait()
    |            ^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 106, in receive_nowait
    |     raise WouldBlock
    | anyio.WouldBlock
    |
    | During handling of the above exception, another exception occurred:
    |
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 124, in receive
    |     return receiver.item
    |            ^^^^^^^^^^^^^
    | AttributeError: 'MemoryObjectItemReceiver' object has no attribute 'item'
    |
    | During handling of the above exception, another exception occurred:
    |
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 148, in call_next
    |     message = await recv_stream.receive()
    |               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 126, in receive
    |     raise EndOfStream
    | anyio.EndOfStream
    |
    | During handling of the above exception, another exception occurred:
    |
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 179, in __call__
    |     response = await self.dispatch_func(request, call_next)
    |                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 491, in add_request_id
    |     response = await call_next(request)
    |                ^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 155, in call_next
    |     raise RuntimeError("No response returned.")
    | RuntimeError: No response returned.
    +---------------- 2 ----------------
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 141, in coro
    |     await self.app(scope, receive_or_disconnect, send_no_error)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/cors.py", line 85, in __call__
    |     await self.app(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/exceptions.py", line 62, in __call__
    |     await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
    |     await app(scope, receive, sender)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 715, in __call__
    |     await self.middleware_stack(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 735, in app
    |     await route.handle(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 288, in handle
    |     await self.app(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 76, in app
    |     await wrap_app_handling_exceptions(app, request)(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
    |     await app(scope, receive, sender)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 73, in app
    |     response = await f(request)
    |                ^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/fastapi/routing.py", line 301, in app
    |     raw_response = await run_endpoint_function(
    |                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/fastapi/routing.py", line 212, in run_endpoint_function
    |     return await dependant.call(**values)
    |            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 348, in create_chat_completion
    |     generator = await handler.create_chat_completion(request, raw_request)
    |                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_chat.py", line 160, in create_chat_completion
    |     ) = await self._preprocess_chat(
    |         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 496, in _preprocess_chat
    |     prompt_inputs = self._tokenize_prompt_input(
    |                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 326, in _tokenize_prompt_input
    |     return next(
    |            ^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 349, in _tokenize_prompt_inputs
    |     yield self._normalize_prompt_text_to_input(
    |           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 234, in _normalize_prompt_text_to_input
    |     encoded = tokenizer(prompt, add_special_tokens=add_special_tokens)
    |               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2868, in __call__
    |     encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
    |                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2978, in _call_one
    |     return self.encode_plus(
    |            ^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_base.py", line 3054, in encode_plus
    |     return self._encode_plus(
    |            ^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 613, in _encode_plus
    |     batched_output = self._batch_encode_plus(
    |                      ^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 539, in _batch_encode_plus
    |     encodings = self._tokenizer.encode_batch(
    |                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    | pyo3_runtime.PanicException: The global thread pool has not been initialized.: ThreadPoolBuildError { kind: IOError(Os { code: 11, kind: WouldBlock, message: "Resource temporarily unavailable" }) }
    +------------------------------------
INFO: 127.0.0.1:60148 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
thread '<unnamed>' panicked at /mount/work-dir/tokenizers-0.21.0/tokenizers-0.21.0/vendor/rayon-core/src/registry.rs:168:10:
The global thread pool has not been initialized.: ThreadPoolBuildError { kind: GlobalPoolAlreadyInitialized }
ERROR: Exception in ASGI application
Traceback (most recent call last):
  File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 179, in __call__
    response = await self.dispatch_func(request, call_next)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 491, in add_request_id
    response = await call_next(request)
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 155, in call_next
    raise RuntimeError("No response returned.")
RuntimeError: No response returned.
During handling of the above exception, another exception occurred:

  + Exception Group Traceback (most recent call last):
  |   File "/opt/app-root/lib64/python3.11/site-packages/uvicorn/protocols/http/httptools_impl.py", line 409, in run_asgi
  |     result = await app( # type: ignore[func-returns-value]
  |              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/opt/app-root/lib64/python3.11/site-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
  |     return await self.app(scope, receive, send)
  |            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/opt/app-root/lib64/python3.11/site-packages/fastapi/applications.py", line 1054, in __call__
  |     await super().__call__(scope, receive, send)
  |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/applications.py", line 112, in __call__
  |     await self.middleware_stack(scope, receive, send)
  |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/errors.py", line 165, in __call__
  |     await self.app(scope, receive, _send)
  |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 178, in __call__
  |     async with anyio.create_task_group() as task_group:
  |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/_backends/_asyncio.py", line 767, in __aexit__
  |     raise BaseExceptionGroup(
  | BaseExceptionGroup: unhandled errors in a TaskGroup (2 sub-exceptions)
  +-+---------------- 1 ----------------
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 111, in receive
    |     return self.receive_nowait()
    |            ^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 106, in receive_nowait
    |     raise WouldBlock
    | anyio.WouldBlock
    |
    | During handling of the above exception, another exception occurred:
    |
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 124, in receive
    |     return receiver.item
    |            ^^^^^^^^^^^^^
    | AttributeError: 'MemoryObjectItemReceiver' object has no attribute 'item'
    |
    | During handling of the above exception, another exception occurred:
    |
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 148, in call_next
    |     message = await recv_stream.receive()
    |               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 126, in receive
    |     raise EndOfStream
    | anyio.EndOfStream
    |
    | During handling of the above exception, another exception occurred:
    |
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 179, in __call__
    |     response = await self.dispatch_func(request, call_next)
    |                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 491, in add_request_id
    |     response = await call_next(request)
    |                ^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 155, in call_next
    |     raise RuntimeError("No response returned.")
    | RuntimeError: No response returned.
    +---------------- 2 ----------------
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 141, in coro
    |     await self.app(scope, receive_or_disconnect, send_no_error)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/cors.py", line 85, in __call__
    |     await self.app(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/exceptions.py", line 62, in __call__
    |     await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
    |     await app(scope, receive, sender)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 715, in __call__
    |     await self.middleware_stack(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 735, in app
    |     await route.handle(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 288, in handle
    |     await self.app(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 76, in app
    |     await wrap_app_handling_exceptions(app, request)(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
    |     await app(scope, receive, sender)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 73, in app
    |     response = await f(request)
    |                ^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/fastapi/routing.py", line 301, in app
    |     raw_response = await run_endpoint_function(
    |                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/fastapi/routing.py", line 212, in run_endpoint_function
    |     return await dependant.call(**values)
    |            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 348, in create_chat_completion
    |     generator = await handler.create_chat_completion(request, raw_request)
    |                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_chat.py", line 160, in create_chat_completion
    |     ) = await self._preprocess_chat(
    |         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 496, in _preprocess_chat
    |     prompt_inputs = self._tokenize_prompt_input(
    |                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 326, in _tokenize_prompt_input
    |     return next(
    |            ^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 349, in _tokenize_prompt_inputs
    |     yield self._normalize_prompt_text_to_input(
    |           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 234, in _normalize_prompt_text_to_input
    |     encoded = tokenizer(prompt, add_special_tokens=add_special_tokens)
    |               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2868, in __call__
    |     encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
    |                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2978, in _call_one
    |     return self.encode_plus(
    |            ^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_base.py", line 3054, in encode_plus
    |     return self._encode_plus(
    |            ^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 613, in _encode_plus
    |     batched_output = self._batch_encode_plus(
    |                      ^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 539, in _batch_encode_plus
    |     encodings = self._tokenizer.encode_batch(
    |                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    | pyo3_runtime.PanicException: The global thread pool has not been initialized.: ThreadPoolBuildError { kind: GlobalPoolAlreadyInitialized }
    +------------------------------------
INFO: 127.0.0.1:60150 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
thread '<unnamed>' panicked at /mount/work-dir/tokenizers-0.21.0/tokenizers-0.21.0/vendor/rayon-core/src/registry.rs:168:10:
The global thread pool has not been initialized.: ThreadPoolBuildError { kind: GlobalPoolAlreadyInitialized }
ERROR: Exception in ASGI application
Traceback (most recent call last):
  File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 179, in __call__
    response = await self.dispatch_func(request, call_next)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 491, in add_request_id
    response = await call_next(request)
               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 155, in call_next
    raise RuntimeError("No response returned.")
RuntimeError: No response returned.
During handling of the above exception, another exception occurred:

  + Exception Group Traceback (most recent call last):
  |   File "/opt/app-root/lib64/python3.11/site-packages/uvicorn/protocols/http/httptools_impl.py", line 409, in run_asgi
  |     result = await app( # type: ignore[func-returns-value]
  |              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/opt/app-root/lib64/python3.11/site-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
  |     return await self.app(scope, receive, send)
  |            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  |   File "/opt/app-root/lib64/python3.11/site-packages/fastapi/applications.py", line 1054, in __call__
  |     await super().__call__(scope, receive, send)
  |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/applications.py", line 112, in __call__
  |     await self.middleware_stack(scope, receive, send)
  |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/errors.py", line 165, in __call__
  |     await self.app(scope, receive, _send)
  |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 178, in __call__
  |     async with anyio.create_task_group() as task_group:
  |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/_backends/_asyncio.py", line 767, in __aexit__
  |     raise BaseExceptionGroup(
  | BaseExceptionGroup: unhandled errors in a TaskGroup (2 sub-exceptions)
  +-+---------------- 1 ----------------
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 111, in receive
    |     return self.receive_nowait()
    |            ^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 106, in receive_nowait
    |     raise WouldBlock
    | anyio.WouldBlock
    |
    | During handling of the above exception, another exception occurred:
    |
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 124, in receive
    |     return receiver.item
    |            ^^^^^^^^^^^^^
    | AttributeError: 'MemoryObjectItemReceiver' object has no attribute 'item'
    |
    | During handling of the above exception, another exception occurred:
    |
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 148, in call_next
    |     message = await recv_stream.receive()
    |               ^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/anyio/streams/memory.py", line 126, in receive
    |     raise EndOfStream
    | anyio.EndOfStream
    |
    | During handling of the above exception, another exception occurred:
    |
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 179, in __call__
    |     response = await self.dispatch_func(request, call_next)
    |                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 491, in add_request_id
    |     response = await call_next(request)
    |                ^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 155, in call_next
    |     raise RuntimeError("No response returned.")
    | RuntimeError: No response returned.
    +---------------- 2 ----------------
    | Traceback (most recent call last):
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/base.py", line 141, in coro
    |     await self.app(scope, receive_or_disconnect, send_no_error)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/cors.py", line 85, in __call__
    |     await self.app(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/middleware/exceptions.py", line 62, in __call__
    |     await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
    |     await app(scope, receive, sender)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 715, in __call__
    |     await self.middleware_stack(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 735, in app
    |     await route.handle(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 288, in handle
    |     await self.app(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 76, in app
    |     await wrap_app_handling_exceptions(app, request)(scope, receive, send)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
    |     await app(scope, receive, sender)
    |   File "/opt/app-root/lib64/python3.11/site-packages/starlette/routing.py", line 73, in app
    |     response = await f(request)
    |                ^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/fastapi/routing.py", line 301, in app
    |     raw_response = await run_endpoint_function(
    |                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/fastapi/routing.py", line 212, in run_endpoint_function
    |     return await dependant.call(**values)
    |            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 348, in create_chat_completion
    |     generator = await handler.create_chat_completion(request, raw_request)
    |                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_chat.py", line 160, in create_chat_completion
    |     ) = await self._preprocess_chat(
    |         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 496, in _preprocess_chat
    |     prompt_inputs = self._tokenize_prompt_input(
    |                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 326, in _tokenize_prompt_input
    |     return next(
    |            ^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 349, in _tokenize_prompt_inputs
    |     yield self._normalize_prompt_text_to_input(
    |           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/vllm/entrypoints/openai/serving_engine.py", line 234, in _normalize_prompt_text_to_input
    |     encoded = tokenizer(prompt, add_special_tokens=add_special_tokens)
    |               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2868, in __call__
    |     encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
    |                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2978, in _call_one
    |     return self.encode_plus(
    |            ^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_base.py", line 3054, in encode_plus
    |     return self._encode_plus(
    |            ^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 613, in _encode_plus
    |     batched_output = self._batch_encode_plus(
    |                      ^^^^^^^^^^^^^^^^^^^^^^^^
    |   File "/opt/app-root/lib64/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 539, in _batch_encode_plus
    |     encodings = self._tokenizer.encode_batch(
    |                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    | pyo3_runtime.PanicException: The global thread pool has not been initialized.: ThreadPoolBuildError { kind: GlobalPoolAlreadyInitialized }
    +------------------------------------
INFO: 127.0.0.1:60160 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
INFO 01-31 03:41:02 metrics.py:449] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.