I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@ -0,0 +1,54 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Expiration timers are set up on the same process as the agent and
used from your script to deal with stuck workers. When you go into
a code-block that has the potential to get stuck you can acquire
an expiration timer, which instructs the timer server to kill the
process if it does not release the timer by the self-imposed expiration
deadline.
Usage::
import torchelastic.timer as timer
import torchelastic.agent.server as agent
def main():
start_method = "spawn"
message_queue = mp.get_context(start_method).Queue()
server = timer.LocalTimerServer(message_queue, max_interval=0.01)
server.start() # non-blocking
spec = WorkerSpec(
fn=trainer_func,
args=(message_queue,),
...<OTHER_PARAMS...>)
agent = agent.LocalElasticAgent(spec, start_method)
agent.run()
def trainer_func(message_queue):
timer.configure(timer.LocalTimerClient(message_queue))
with timer.expires(after=60): # 60 second expiry
# do some work
In the example above if ``trainer_func`` takes more than 60 seconds to
complete, then the worker process is killed and the agent retries the worker group.
"""
from .api import ( # noqa: F401
configure,
expires,
TimerClient,
TimerRequest,
TimerServer,
)
from .file_based_local_timer import ( # noqa: F401
FileTimerClient,
FileTimerRequest,
FileTimerServer,
)
from .local_timer import LocalTimerClient, LocalTimerServer # noqa: F401

View File

@ -0,0 +1,283 @@
# mypy: allow-untyped-defs
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import abc
import logging
import threading
import time
from contextlib import contextmanager
from inspect import getframeinfo, stack
from typing import Any, Dict, List, Optional, Set
# Names exported by ``from <this module> import *``.
__all__ = [
"TimerRequest",
"TimerClient",
"RequestQueue",
"TimerServer",
"configure",
"expires",
]
# Module-level logger used by ``TimerServer`` and ``configure`` below.
logger = logging.getLogger(__name__)
class TimerRequest:
    """
    Plain record describing one countdown-timer acquisition or release,
    exchanged between a ``TimerClient`` and a ``TimerServer``.

    A negative ``expiration_time`` denotes a "release" request.

    .. note:: ``worker_id`` has an implementation-specific type: it is
              whatever the ``TimerServer``/``TimerClient`` pair uses to
              uniquely identify a worker.
    """

    __slots__ = ["worker_id", "scope_id", "expiration_time"]

    def __init__(self, worker_id: Any, scope_id: str, expiration_time: float):
        self.worker_id, self.scope_id, self.expiration_time = (
            worker_id,
            scope_id,
            expiration_time,
        )

    def __eq__(self, other):
        # Anything that is not a TimerRequest never compares equal.
        if not isinstance(other, TimerRequest):
            return False
        # Field-by-field comparison, short-circuiting on the first mismatch.
        return all(
            getattr(self, field) == getattr(other, field)
            for field in ("worker_id", "scope_id", "expiration_time")
        )
class TimerClient(abc.ABC):
    """
    Abstract client used by a worker to acquire and release countdown timers
    by talking to a ``TimerServer``.
    """

    @abc.abstractmethod
    def acquire(self, scope_id: str, expiration_time: float) -> None:
        """
        Acquire a timer identified by ``scope_id`` that expires at
        ``expiration_time`` for the worker holding this client. Implementations
        typically register the timer with the ``TimerServer``.
        """

    @abc.abstractmethod
    def release(self, scope_id: str):
        """
        Release the timer identified by ``scope_id`` for the worker this client
        represents. Once this returns, the countdown on that scope is no longer
        in effect.
        """
class RequestQueue(abc.ABC):
    """
    Consumer-side queue of timer acquisition/release requests.
    """

    @abc.abstractmethod
    def size(self) -> int:
        """
        Return the number of queued requests at the moment of the call.

        The queue may grow between ``size`` and a subsequent ``get``, but it
        must not shrink until ``get`` is called. In other words, the following
        must hold::

            size = q.size()
            res = q.get(size, timeout=0)
            assert size == len(res)

        -- or --::

            size = q.size()
            res = q.get(size * 2, timeout=1)
            assert size <= len(res) <= size * 2
        """

    @abc.abstractmethod
    def get(self, size: int, timeout: float) -> List[TimerRequest]:
        """
        Return at most ``size`` timer requests, blocking for no more than
        ``timeout`` seconds while waiting for them.
        """
class TimerServer(abc.ABC):
    """
    Entity that monitors active timers and expires them in a timely fashion.
    This server is responsible for reaping workers that have expired timers.

    Subclasses supply the timer bookkeeping (``register_timers``,
    ``clear_timers``, ``get_expired_timers``) and the reaping mechanism
    (``_reap_worker``); this base class drives them from a watchdog thread.
    """

    def __init__(
        self, request_queue: RequestQueue, max_interval: float, daemon: bool = True
    ):
        """
        :param request_queue: Consumer ``RequestQueue``
        :param max_interval: max time (in seconds) to wait
                             for an item in the request_queue
        :param daemon: whether to run the watchdog thread as a daemon
        """
        super().__init__()
        self._request_queue = request_queue
        self._max_interval = max_interval
        self._daemon = daemon
        self._watchdog_thread: Optional[threading.Thread] = None
        self._stop_signaled = False

    @abc.abstractmethod
    def register_timers(self, timer_requests: List[TimerRequest]) -> None:
        """
        Processes the incoming timer requests and registers them with the server.
        The timer request can either be an acquire-timer or release-timer request.
        Timer requests with a negative expiration_time should be interpreted
        as a release-timer request.
        """

    @abc.abstractmethod
    def clear_timers(self, worker_ids: Set[Any]) -> None:
        """
        Clears all timers for the given ``worker_ids``.
        """

    @abc.abstractmethod
    def get_expired_timers(self, deadline: float) -> Dict[Any, List[TimerRequest]]:
        """
        Returns all expired timers, grouped by ``worker_id``. An expired timer
        is a timer for which the expiration_time is less than or equal to
        the provided deadline.

        The key type is whatever the implementation uses as ``worker_id``
        (see ``TimerRequest``), hence ``Any``.
        """

    @abc.abstractmethod
    def _reap_worker(self, worker_id: Any) -> bool:
        """
        Reaps the given worker. Returns True if the worker has been
        successfully reaped, False otherwise. If any uncaught exception
        is thrown from this method, the worker is considered reaped
        and all associated timers will be removed.
        """

    def _reap_worker_no_throw(self, worker_id: Any) -> bool:
        """
        Wraps ``_reap_worker(worker_id)``; if an uncaught exception is
        thrown, the worker is considered reaped and True is returned.
        """
        try:
            return self._reap_worker(worker_id)
        except Exception:
            logger.exception(
                "Uncaught exception thrown from _reap_worker(), "
                "check that the implementation correctly catches exceptions",
            )
            return True

    def _watchdog_loop(self):
        """
        Thread target: run watchdog iterations until ``stop`` is signaled.
        Errors from a single iteration are logged and do not end the loop.
        """
        while not self._stop_signaled:
            try:
                self._run_watchdog()
            except Exception:
                logger.exception("Error running watchdog")

    def _run_watchdog(self):
        """
        One watchdog iteration: drain pending requests, register them, then
        reap every worker that has at least one expired timer.
        """
        # Always ask for at least one request so that ``get`` waits (up to
        # ``max_interval``) instead of returning immediately on an empty queue.
        batch_size = max(1, self._request_queue.size())
        timer_requests = self._request_queue.get(batch_size, self._max_interval)
        self.register_timers(timer_requests)
        now = time.time()
        reaped_worker_ids = set()
        for worker_id, expired_timers in self.get_expired_timers(now).items():
            logger.info(
                "Reaping worker_id=[%s]. Expired timers: %s",
                worker_id,
                self._get_scopes(expired_timers),
            )
            if self._reap_worker_no_throw(worker_id):
                logger.info("Successfully reaped worker=[%s]", worker_id)
                reaped_worker_ids.add(worker_id)
            else:
                # Timers of this worker are kept, so it shows up as expired
                # again on the next iteration.
                logger.error(
                    "Error reaping worker=[%s]. Will retry on next watchdog.", worker_id
                )
        self.clear_timers(reaped_worker_ids)

    def _get_scopes(self, timer_requests):
        """Return the ``scope_id`` of every request, in order."""
        return [r.scope_id for r in timer_requests]

    def start(self) -> None:
        """Start the watchdog thread (non-blocking)."""
        logger.info(
            "Starting %s... max_interval=%s, daemon=%s",
            type(self).__name__,
            self._max_interval,
            self._daemon,
        )
        self._watchdog_thread = threading.Thread(
            target=self._watchdog_loop, daemon=self._daemon
        )
        logger.info("Starting watchdog thread...")
        self._watchdog_thread.start()

    def stop(self) -> None:
        """
        Signal the watchdog loop to exit and wait up to ``max_interval``
        seconds for the thread to finish.
        """
        logger.info("Stopping %s", type(self).__name__)
        self._stop_signaled = True
        if self._watchdog_thread:
            logger.info("Stopping watchdog thread...")
            self._watchdog_thread.join(self._max_interval)
            self._watchdog_thread = None
        else:
            logger.info("No watchdog thread running, doing nothing")
# Module-wide default client; ``expires`` falls back to it when no explicit
# ``client`` argument is given. ``None`` until ``configure`` is called.
_timer_client: Optional[TimerClient] = None


def configure(timer_client: TimerClient):
    """
    Install ``timer_client`` as the module-wide default timer client.
    This must happen before ``expires`` is used without an explicit client.
    """
    global _timer_client
    _timer_client = timer_client
    client_type_name = type(timer_client).__name__
    logger.info("Timer client configured to: %s", client_type_name)
def expires(
    after: float, scope: Optional[str] = None, client: Optional[TimerClient] = None
):
    """
    Acquires a countdown timer that expires in ``after`` seconds from now,
    unless the code-block that it wraps is finished within the timeframe.
    When the timer expires, this worker is eligible to be reaped. The
    exact meaning of "reaped" depends on the client implementation. In
    most cases, reaping means to terminate the worker process.
    Note that the worker is NOT guaranteed to be reaped at exactly
    ``time.time() + after``, but rather the worker is "eligible" for being
    reaped and the ``TimerServer`` that the client talks to will ultimately
    make the decision when and how to reap the workers with expired timers.

    :param after: number of seconds from entering the block until expiry
    :param scope: timer identifier; defaults to ``"<caller file>#<lineno>"``
    :param client: client to use; defaults to the one set via ``configure``
    :raises RuntimeError: on entering the block, if no client was passed and
                          none has been configured

    Usage::

        torch.distributed.elastic.timer.configure(LocalTimerClient())
        with expires(after=10):
            torch.distributed.all_reduce(...)
    """
    if scope is None:
        # Resolve the default scope here, in a plain function, so that frame 1
        # really is the caller. Inside the generator below the immediate
        # parent frame is contextlib's ``__enter__``, which would give every
        # timer in the process the same ``contextlib.py#N`` scope.
        caller = stack(context=0)[1]
        scope = f"{caller.filename}#{caller.lineno}"
    return _expires_impl(after, scope, client)


@contextmanager
def _expires_impl(after: float, scope: str, client: Optional[TimerClient]):
    """Acquire the timer for ``scope`` on enter and release it on exit."""
    if client is None:
        if _timer_client is None:
            raise RuntimeError("Configure timer client before using countdown timers.")
        client = _timer_client
    # The deadline is measured from the moment the block is entered.
    expiration = time.time() + after
    client.acquire(scope, expiration)
    try:
        yield
    finally:
        # Release even when the wrapped block raises.
        client.release(scope)

View File

@ -0,0 +1,25 @@
#!/usr/bin/env python3
# mypy: allow-untyped-defs
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from typing import Dict, List
from torch.distributed.elastic.utils.logging import get_logger
# Module-level logger obtained from the torch elastic logging helper.
logger = get_logger(__name__)
# Names exported by ``from <this module> import *``.
__all__ = ["log_debug_info_for_expired_timers"]
def log_debug_info_for_expired_timers(
    run_id: str,
    expired_timers: Dict[int, List[str]],
):
    """
    Log the expired timers of run ``run_id`` at INFO level.

    ``expired_timers`` is logged as-is; nothing is logged when it is empty.
    """
    if not expired_timers:
        return
    logger.info("Timers expired for run:[%s] [%s].", run_id, expired_timers)

View File

@ -0,0 +1,393 @@
# mypy: allow-untyped-defs
# Copyright (c) Meta Platforms, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import io
import json
import os
import select
import signal
import sys
import threading
import time
from typing import Callable, Dict, List, Optional, Set, Tuple
from torch.distributed.elastic.timer.api import TimerClient, TimerRequest
from torch.distributed.elastic.timer.debug_info_logging import (
log_debug_info_for_expired_timers,
)
from torch.distributed.elastic.utils.logging import get_logger
# Names exported by ``from <this module> import *``.
__all__ = ["FileTimerClient", "FileTimerRequest", "FileTimerServer"]
# Module-level logger used by ``FileTimerServer`` below.
logger = get_logger(__name__)
class FileTimerRequest(TimerRequest):
    """
    Data object representing a countdown timer acquisition and release
    that is used between the ``FileTimerClient`` and ``FileTimerServer``.
    A negative ``expiration_time`` should be interpreted as a "release"
    request.
    ``signal`` is the signal to reap the worker process from the server
    process.
    """

    # Only the attributes this subclass adds. ``scope_id`` and
    # ``expiration_time`` are already slots of ``TimerRequest``; redeclaring
    # them would shadow the base descriptors, which the Python data model
    # documents as undefined behaviour.
    __slots__ = ["version", "worker_pid", "signal"]

    def __init__(
        self, worker_pid: int, scope_id: str, expiration_time: float, signal: int = 0
    ) -> None:
        # NOTE: ``TimerRequest.__init__`` is intentionally not called here, so
        # the inherited ``worker_id`` slot stays unset; the worker is
        # identified by ``worker_pid`` instead.
        self.version = 1
        self.worker_pid = worker_pid
        self.scope_id = scope_id
        self.expiration_time = expiration_time
        self.signal = signal

    def __eq__(self, other) -> bool:
        if isinstance(other, FileTimerRequest):
            return (
                self.version == other.version
                and self.worker_pid == other.worker_pid
                and self.scope_id == other.scope_id
                and self.expiration_time == other.expiration_time
                and self.signal == other.signal
            )
        return False

    def to_json(self) -> str:
        """
        Serialize the request to a JSON object string. Note that
        ``worker_pid`` is emitted under the key ``"pid"``.
        """
        return json.dumps(
            {
                "version": self.version,
                "pid": self.worker_pid,
                "scope_id": self.scope_id,
                "expiration_time": self.expiration_time,
                "signal": self.signal,
            },
        )
class FileTimerClient(TimerClient):
    """
    Client side of ``FileTimerServer``. This client is meant to be used
    on the same host that the ``FileTimerServer`` is running on and uses
    pid to uniquely identify a worker.
    This client uses a named_pipe to send timer requests to the
    ``FileTimerServer``. This client is a producer while the
    ``FileTimerServer`` is a consumer. Multiple clients can work with
    the same ``FileTimerServer``.

    Args:

        file_path: str, the path of a FIFO special file. ``FileTimerServer``
                        must have created it by calling os.mkfifo().

        signal: signal, the signal to use to kill the process. Using a
                        negative or zero signal will not kill the process.
    """

    def __init__(
        self,
        file_path: str,
        signal=(signal.SIGKILL if sys.platform != "win32" else signal.CTRL_C_EVENT),  # type: ignore[attr-defined]
    ) -> None:
        super().__init__()
        self._file_path = file_path
        self.signal = signal

    def _open_non_blocking(self) -> Optional[io.TextIOWrapper]:
        """
        Open the FIFO for writing without blocking. Returns ``None`` when the
        pipe cannot be opened for any reason.
        """
        try:
            fd = os.open(self._file_path, os.O_WRONLY | os.O_NONBLOCK)
            return os.fdopen(fd, "wt")
        except Exception:
            return None

    def _send_request(self, request: FileTimerRequest) -> None:
        """
        Write ``request`` as a single JSON line to the FIFO.

        Raises:
            BrokenPipeError: if the FIFO cannot be opened for writing.
            RuntimeError: if the serialized line exceeds ``select.PIPE_BUF``.
        """
        # The server may have crashed or may not have started yet.
        # In that case, calling open() in blocking mode would block the client.
        # To avoid this, open the pipe in non-blocking mode; an OSError is
        # raised (and mapped to ``None``) if the server is not there.
        file = self._open_non_blocking()
        if file is None:
            raise BrokenPipeError(
                "Could not send the FileTimerRequest because FileTimerServer is not available."
            )
        with file:
            json_request = request.to_json()
            line = json_request + "\n"
            # A write of no more than select.PIPE_BUF bytes is guaranteed to be
            # atomic. The limit applies to what is actually written, i.e. the
            # JSON text plus the newline terminator. ``json.dumps`` escapes
            # non-ASCII characters by default, so the character count is the
            # byte count.
            if len(line) > select.PIPE_BUF:
                raise RuntimeError(
                    f"FileTimerRequest larger than {select.PIPE_BUF} bytes "
                    f"is not supported: {json_request}"
                )
            file.write(line)

    def acquire(self, scope_id: str, expiration_time: float) -> None:
        self._send_request(
            request=FileTimerRequest(
                worker_pid=os.getpid(),
                scope_id=scope_id,
                expiration_time=expiration_time,
                signal=self.signal,
            ),
        )

    def release(self, scope_id: str) -> None:
        # A negative expiration time (with no signal) encodes a release.
        self._send_request(
            request=FileTimerRequest(
                worker_pid=os.getpid(), scope_id=scope_id, expiration_time=-1, signal=0
            ),
        )
class FileTimerServer:
    """
    Server that works with ``FileTimerClient``. Clients are expected to be
    running on the same host as the process that is running this server.
    Each host in the job is expected to start its own timer server locally
    and each server instance manages timers for local workers (running on
    processes on the same host).

    Args:

        file_path: str, the path of a FIFO special file to be created.

        run_id: str, identifier of the run; included when expired timers
                     are logged.

        max_interval: float, max interval in seconds for each watchdog loop.

        daemon: bool, running the watchdog thread in daemon mode or not.
                      A daemon thread will not block a process to stop.

        log_event: Callable[[str, Optional[FileTimerRequest]], None], an
                      optional callback invoked as
                      ``log_event(event_name, request)`` for watchdog events.
    """

    def __init__(
        self,
        file_path: str,
        run_id: str,
        max_interval: float = 10,
        daemon: bool = True,
        log_event: Optional[Callable[[str, Optional[FileTimerRequest]], None]] = None,
    ) -> None:
        self._file_path = file_path
        self._run_id = run_id
        self._max_interval = max_interval
        self._daemon = daemon
        # (worker_pid, scope_id) -> latest acquire request for that scope.
        self._timers: Dict[Tuple[int, str], FileTimerRequest] = {}
        self._stop_signaled = False
        self._watchdog_thread: Optional[threading.Thread] = None

        # Always start from a fresh FIFO.
        if os.path.exists(self._file_path):
            os.remove(self._file_path)

        os.mkfifo(self._file_path)
        # For test only. Count the number of requests received.
        self._request_count = 0
        # For test only. Process all requests and stop the server.
        self._run_once = False
        self._log_event = (
            log_event if log_event is not None else lambda name, request: None
        )
        self._last_progress_time = int(time.time())

    def start(self) -> None:
        """Start the watchdog thread (non-blocking)."""
        logger.info(
            "Starting %s... max_interval=%s, daemon=%s, file_path=%s",
            type(self).__name__,
            self._max_interval,
            self._daemon,
            self._file_path,
        )
        self._watchdog_thread = threading.Thread(
            target=self._watchdog_loop, daemon=self._daemon
        )
        logger.info("Starting watchdog thread...")
        self._watchdog_thread.start()
        self._log_event("watchdog started", None)

    def stop(self) -> None:
        """
        Signal the watchdog to stop, wait up to ``max_interval`` seconds for
        its thread, and remove the FIFO file.
        """
        logger.info("Stopping %s", type(self).__name__)
        self._stop_signaled = True
        if self._watchdog_thread:
            logger.info("Stopping watchdog thread...")
            self._watchdog_thread.join(self._max_interval)
            self._watchdog_thread = None
        else:
            logger.info("No watchdog thread running, doing nothing")
        if os.path.exists(self._file_path):
            os.remove(self._file_path)
        self._log_event("watchdog stopped", None)

    def run_once(self) -> None:
        """
        Let the watchdog process the pending requests one last time, wait for
        its thread to finish, and remove the FIFO file.
        """
        self._run_once = True
        if self._watchdog_thread:
            logger.info("Stopping watchdog thread...")
            self._watchdog_thread.join()
            self._watchdog_thread = None
        else:
            logger.info("No watchdog thread running, doing nothing")
        if os.path.exists(self._file_path):
            os.remove(self._file_path)

    @staticmethod
    def is_process_running(pid: int):
        """
        Check whether process ``pid`` exists and can be signaled by us.
        Returns False when ``os.kill(pid, 0)`` raises ``OSError``.
        """
        try:
            # Signal 0 performs the existence/permission check only.
            os.kill(pid, 0)
            return True
        except OSError:
            return False

    def _watchdog_loop(self) -> None:
        # Open the pipe in blocking mode blocks the server thread.
        # This is fine for the following reasons:
        #  1. No client case usually does not happen.
        #  2. We are running the watchdog loop in a separate daemon
        #     thread, which will not block the process to stop.
        with open(self._file_path) as fd:
            while not self._stop_signaled:
                try:
                    # Snapshot the flag first so that a run_once() request
                    # arriving mid-iteration still gets one full iteration.
                    run_once = self._run_once
                    self._run_watchdog(fd)
                    if run_once:
                        break
                    self._last_progress_time = int(time.time())
                except Exception:
                    logger.exception("Error running watchdog")

    def _run_watchdog(self, fd: io.TextIOWrapper) -> None:
        """
        One watchdog iteration: read and register pending requests, log the
        expired timers, reap the affected workers and clear their timers.
        """
        timer_requests = self._get_requests(fd, self._max_interval)
        self.register_timers(timer_requests)
        now = time.time()
        all_expired_timers = self.get_expired_timers(now)
        log_debug_info_for_expired_timers(
            self._run_id,
            {
                pid: self._get_scopes(expired_timers)
                for pid, expired_timers in all_expired_timers.items()
            },
        )
        self.clear_timers(self._reap_expired_workers(all_expired_timers))

    def _reap_expired_workers(
        self, all_expired_timers: Dict[int, List[FileTimerRequest]]
    ) -> Set[int]:
        """
        Reap every worker in ``all_expired_timers`` and return the pids whose
        timers should be cleared.

        A worker whose reaping failed is NOT returned, so its timers stay
        registered and the reap is retried on the next watchdog iteration.
        """
        done_worker_pids: Set[int] = set()
        for worker_pid, expired_timers in all_expired_timers.items():
            logger.info(
                "Reaping worker_pid=[%s]. Expired timers: %s",
                worker_pid,
                self._get_scopes(expired_timers),
            )
            # In case we have multiple expired timers, we find the first timer
            # with a valid signal (>0) in the expiration time order.
            expired_timers.sort(key=lambda timer: timer.expiration_time)
            reap_signal = 0
            expired_timer = None
            for timer in expired_timers:
                self._log_event("timer expired", timer)
                if timer.signal > 0:
                    reap_signal = timer.signal
                    expired_timer = timer
                    break
            if reap_signal <= 0:
                logger.info(
                    "No signal specified with worker=[%s]. Do not reap it.", worker_pid
                )
                # Nothing to retry here: drop the timers so they are not
                # reported again on every iteration.
                done_worker_pids.add(worker_pid)
                continue
            if self._reap_worker(worker_pid, reap_signal):
                logger.info(
                    "Successfully reaped worker=[%s] with signal=%s",
                    worker_pid,
                    reap_signal,
                )
                self._log_event("kill worker process", expired_timer)
                done_worker_pids.add(worker_pid)
            else:
                logger.error(
                    "Error reaping worker=[%s]. Will retry on next watchdog.",
                    worker_pid,
                )
        return done_worker_pids

    def _get_scopes(self, timer_requests: List[FileTimerRequest]) -> List[str]:
        """Return the ``scope_id`` of every request, in order."""
        return [r.scope_id for r in timer_requests]

    @staticmethod
    def _parse_request(json_request: str) -> Optional[FileTimerRequest]:
        """
        Decode one line read from the FIFO. Returns ``None`` (after logging)
        when the line is not a well-formed request.
        """
        try:
            fields = json.loads(json_request)
            return FileTimerRequest(
                worker_pid=fields["pid"],
                scope_id=fields["scope_id"],
                expiration_time=fields["expiration_time"],
                signal=fields["signal"],
            )
        except (ValueError, KeyError, TypeError):
            logger.exception("Ignoring malformed timer request: %r", json_request)
            return None

    def _get_requests(
        self, fd: io.TextIOWrapper, max_interval: float
    ) -> List[FileTimerRequest]:
        """
        Read requests from ``fd`` for roughly ``max_interval`` seconds (or
        until end-of-file in run-once mode) and return the well-formed ones.
        """
        start = time.time()
        requests = []
        while not self._stop_signaled or self._run_once:
            # For named pipe, readline() is blocking when at least one writer opens.
            # It returns only when flush() is called at the writer side.
            # Note that flush() is automatically called inside close().
            # After the last writer closes, readline() is not blocking.
            # It will return an empty string when it's at end-of-file.
            # Since the client side always opens the pipe, writes a message and closes
            # the pipe immediately, the readline() call below is not blocking for long.
            json_request = fd.readline()
            if len(json_request) == 0:
                if self._run_once:
                    break
                time.sleep(min(max_interval, 1))
            else:
                # A single bad line must not discard the valid requests
                # (e.g. releases) that were already read in this batch.
                request = self._parse_request(json_request)
                if request is not None:
                    requests.append(request)
            now = time.time()
            if now - start > max_interval:
                break
        return requests

    def register_timers(self, timer_requests: List[FileTimerRequest]) -> None:
        """
        Apply ``timer_requests`` in order: a negative ``expiration_time``
        removes the timer for ``(pid, scope_id)``, anything else (re)sets it.
        """
        for request in timer_requests:
            self._request_count += 1
            key = (request.worker_pid, request.scope_id)
            # negative expiration is a proxy for a release call
            if request.expiration_time < 0:
                self._timers.pop(key, None)
            else:
                self._timers[key] = request

    def clear_timers(self, worker_pids: Set[int]) -> None:
        """
        Drop all timers of ``worker_pids`` and of any pid for which
        ``is_process_running`` returns False.
        """
        for pid, scope_id in list(self._timers.keys()):
            if pid in worker_pids or not FileTimerServer.is_process_running(pid):
                del self._timers[(pid, scope_id)]

    def get_expired_timers(self, deadline: float) -> Dict[int, List[FileTimerRequest]]:
        """
        Return the timers whose ``expiration_time`` is less than or equal to
        ``deadline``, grouped by worker pid.
        """
        # pid -> [timer_requests...]
        expired_timers: Dict[int, List[FileTimerRequest]] = {}
        for request in self._timers.values():
            if request.expiration_time <= deadline:
                expired_scopes = expired_timers.setdefault(request.worker_pid, [])
                expired_scopes.append(request)
        return expired_timers

    def _reap_worker(self, worker_pid: int, signal: int) -> bool:
        """
        Send ``signal`` to ``worker_pid``. Returns True when the signal was
        delivered or the process no longer exists, False on any other error.
        """
        try:
            os.kill(worker_pid, signal)
            return True
        except ProcessLookupError:
            logger.info("Process with pid=%s does not exist. Skipping", worker_pid)
            return True
        except Exception:
            logger.exception("Error terminating pid=%s", worker_pid)
        return False

    def get_last_progress_time(self) -> int:
        """
        Return the integer timestamp of the last watchdog iteration that
        completed without raising (initially the construction time).
        """
        return self._last_progress_time

View File

@ -0,0 +1,128 @@
# mypy: allow-untyped-defs
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import logging
import multiprocessing as mp
import os
import signal
import time
from queue import Empty
from typing import Any, Dict, List, Set, Tuple
from .api import RequestQueue, TimerClient, TimerRequest, TimerServer
# Names exported by ``from <this module> import *``.
__all__ = ["LocalTimerClient", "MultiprocessingRequestQueue", "LocalTimerServer"]
# Module-level logger used by ``LocalTimerServer`` below.
logger = logging.getLogger(__name__)
class LocalTimerClient(TimerClient):
    """
    Client counterpart of ``LocalTimerServer``. It is intended to run on the
    same host as the ``LocalTimerServer`` and identifies a worker by its pid.
    This is particularly handy when one subprocess (trainer) is spawned per
    GPU on a host with several GPU devices.
    """

    def __init__(self, mp_queue):
        super().__init__()
        # Queue on which requests are put for the server to consume.
        self._mp_queue = mp_queue

    def acquire(self, scope_id, expiration_time):
        """Enqueue an acquire request for the calling process."""
        self._mp_queue.put(TimerRequest(os.getpid(), scope_id, expiration_time))

    def release(self, scope_id):
        """Enqueue a release request for the calling process."""
        # An expiration time of -1 encodes "release".
        self._mp_queue.put(TimerRequest(os.getpid(), scope_id, -1))
class MultiprocessingRequestQueue(RequestQueue):
    """
    A ``RequestQueue`` backed by python ``multiprocessing.Queue``
    """

    def __init__(self, mp_queue: mp.Queue):
        super().__init__()
        self._mp_queue = mp_queue

    def size(self) -> int:
        """
        Return the approximate number of queued requests, or 0 on platforms
        where the underlying ``qsize()`` is not implemented.
        """
        try:
            return self._mp_queue.qsize()
        except NotImplementedError:
            # ``multiprocessing.Queue.qsize`` is unavailable on some platforms
            # (e.g. macOS). Reporting 0 keeps the queue usable: callers then
            # simply fetch one request at a time.
            return 0

    def get(self, size, timeout: float) -> List[TimerRequest]:
        """
        Return up to ``size`` requests, waiting at most ``timeout`` seconds
        in total. Once the time budget is used up, requests that are already
        available are still collected without blocking, so that
        ``get(q.size(), timeout=0)`` returns everything ``size`` reported.
        """
        requests = []
        deadline = time.time() + timeout
        for _ in range(size):
            remaining = deadline - time.time()
            try:
                if remaining > 0:
                    r = self._mp_queue.get(block=True, timeout=remaining)
                else:
                    r = self._mp_queue.get(block=False)
            except Empty:
                break
            requests.append(r)
        return requests
class LocalTimerServer(TimerServer):
    """
    Server counterpart of ``LocalTimerClient``. Clients are expected to be
    subprocesses of the process running this server. Every host in the job
    starts its own timer server, and each server instance manages the timers
    of the workers running on that same host.
    """

    def __init__(
        self, mp_queue: mp.Queue, max_interval: float = 60, daemon: bool = True
    ):
        request_queue = MultiprocessingRequestQueue(mp_queue)
        super().__init__(request_queue, max_interval, daemon)
        # (worker pid, scope_id) -> latest acquire request for that scope.
        self._timers: Dict[Tuple[Any, str], TimerRequest] = {}

    def register_timers(self, timer_requests: List[TimerRequest]) -> None:
        """Apply acquire/release requests to the timer table, in order."""
        for request in timer_requests:
            key = (request.worker_id, request.scope_id)
            if request.expiration_time < 0:
                # negative expiration is a proxy for a release call
                self._timers.pop(key, None)
            else:
                self._timers[key] = request

    def clear_timers(self, worker_ids: Set[int]) -> None:
        """Forget every timer that belongs to one of ``worker_ids``."""
        doomed = [key for key in self._timers if key[0] in worker_ids]
        for key in doomed:
            self._timers.pop(key)

    def get_expired_timers(self, deadline: float) -> Dict[Any, List[TimerRequest]]:
        """
        Group by worker pid the timers whose ``expiration_time`` is less than
        or equal to ``deadline``.
        """
        # pid -> [timer_requests...]
        by_worker: Dict[Any, List[TimerRequest]] = {}
        for request in self._timers.values():
            if request.expiration_time <= deadline:
                by_worker.setdefault(request.worker_id, []).append(request)
        return by_worker

    def _reap_worker(self, worker_id: int) -> bool:
        """
        SIGKILL the process ``worker_id``. A process that no longer exists
        counts as reaped; any other failure returns False.
        """
        try:
            os.kill(worker_id, signal.SIGKILL)
        except ProcessLookupError:
            logger.info("Process with pid=%s does not exist. Skipping", worker_id)
        except Exception:
            logger.exception("Error terminating pid=%s", worker_id)
            return False
        return True