Support multiple schedulers.
1) Multiple runners (daemons) can be started for the scheduler. Each runner listens to the `scheduler_queue` with `prefetch_count` set to 1, so each runner launches at most one Scheduler process.
2) The Scheduler process listens to the `workgraph_queue` to launch WorkGraphs.
3) The Scheduler receives RPC calls to launch WorkGraphs.
4) Users can submit a WorkGraph to the workgraph queue, or select the scheduler that should run it by pk (see the sketch after this list).
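A minimal sketch of the intended message flow, assuming a configured profile with a RabbitMQ broker. `ControllerWithQueueName` is introduced in `aiida_workgraph/engine/override.py` below; the communicator access path mirrors how that module reaches the underlying kiwipy communicator and is otherwise an assumption.

```python
# Hypothetical sketch: route a launch task to the queue a Scheduler consumes.
from aiida.manage import get_manager
from aiida_workgraph.engine.override import ControllerWithQueueName

runner = get_manager().get_runner()  # assumes a loaded profile
controller = ControllerWithQueueName(
    queue_name="workgraph_queue",  # queue the Scheduler process listens to
    communicator=runner.communicator._communicator,  # kiwipy communicator, as accessed in override.py
)
# The message would be a plumpy task message, e.g. a "continue" task for a
# stored process pk (create_scheduler_action in this commit builds one):
# controller.task_send(message, no_reply=True)
```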
superstar54 committed Sep 2, 2024
1 parent d267263 commit d35d63e
Showing 9 changed files with 343 additions and 216 deletions.
32 changes: 10 additions & 22 deletions aiida_workgraph/cli/cmd_scheduler.py
@@ -1,27 +1,11 @@
from aiida_workgraph.cli.cmd_workgraph import workgraph
import click
from pathlib import Path
from aiida.cmdline.utils import decorators, echo
from aiida.cmdline.commands.cmd_daemon import validate_daemon_workers
from aiida.cmdline.params import options
from aiida_workgraph.engine.scheduler.client import get_scheduler_client
import sys

REACT_PORT = "3000"


def get_package_root():
"""Returns the root directory of the package."""
current_file = Path(__file__)
# Root directory of your package
return current_file.parent


def get_pid_file_path():
"""Get the path to the PID file in the desired directory."""
from aiida.manage.configuration.settings import AIIDA_CONFIG_FOLDER

return AIIDA_CONFIG_FOLDER / "scheduler_processes.pid"


@workgraph.group("scheduler")
def scheduler():
@@ -31,7 +15,7 @@ def scheduler():
@scheduler.command()
def worker():
"""Start the scheduler application."""
from aiida_workgraph.engine.launch import start_scheduler_worker
from aiida_workgraph.engine.scheduler.client import start_scheduler_worker

click.echo("Starting the scheduler worker...")

@@ -40,17 +24,20 @@ def worker():

@scheduler.command()
@click.option("--foreground", is_flag=True, help="Run in foreground.")
@click.argument("number", required=False, type=int, callback=validate_daemon_workers)
@options.TIMEOUT(default=None, required=False, type=int)
@decorators.with_dbenv()
@decorators.requires_broker
@decorators.check_circus_zmq_version
def start(foreground, timeout):
def start(foreground, number, timeout):
"""Start the scheduler application."""
from aiida_workgraph.engine.scheduler.client import start_scheduler_process

click.echo("Starting the scheduler process...")

client = get_scheduler_client()
client.start_daemon(foreground=foreground)
client.start_daemon(number_workers=number, foreground=foreground, timeout=timeout)
start_scheduler_process(number)
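For reference, a hedged way to exercise the new optional NUMBER argument from Python, via click's test runner (the command still needs a configured AiiDA profile and broker):

```python
# Sketch: start the scheduler daemon with two workers through the CLI group.
from click.testing import CliRunner
from aiida_workgraph.cli.cmd_scheduler import scheduler

result = CliRunner().invoke(scheduler, ["start", "2"])  # NUMBER is optional
print(result.output)
```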


@scheduler.command()
@@ -86,18 +73,19 @@ def stop(ctx, no_wait, all_profiles, timeout):

@scheduler.command(hidden=True)
@click.option("--foreground", is_flag=True, help="Run in foreground.")
@click.argument("number", required=False, type=int, callback=validate_daemon_workers)
@decorators.with_dbenv()
@decorators.requires_broker
@decorators.check_circus_zmq_version
def start_circus(foreground):
def start_circus(foreground, number):
"""This will actually launch the circus daemon, either daemonized in the background or in the foreground.
If run in the foreground all logs are redirected to stdout.
.. note:: this should not be called directly from the command line!
"""

get_scheduler_client()._start_daemon(foreground=foreground)
get_scheduler_client()._start_daemon(number_workers=number, foreground=foreground)


@scheduler.command()
84 changes: 7 additions & 77 deletions aiida_workgraph/engine/launch.py
@@ -27,6 +13,13 @@
LOGGER = AIIDA_LOGGER.getChild("engine.launch")


"""
Note: I modified the run_get_node and submit functions to include the parent_pid argument.
This is necessary for keeping track of the provenance of the processes.
"""


def run_get_node(
process_class, *args, **kwargs
) -> tuple[dict[str, t.Any] | None, "ProcessNode"]:
@@ -170,80 +177,3 @@ def submit(
time.sleep(wait_interval)

return node
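Given the note above, a sketch of how the patched `submit` could be called; that it forwards `parent_pid` as a plain keyword is an assumption based on the note:

```python
# Hypothetical usage of the patched submit(), attributing the new process to a
# running Scheduler for provenance; the parent_pid keyword is per the note above.
from aiida import load_profile, orm
from aiida.calculations.arithmetic.add import ArithmeticAddCalculation
from aiida_workgraph.engine.launch import submit

load_profile()
node = submit(
    ArithmeticAddCalculation,
    inputs={
        "x": orm.Int(1),
        "y": orm.Int(2),
        "code": orm.load_code("add@localhost"),  # assumes this code is set up
    },
    parent_pid=1234,  # pk of a running Scheduler process (illustrative)
)
```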


def start_scheduler_worker(foreground: bool = False) -> None:
"""Start a scheduler worker for the currently configured profile.
:param foreground: If true, the logging will be configured to write to stdout, otherwise it will be configured to
write to the scheduler log file.
"""
import asyncio
import signal
import sys

from aiida.common.log import configure_logging
from aiida.manage import get_config_option, get_manager
from aiida_workgraph.engine.scheduler import WorkGraphScheduler
from aiida_workgraph.engine.scheduler.client import (
get_scheduler_client,
get_scheduler,
)
from aiida.engine.processes.launcher import ProcessLauncher
from aiida.engine import persistence
from plumpy.persistence import LoadSaveContext
from aiida.engine.daemon.worker import shutdown_worker

daemon_client = get_scheduler_client()
configure_logging(
daemon=not foreground, daemon_log_file=daemon_client.daemon_log_file
)

LOGGER.debug(f"sys.executable: {sys.executable}")
LOGGER.debug(f"sys.path: {sys.path}")

try:
manager = get_manager()
# runner = manager.create_daemon_runner()
runner = manager.create_runner(broker_submit=True)
manager.set_runner(runner)
except Exception:
LOGGER.exception("daemon worker failed to start")
raise

if isinstance(rlimit := get_config_option("daemon.recursion_limit"), int):
LOGGER.info("Setting maximum recursion limit of daemon worker to %s", rlimit)
sys.setrecursionlimit(rlimit)

signals = (signal.SIGTERM, signal.SIGINT)
for s in signals:
# https://github.com/python/mypy/issues/12557
runner.loop.add_signal_handler(s, lambda s=s: asyncio.create_task(shutdown_worker(runner))) # type: ignore[misc]

try:
running_scheduler = get_scheduler()
runner_loop = runner.loop
task_receiver = ProcessLauncher(
loop=runner_loop,
persister=manager.get_persister(),
load_context=LoadSaveContext(runner=runner),
loader=persistence.get_object_loader(),
)
asyncio.run(
task_receiver._continue(
communicator=None, pid=running_scheduler, nowait=True
)
)
except ValueError:
print("Starting a new Scheduler")
process_inited = instantiate_process(runner, WorkGraphScheduler)
runner.loop.create_task(process_inited.step_until_terminated())

try:
LOGGER.info("Starting a daemon worker")
runner.start()
except SystemError as exception:
LOGGER.info("Received a SystemError: %s", exception)
runner.close()

LOGGER.info("Daemon worker started")
71 changes: 71 additions & 0 deletions aiida_workgraph/engine/override.py
@@ -0,0 +1,71 @@
from plumpy.process_comms import RemoteProcessThreadController
from typing import Any, Optional

"""
Note: I modified the create_daemon_runner function and RemoteProcessThreadController
to include the queue_name argument.
"""


def create_daemon_runner(
manager, queue_name: Optional[str] = None, loop: Optional["asyncio.AbstractEventLoop"] = None
) -> "Runner":
"""Create and return a new daemon runner.
This is used by workers when the daemon is running and in testing.
:param queue_name: name of the task queue to subscribe to; if None, the default task subscriber is used
:param loop: the (optional) asyncio event loop to use
:return: a runner configured to work in the daemon configuration
"""
from plumpy.persistence import LoadSaveContext
from aiida.engine import persistence
from aiida.engine.processes.launcher import ProcessLauncher
from plumpy.communications import convert_to_comm

runner = manager.create_runner(broker_submit=True, loop=loop)
runner_loop = runner.loop
# Listen for incoming launch requests
task_receiver = ProcessLauncher(
loop=runner_loop,
persister=manager.get_persister(),
load_context=LoadSaveContext(runner=runner),
loader=persistence.get_object_loader(),
)

def callback(_comm, msg):
print("Received message: {}".format(msg))
import asyncio

asyncio.run(task_receiver(_comm, msg))
print("task_receiver._continue done")
return True

assert runner.communicator is not None, "communicator not set for runner"
if queue_name is not None:
print("queue_name: {}".format(queue_name))
queue = runner.communicator._communicator.task_queue(
queue_name, prefetch_count=1
)
# queue.add_task_subscriber(callback)
# important to convert the callback
converted = convert_to_comm(task_receiver, runner.communicator._loop)
queue.add_task_subscriber(converted)
else:
runner.communicator.add_task_subscriber(task_receiver)
return runner


class ControllerWithQueueName(RemoteProcessThreadController):
def __init__(self, queue_name: str, **kwargs):
super().__init__(**kwargs)
self.queue_name = queue_name

def task_send(self, message: Any, no_reply: bool = False) -> Optional[Any]:
"""
Send a task to be performed using the communicator
:param message: the task message
:param no_reply: if True, this call will be fire-and-forget, i.e. no return value
:return: the response from the remote side (if no_reply=False)
"""
queue = self._communicator.task_queue(self.queue_name)
return queue.task_send(message, no_reply=no_reply)
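
Taken together, a minimal sketch of how this module is meant to be used, mirroring `start_scheduler_worker` in `scheduler/client.py` below (the queue name comes from the commit description):

```python
# Sketch: a daemon worker consuming the scheduler queue one task at a time.
from aiida.manage import get_manager
from aiida_workgraph.engine.override import create_daemon_runner

manager = get_manager()  # assumes a loaded profile
# create_daemon_runner subscribes to the queue with prefetch_count=1, so this
# runner picks up at most one Scheduler process at a time.
runner = create_daemon_runner(manager, queue_name="scheduler_queue")
runner.start()  # blocks, handling tasks from scheduler_queue
```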
106 changes: 99 additions & 7 deletions aiida_workgraph/engine/scheduler/client.py
@@ -4,8 +4,11 @@
from aiida.common.exceptions import ConfigurationError
import os
from typing import Optional
from aiida.common.log import AIIDA_LOGGER
from typing import List

WORKGRAPH_BIN = shutil.which("workgraph")
LOGGER = AIIDA_LOGGER.getChild("engine.launch")


class SchedulerClient(DaemonClient):
@@ -102,6 +105,7 @@ def cmd_start_daemon(
self.profile.name,
"scheduler",
"start-circus",
str(number_workers),
]

if foreground:
@@ -114,7 +118,7 @@ def cmd_start_daemon_worker(self) -> list[str]:
"""Return the command to start a daemon worker process."""
return [self._workgraph_bin, "-p", self.profile.name, "scheduler", "worker"]

def _start_daemon(self, foreground: bool = False) -> None:
def _start_daemon(self, number_workers: int = 1, foreground: bool = False) -> None:
"""Start the daemon.
.. warning:: This will daemonize the current process and put it in the background. It is most likely not what
@@ -149,7 +153,7 @@ def _start_daemon(self, foreground: bool = False) -> None:
{
"cmd": " ".join(self.cmd_start_daemon_worker),
"name": self.daemon_name,
"numprocesses": 1,
"numprocesses": number_workers,
"virtualenv": self.virtualenv,
"copy_env": True,
"stdout_stream": {
@@ -210,7 +214,7 @@ def get_scheduler_client(profile_name: Optional[str] = None) -> "SchedulerClient"
return SchedulerClient(profile)


def get_scheduler():
def get_scheduler() -> List[int]:
from aiida.orm import QueryBuilder
from aiida_workgraph.engine.scheduler import WorkGraphScheduler

@@ -224,7 +228,95 @@ def get_scheduler():
}
qb.append(WorkGraphScheduler, filters=filters, project=projections, tag="process")
results = qb.all()
if len(results) == 0:
raise ValueError("No scheduler found. Please start the scheduler first.")
scheduler_id = results[0][0]
return scheduler_id
pks = [r[0] for r in results]
return pks
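
`get_scheduler` now returns the pks of all running Scheduler processes instead of a single pk; minimal usage:

```python
# List the pks of every Scheduler process currently stored as running.
from aiida import load_profile
from aiida_workgraph.engine.scheduler.client import get_scheduler

load_profile()
print(get_scheduler())  # e.g. [1234, 1250] (illustrative pks)
```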


def start_scheduler_worker(foreground: bool = False) -> None:
"""Start a scheduler worker for the currently configured profile.
:param foreground: If true, the logging will be configured to write to stdout, otherwise it will be configured to
write to the scheduler log file.
"""
import asyncio
import signal
import sys
from aiida_workgraph.engine.scheduler.client import get_scheduler_client
from aiida_workgraph.engine.override import create_daemon_runner

from aiida.common.log import configure_logging
from aiida.manage import get_config_option, get_manager
from aiida.engine.daemon.worker import shutdown_worker

daemon_client = get_scheduler_client()
configure_logging(
daemon=not foreground, daemon_log_file=daemon_client.daemon_log_file
)

LOGGER.debug(f"sys.executable: {sys.executable}")
LOGGER.debug(f"sys.path: {sys.path}")

try:
manager = get_manager()
runner = create_daemon_runner(manager, queue_name="scheduler_queue")
except Exception:
LOGGER.exception("daemon worker failed to start")
raise

if isinstance(rlimit := get_config_option("daemon.recursion_limit"), int):
LOGGER.info("Setting maximum recursion limit of daemon worker to %s", rlimit)
sys.setrecursionlimit(rlimit)

signals = (signal.SIGTERM, signal.SIGINT)
for s in signals:
# https://github.com/python/mypy/issues/12557
runner.loop.add_signal_handler(s, lambda s=s: asyncio.create_task(shutdown_worker(runner))) # type: ignore[misc]

try:
LOGGER.info("Starting a daemon worker")
runner.start()
except SystemError as exception:
LOGGER.info("Received a SystemError: %s", exception)
runner.close()

LOGGER.info("Daemon worker started")


def start_scheduler_process(number: int = 1) -> None:
"""Start or restart the specified number of scheduler processes."""
from aiida_workgraph.engine.scheduler import WorkGraphScheduler
from aiida_workgraph.engine.scheduler.client import get_scheduler
from aiida_workgraph.utils.control import create_scheduler_action
from aiida_workgraph.engine.utils import instantiate_process
from aiida.manage import get_manager

try:
schedulers: List[int] = get_scheduler()
existing_schedulers_count = len(schedulers)
print(
"Found {} existing scheduler(s): {}".format(
existing_schedulers_count, " ".join([str(pk) for pk in schedulers])
)
)

count = 0

# (Re)start existing schedulers, up to the requested number
for pk in schedulers[:number]:
create_scheduler_action(pk)
print(f"Scheduler with pk {pk} running.")
count += 1
# Report any extra schedulers beyond the requested number; they are not re-queued
for pk in schedulers[number:]:
print(f"Scheduler with pk {pk} not running.")

# Start new schedulers if more are needed
runner = get_manager().get_runner()
for i in range(count, number):
process_inited = instantiate_process(runner, WorkGraphScheduler)
process_inited.runner.persister.save_checkpoint(process_inited)
process_inited.close()
create_scheduler_action(process_inited.node.pk)
print(f"Scheduler with pk {process_inited.node.pk} running.")

except Exception as e:
raise RuntimeError(f"An error occurred while starting schedulers: {e}") from e