Skip to content

Commit

Permalink
Merge pull request #909 from lisajulia/feature/add-failureFlag-to-tas…
Browse files Browse the repository at this point in the history
…klets

Feature/add failure flag to tasklets
  • Loading branch information
bska authored Jul 15, 2024
2 parents 6110b46 + d13fc3f commit 1d84a06
Show file tree
Hide file tree
Showing 4 changed files with 178 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,8 @@ opm_add_test(tutorial1

opm_add_test(test_tasklets
DRIVER_ARGS --plain)
opm_add_test(test_tasklets_failure
DRIVER_ARGS --plain)

opm_add_test(test_mpiutil
PROCESSORS 4
Expand Down
25 changes: 23 additions & 2 deletions opm/models/parallel/tasklets.hh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#ifndef EWOMS_TASKLETS_HH
#define EWOMS_TASKLETS_HH

#include <atomic>
#include <stdexcept>
#include <cassert>
#include <thread>
Expand Down Expand Up @@ -198,6 +199,11 @@ public:
}
}

bool failure() const
{
return this->failureFlag_.load(std::memory_order_relaxed);
}

/*!
* \brief Returns the index of the current worker thread.
*
Expand Down Expand Up @@ -232,9 +238,11 @@ public:
}
catch (const std::exception& e) {
std::cerr << "ERROR: Uncaught std::exception when running tasklet: " << e.what() << ". Trying to continue.\n";
failureFlag_.store(true, std::memory_order_relaxed);
}
catch (...) {
std::cerr << "ERROR: Uncaught exception (general type) when running tasklet. Trying to continue.\n";
failureFlag_.store(true, std::memory_order_relaxed);
}
}
}
Expand Down Expand Up @@ -280,6 +288,16 @@ public:

barrierTasklet->wait();
}
private:
// Atomic flag that is set to failure if any of the tasklets run by the TaskletRunner fails.
// This flag is checked before new tasklets run or get dispatched and in case it is true, the
// thread execution will be stopped / no new tasklets will be started and the program will abort.
// To set the flag and load the flag, we use std::memory_order_relaxed.
// Atomic operations tagged memory_order_relaxed are not synchronization operations; they do not
// impose an order among concurrent memory accesses. They guarantee atomicity and modification order
// consistency. This is the right choice for the setting here, since it is enough to broadcast failure
// before new run or get dispatched.
std::atomic<bool> failureFlag_ = false;

protected:
// main function of the worker thread
Expand All @@ -295,6 +313,7 @@ protected:
void run_()
{
while (true) {

// wait until tasklets have been pushed to the queue. first we need to lock
// mutex for access to taskletQueue_
std::unique_lock<std::mutex> lock(taskletQueueMutex_);
Expand Down Expand Up @@ -330,10 +349,12 @@ protected:
tasklet->run();
}
catch (const std::exception& e) {
std::cerr << "ERROR: Uncaught std::exception when running tasklet: " << e.what() << ". Trying to continue.\n";
std::cerr << "ERROR: Uncaught std::exception when running tasklet: " << e.what() << ".\n";
failureFlag_.store(true, std::memory_order_relaxed);
}
catch (...) {
std::cerr << "ERROR: Uncaught exception when running tasklet. Trying to continue.\n";
std::cerr << "ERROR: Uncaught exception when running tasklet.\n";
failureFlag_.store(true, std::memory_order_relaxed);
}
}
}
Expand Down
13 changes: 5 additions & 8 deletions tests/test_tasklets.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@

std::mutex outputMutex;

Opm::TaskletRunner *runner;
// The runner is created on the heap for the assertion and outputs in the run function of the tasklets.
std::unique_ptr<Opm::TaskletRunner> runner{};

class SleepTasklet : public Opm::TaskletInterface
{
Expand All @@ -51,9 +52,8 @@ class SleepTasklet : public Opm::TaskletInterface
{
assert(0 <= runner->workerThreadIndex() && runner->workerThreadIndex() < runner->numWorkerThreads());
std::this_thread::sleep_for(std::chrono::milliseconds(mseconds_));
outputMutex.lock();
std::lock_guard<std::mutex> guard(outputMutex);
std::cout << "Sleep tasklet " << n_ << " of " << mseconds_ << " ms completed by worker thread " << runner->workerThreadIndex() << std::endl;
outputMutex.unlock();
}

private:
Expand All @@ -67,17 +67,16 @@ void sleepAndPrintFunction()
{
int ms = 100;
std::this_thread::sleep_for(std::chrono::milliseconds(ms));
outputMutex.lock();
std::lock_guard<std::mutex> guard(outputMutex);
std::cout << "Sleep completed by worker thread " << runner->workerThreadIndex() << std::endl;
outputMutex.unlock();
}

int SleepTasklet::numInstantiated_ = 0;

int main()
{
int numWorkers = 2;
runner = new Opm::TaskletRunner(numWorkers);
runner = std::make_unique<Opm::TaskletRunner>(numWorkers);

// the master thread is not a worker thread
assert(runner->workerThreadIndex() < 0);
Expand All @@ -96,8 +95,6 @@ int main()
runner->dispatchFunction(sleepAndPrintFunction);
runner->dispatchFunction(sleepAndPrintFunction, /*numInvokations=*/6);

delete runner;

return 0;
}

148 changes: 148 additions & 0 deletions tests/test_tasklets_failure.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
// vi: set et ts=4 sw=4 sts=4:
/*
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
Consult the COPYING file in the top-level source directory of this
module for the precise wording of the license and the list of
copyright holders.
*/
/*!
* \file
*
* \brief This file serves as an example of how to use the tasklet mechanism for
* asynchronous work, especially for tasklets that fail.
*/
#define BOOST_TEST_MODULE TASKLETS_FAILURE

//#include <boost/test/unit_test.hpp>
#include <boost/test/included/unit_test.hpp>
#include <chrono>
#include <iostream>
#include <mutex>
#include <thread>
#include <sys/wait.h>
#include <unistd.h>
#include <cassert>

#include "config.h"
#include <opm/models/parallel/tasklets.hh>

std::mutex outputMutex;

// The runner is created on the heap for the assertion and outputs in the run function of the tasklets.
std::unique_ptr<Opm::TaskletRunner> runner{};

class SleepTasklet : public Opm::TaskletInterface
{
public:
SleepTasklet(int mseconds, int id)
: mseconds_(mseconds),
id_(id)
{}

void run() override
{
assert(0 <= runner->workerThreadIndex() && runner->workerThreadIndex() < runner->numWorkerThreads());
std::cout << "Sleep tasklet " << id_ << " of " << mseconds_ << " ms starting sleep on worker thread " << runner->workerThreadIndex() << std::endl;
std::this_thread::sleep_for(std::chrono::milliseconds(mseconds_));
std::lock_guard<std::mutex> guard(outputMutex);
std::cout << "Sleep tasklet " << id_ << " of " << mseconds_ << " ms completed by worker thread " << runner->workerThreadIndex() << std::endl;
}

private:
int mseconds_;
int id_;
};

class FailingSleepTasklet : public Opm::TaskletInterface
{
public:
FailingSleepTasklet(int mseconds)
: mseconds_(mseconds)
{}
void run() override
{
std::this_thread::sleep_for(std::chrono::milliseconds(mseconds_));
std::lock_guard<std::mutex> guard(outputMutex);
std::cout << "Failing sleep tasklet of " << mseconds_ << " ms failing now, on work thread " << runner->workerThreadIndex() << std::endl;
throw std::logic_error("Intentional failure for testing");
}

private:
int mseconds_;
};

void execute () {
int numWorkers = 2;
runner = std::make_unique<Opm::TaskletRunner>(numWorkers);

// the master thread is not a worker thread
BOOST_REQUIRE_LT(runner->workerThreadIndex(), 0);
BOOST_REQUIRE_EQUAL(runner->numWorkerThreads(), numWorkers);

// Dispatch some successful tasklets
for (int i = 0; i < 5; ++i) {
runner->barrier();

if (runner->failure()) {
exit(EXIT_FAILURE);
}
auto st = std::make_shared<SleepTasklet>(10,i);
runner->dispatch(st);
}

runner->barrier();
if (runner->failure()) {
exit(EXIT_FAILURE);
}
// Dispatch a failing tasklet
auto failingSleepTasklet = std::make_shared<FailingSleepTasklet>(100);
runner->dispatch(failingSleepTasklet);

// Dispatch more successful tasklets
for (int i = 5; i < 10; ++i) {
runner->barrier();

if (runner->failure()) {
exit(EXIT_FAILURE);
}
auto st = std::make_shared<SleepTasklet>(10,i);
runner->dispatch(st);
}

std::cout << "before barrier" << std::endl;
runner->barrier();
}
BOOST_AUTO_TEST_SUITE(Tasklets)
BOOST_AUTO_TEST_CASE(TASKLETS_FAILURE) {
pid_t pid = fork(); // Create a new process, such that this child process can call exit(EXIT_FAILURE)
if (pid == -1) {
BOOST_FAIL("Fork failed");
} else if (pid == 0) {
// Child process
execute();
_exit(0); // Should never reach here
} else {
// Parent process
std::cout << "Checking failure of child process with parent process, process id " << pid << std::endl;
int status;
waitpid(pid, &status, 0);
BOOST_CHECK(WIFEXITED(status)); // Check if the child process exited
BOOST_CHECK_EQUAL(WEXITSTATUS(status), EXIT_FAILURE); // Check if the exit status is EXIT_FAILURE
}
}
BOOST_AUTO_TEST_SUITE_END()

0 comments on commit 1d84a06

Please sign in to comment.