Hyperparameter optimization with Optuna (#24)
* Main file for hyperparameter tuning:
1. Running with a .yaml config file leads to the usual execution.
2. Running with the --optimized flag tunes hyperparameters with Optuna. No input file is needed.

* Module for hyperparameter tuning.
1. objective.py defines the objective function for tuning. It sets up the FastVPINNs objects and returns the metric used for tuning.
2. optuna_tuner.py manages the hyperparameter tuning process.

* Black formatting for hyperparameter optimization files.

* Black formatting for hyperparameter tuning files.

* Objective function that accepts the number of training iterations as an argument.

* Changes to main file to incorporate hyperparameter tuning using Optuna:
1. Accepts the number of trials and the number of training iterations per trial as arguments (see the sketch just below this commit message).

* Changes to geometry file:
1. Accepts an is_optimized argument, set to True when hyperparameter optimization with Optuna is being used.
2. If is_optimized is True, the geometry module does not write out the test mesh and VTK file for each trial.
3. Backward compatibility: the default value of is_optimized is False, so existing code with a config file works as is.

* Parallel runs with the Optuna tuner:
1. Creates an SQLite database if it doesn't exist; it can be used to resume stalled runs or for parallel execution.
2. Lists the number of available GPUs and divides the jobs among them.

* Files for hyperparameter tuning tests

* Black formatting for main file.

* Black formatting.
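
As referenced above, here is a minimal sketch of how the --optimized entry point could be wired up in the main file. The main file itself is not expanded in this diff, so the argument names and the run_tuning helper are illustrative assumptions, not the committed code.

# Sketch only: the usual YAML-driven run vs. the --optimized Optuna run.
# Argument names and the run_tuning helper are hypothetical.
import argparse


def main():
    parser = argparse.ArgumentParser(description="FastVPINNs driver")
    parser.add_argument("config", nargs="?", default=None,
                        help="Path to a .yaml config file for the usual execution")
    parser.add_argument("--optimized", action="store_true",
                        help="Tune hyperparameters with Optuna (no input file needed)")
    parser.add_argument("--n_trials", type=int, default=100,
                        help="Number of Optuna trials")
    parser.add_argument("--num_epochs", type=int, default=5000,
                        help="Number of training iterations per trial")
    args = parser.parse_args()

    if args.optimized:
        # hypothetical helper from the tuning module added in this commit
        from fastvpinns.hyperparameter_tuning.optuna_tuner import run_tuning
        run_tuning(n_trials=args.n_trials, num_epochs=args.num_epochs)
    else:
        ...  # existing config-file-driven execution (unchanged by this commit)


if __name__ == "__main__":
    main()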
divijghose authored Sep 20, 2024
1 parent a41aadc commit a2fa2da
Showing 12 changed files with 918 additions and 29 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -30,6 +30,8 @@ examples/**/output/
examples/**/output_*
examples/**/__pycache__/

output/*


# pytest
*.vtk
@@ -41,4 +43,7 @@ sensor_points.png
# examples/**/*.png # as the README.md uses png image files
examples/**/sensor_points.png

# Optuna files
*.sqlite3
*.db

91 changes: 67 additions & 24 deletions docker_initialise.py
@@ -2,8 +2,9 @@
import subprocess
import re


def get_version_from_toml():
try:
try:
with open("pyproject.toml", "r") as file:
content = file.read()
version_match = re.search(r'version = "([^"]+)"', content)
@@ -12,19 +13,40 @@ def get_version_from_toml():
except Exception:
return "Unknown"


def check_tensorflow():
"""
To check tensorflow version and GPU Support and number of GPUs available
"""
tensor_flow_version = "Not Found"
gpu_support = "Not Found"
number_of_gpus = "Not Found"

tensorflow_version = subprocess.run(["python3", "-c", "import tensorflow as tf; print(tf.__version__)"], capture_output=True, text=True)
gpu_support = subprocess.run(["python3", "-c", "import tensorflow as tf; print(tf.test.is_gpu_available())"], capture_output=True, text=True)
number_of_gpus = subprocess.run(["python3", "-c", "import tensorflow as tf; print(len(tf.config.experimental.list_physical_devices('GPU')))"], capture_output=True, text=True)

return tensorflow_version.stdout.strip(), gpu_support.stdout.strip(), number_of_gpus.stdout.strip()
tensorflow_version = subprocess.run(
["python3", "-c", "import tensorflow as tf; print(tf.__version__)"],
capture_output=True,
text=True,
)
gpu_support = subprocess.run(
["python3", "-c", "import tensorflow as tf; print(tf.test.is_gpu_available())"],
capture_output=True,
text=True,
)
number_of_gpus = subprocess.run(
[
"python3",
"-c",
"import tensorflow as tf; print(len(tf.config.experimental.list_physical_devices('GPU')))",
],
capture_output=True,
text=True,
)

return (
tensorflow_version.stdout.strip(),
gpu_support.stdout.strip(),
number_of_gpus.stdout.strip(),
)


def get_cuda_cudnn_nvidia_versions():
@@ -34,65 +56,86 @@ def get_cuda_cudnn_nvidia_versions():
# Get CUDA version
try:
cuda_version = subprocess.run(['nvcc', '--version'], capture_output=True, text=True)
cuda_version = re.search(r'release (\d+\.\d+)', cuda_version.stdout).group(1) if cuda_version.stdout else 'Not found'
cuda_version = (
re.search(r'release (\d+\.\d+)', cuda_version.stdout).group(1)
if cuda_version.stdout
else 'Not found'
)
except Exception:
pass

# Get cuDNN version
try:
with open('/usr/local/cuda/include/cudnn_version.h', 'r') as f:
cudnn_version = f.read()
cudnn_version = re.search(r'#define CUDNN_MAJOR (\d+)\n#define CUDNN_MINOR (\d+)\n#define CUDNN_PATCHLEVEL (\d+)', cudnn_version)
cudnn_version = f"{cudnn_version.group(1)}.{cudnn_version.group(2)}.{cudnn_version.group(3)}" if cudnn_version else 'Not found'
cudnn_version = re.search(
r'#define CUDNN_MAJOR (\d+)\n#define CUDNN_MINOR (\d+)\n#define CUDNN_PATCHLEVEL (\d+)',
cudnn_version,
)
cudnn_version = (
f"{cudnn_version.group(1)}.{cudnn_version.group(2)}.{cudnn_version.group(3)}"
if cudnn_version
else 'Not found'
)
except Exception:
cudnn_version = 'Not found'

# Get NVIDIA driver version
nvidia_driver_version = 'Not found'
try:
nvidia_driver_version = subprocess.run(['nvidia-smi', '--query-gpu=driver_version', '--format=csv,noheader'], capture_output=True, text=True)
nvidia_driver_version = nvidia_driver_version.stdout.strip() if nvidia_driver_version.stdout else 'Not found'
nvidia_driver_version = subprocess.run(
['nvidia-smi', '--query-gpu=driver_version', '--format=csv,noheader'],
capture_output=True,
text=True,
)
nvidia_driver_version = (
nvidia_driver_version.stdout.strip() if nvidia_driver_version.stdout else 'Not found'
)
except Exception:
pass

return cuda_version, cudnn_version, nvidia_driver_version.split('\n')[0]



def main():
version = get_version_from_toml()
# run the ascii-image-converter in subprocess
subprocess.run(["ascii-image-converter", "Fastvpinns_logo.png", "--braille", "-d" , "70,10"])
subprocess.run(["ascii-image-converter", "Fastvpinns_logo.png", "--braille", "-d", "70,10"])
print("**********************************************************")
print(f"Official Docker Image for FastVPINNs - Version {version}")
print(f"URL: https://cmgcds.github.io/fastvpinns/")
print("Docker Image Author : Thivin Anandh")
print("**********************************************************\n")

# Execute any additional command passed to the Docker container
# Should be a security risk, so commented out
# if len(sys.argv) > 1:
# subprocess.run(sys.argv[1:])

# obtain the cuda versions
cuda_version, cudnn_version, nvidia_driver_version = get_cuda_cudnn_nvidia_versions()
if cuda_version != 'Not found' and nvidia_driver_version != 'Not found':
print(f"\033[92mGPU Checks Passed - GPU Acceleration is Available \033[0m")
else :
else:
print(f"\033[91mGPU Checks Failed - Execution is available on CPU only\033[0m")


# get tensorflow versions
tensor_flow_version, gpu_support, number_of_gpus = check_tensorflow()

column_width = 10
print("-----------------------------------------------------------------------------------------------------")
print(f"| CUDA Version: {cuda_version:<{column_width}} || cuDNN Version: {cudnn_version:<{column_width}} || NVIDIA Driver Version: {nvidia_driver_version:<{column_width}} |")
print(f"| Tensorflow Version: {tensor_flow_version:<{column_width}} || GPU Support: {gpu_support:<{column_width}} || Number of GPUs: {number_of_gpus:<{column_width}} |")
print("-----------------------------------------------------------------------------------------------------")

print(
"-----------------------------------------------------------------------------------------------------"
)
print(
f"| CUDA Version: {cuda_version:<{column_width}} || cuDNN Version: {cudnn_version:<{column_width}} || NVIDIA Driver Version: {nvidia_driver_version:<{column_width}} |"
)
print(
f"| Tensorflow Version: {tensor_flow_version:<{column_width}} || GPU Support: {gpu_support:<{column_width}} || Number of GPUs: {number_of_gpus:<{column_width}} |"
)
print(
"-----------------------------------------------------------------------------------------------------"
)



if __name__ == "__main__":
main()
main()
5 changes: 3 additions & 2 deletions docs/conf.py
@@ -26,6 +26,7 @@

import os
import sys

sys.path.insert(0, os.path.abspath('../fastvpinns'))
sys.path.insert(0, os.path.abspath('../../fastvpinns'))
import fastvpinns
@@ -46,7 +47,7 @@
'sphinx.ext.viewcode',
'sphinx.ext.mathjax',
'sphinx.ext.intersphinx',
'sphinx_copybutton',
'sphinx_copybutton',
]

# Add any paths that contain templates here, relative to this directory.
@@ -84,4 +85,4 @@
'logo_only': True,
'display_version': True,
'prev_next_buttons_location': 'bottom',
}
}
@@ -48,7 +48,7 @@ def rhs(x, y):
omegaX = 4.0 * np.pi
omegaY = 4.0 * np.pi
f_temp = -2.0 * (omegaX**2) * (np.sin(omegaX * x) * np.sin(omegaY * y))

return f_temp


@@ -57,7 +57,7 @@ def exact_solution(x, y):
This function will return the exact solution at a given point
"""
# If the exact Solution does not have an analytical expression, leave the value as 0(zero)
# it can be set using `np.ones_like(x) * 0.0` and then ignore the errors and the error plots generated.
# it can be set using `np.ones_like(x) * 0.0` and then ignore the errors and the error plots generated.

omegaX = 4.0 * np.pi
omegaY = 4.0 * np.pi
5 changes: 4 additions & 1 deletion fastvpinns/Geometry/geometry_2d.py
@@ -43,6 +43,7 @@ def __init__(
n_test_points_x: int,
n_test_points_y: int,
output_folder: str,
is_optimized: bool = False,
):
"""
Constructor for Geometry_2D class.
@@ -64,6 +65,7 @@ def __init__(
self.n_test_points_x = n_test_points_x
self.n_test_points_y = n_test_points_y
self.output_folder = output_folder
self.is_optimized = is_optimized

if self.mesh_generation_method not in ["internal", "external"]:
print(
@@ -338,7 +340,8 @@ def _temp_bd_func(start, end, num_pts):
self.bd_dict = bd_points

# generate vtk
self.generate_vtk_for_test()
if not self.is_optimized:
self.generate_vtk_for_test()

return self.cell_points, self.bd_dict
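
For reference, a brief usage sketch of the new flag; the first call mirrors the one in objective.py further below, and the output folder names are placeholders.

from fastvpinns.Geometry.geometry_2d import Geometry_2D

# During Optuna trials: skip writing the test-mesh VTK file for every trial.
domain = Geometry_2D("quadrilateral", "internal", 100, 100, "output_temp", is_optimized=True)

# Existing code is unaffected: is_optimized defaults to False,
# so the test-mesh VTK file is still generated as before.
domain = Geometry_2D("quadrilateral", "internal", 100, 100, "output")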

Empty file.
145 changes: 145 additions & 0 deletions fastvpinns/hyperparameter_tuning/objective.py
@@ -0,0 +1,145 @@
"""
This file contains the objective function for hyperparameter tuning of the FastVPINN model.
The objective function defines the search space for hyperparameters and evaluates the model's
performance using the suggested hyperparameter values. It sets up the geometry, finite element
space, data handler, and model based on the trial's suggestions. The model is then trained for
a fixed number of epochs, and its performance is evaluated using the relative L2 error.
Author: Divij Ghose
Changelog: 9/9/24 - Initial implementation of the objective function for hyperparameter tuning
Known issues: None
Dependencies: optuna, tensorflow, fastvpinns
"""

# objective.py
import optuna
import tensorflow as tf
import os

from fastvpinns.Geometry.geometry_2d import Geometry_2D
from fastvpinns.FE.fespace2d import Fespace2D
from fastvpinns.data.datahandler2d import DataHandler2D
from fastvpinns.model.model import DenseModel
from fastvpinns.physics.poisson2d import pde_loss_poisson
from fastvpinns.utils.compute_utils import compute_errors_combined
from sin_cos import * # Import your example-specific functions


def objective(trial, num_epochs):
# Suggest values for hyperparameters
config = {
"geometry": {
"internal_mesh_params": {
"n_cells_x": trial.suggest_int("n_cells_x", 2, 10),
"n_cells_y": trial.suggest_int("n_cells_y", 2, 10),
"n_boundary_points": trial.suggest_int("n_boundary_points", 100, 1000),
}
},
"fe": {
"fe_order": trial.suggest_int("fe_order", 2, 8),
"fe_type": trial.suggest_categorical("fe_type", ["legendre", "jacobi"]),
"quad_order": trial.suggest_int("quad_order", 3, 15),
"quad_type": trial.suggest_categorical("quad_type", ["gauss-legendre", "gauss-jacobi"]),
},
"model": {
"model_architecture": [2]
+ [
trial.suggest_int(f"layer_{i}", 10, 100)
for i in range(trial.suggest_int("n_layers", 1, 5))
]
+ [1],
"activation": "tanh",
"use_attention": False,
"learning_rate": {
"initial_learning_rate": trial.suggest_loguniform(
"initial_learning_rate", 1e-5, 1e-2
),
"use_lr_scheduler": True,
"decay_steps": trial.suggest_int("decay_steps", 1000, 10000),
"decay_rate": trial.suggest_uniform("decay_rate", 0.9, 0.99),
},
},
"pde": {"beta": 10},
}

# Set up your model and training process using the suggested hyperparameters

output_temp_dir = "output_temp"
if not os.path.exists(output_temp_dir):
os.makedirs(output_temp_dir)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
try:
tf.config.experimental.set_memory_growth(gpus[0], True)
except RuntimeError as e:
print(e)

domain = Geometry_2D("quadrilateral", "internal", 100, 100, output_temp_dir, is_optimized=True)
cells, boundary_points = domain.generate_quad_mesh_internal(
x_limits=[0, 1],
y_limits=[0, 1],
n_cells_x=config["geometry"]["internal_mesh_params"]["n_cells_x"],
n_cells_y=config["geometry"]["internal_mesh_params"]["n_cells_y"],
num_boundary_points=config["geometry"]["internal_mesh_params"]["n_boundary_points"],
)

fespace = Fespace2D(
mesh=domain.mesh,
cells=cells,
boundary_points=boundary_points,
cell_type=domain.mesh_type,
fe_order=config["fe"]["fe_order"],
fe_type=config["fe"]["fe_type"],
quad_order=config["fe"]["quad_order"],
quad_type=config["fe"]["quad_type"],
fe_transformation_type="bilinear",
bound_function_dict=get_boundary_function_dict(),
bound_condition_dict=get_bound_cond_dict(),
forcing_function=rhs,
output_path="output_temp",
generate_mesh_plot=False,
)

datahandler = DataHandler2D(fespace, domain, dtype=tf.float32)

params_dict = {"n_cells": fespace.n_cells}
train_dirichlet_input, train_dirichlet_output = datahandler.get_dirichlet_input()
bilinear_params_dict = datahandler.get_bilinear_params_dict_as_tensors(get_bilinear_params_dict)

model = DenseModel(
layer_dims=config["model"]["model_architecture"],
learning_rate_dict=config["model"]["learning_rate"],
params_dict=params_dict,
loss_function=pde_loss_poisson,
input_tensors_list=[datahandler.x_pde_list, train_dirichlet_input, train_dirichlet_output],
orig_factor_matrices=[
datahandler.shape_val_mat_list,
datahandler.grad_x_mat_list,
datahandler.grad_y_mat_list,
],
force_function_list=datahandler.forcing_function_list,
tensor_dtype=tf.float32,
use_attention=config["model"]["use_attention"],
activation=config["model"]["activation"],
hessian=False,
)

# Train the model for a fixed number of epochs
beta = tf.constant(config["pde"]["beta"], dtype=tf.float32)

for epoch in range(num_epochs):
loss = model.train_step(beta=beta, bilinear_params_dict=bilinear_params_dict)
# remove output_temp directory using os
test_points = domain.get_test_points()
y_exact = exact_solution(test_points[:, 0], test_points[:, 1])
y_pred = model(test_points).numpy().reshape(-1)

_, _, l2_error_relative, _, _, _ = compute_errors_combined(y_exact, y_pred)

return l2_error_relative # Return the relative L2 error as the objective to minimize
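
The commit message describes optuna_tuner.py as creating an SQLite database (reusable for stalled or parallel runs) and dividing trials across the available GPUs. That file is not expanded in this view, so the following is only a sketch of how this objective could be driven with standard Optuna and TensorFlow calls; the study name, database path, trial count, and epoch count are assumptions rather than values from the commit.

# Sketch only: driving the objective above with an Optuna study backed by SQLite.
# Names and counts are illustrative assumptions, not the committed optuna_tuner.py.
import optuna
import tensorflow as tf

from fastvpinns.hyperparameter_tuning.objective import objective

NUM_EPOCHS = 5000  # per-trial training iterations; the commit exposes this as an argument

# SQLite storage is created if it does not exist, so a stalled study can be resumed
# and several workers can share the same set of trials.
study = optuna.create_study(
    study_name="fastvpinns_tuning",
    storage="sqlite:///fastvpinns_tuning.db",
    load_if_exists=True,
    direction="minimize",  # the objective returns the relative L2 error
)

# One simple way to divide the jobs: one worker per visible GPU (at least one overall).
n_gpus = len(tf.config.list_physical_devices("GPU"))
study.optimize(
    lambda trial: objective(trial, NUM_EPOCHS),
    n_trials=100,
    n_jobs=max(n_gpus, 1),
)

print("Best trial parameters:", study.best_trial.params)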