Hyperparameter optimization with Optuna (#24)
* Main file for hyperparameter tuning:
1. Running with a .yaml config file leads to the usual execution.
2. Running with the --optimized flag tunes hyperparameters with Optuna. No input file is needed.

* Module for hyperparameter tuning.
1. objective.py defines the objective function for tuning. It sets up the FastVPINNs objects and returns the metric used for tuning.
2. optuna_tuner.py manages the hyperparameter tuning process.

* Black formatting for hyperparameter optimization files.

* Black formatting for hyperparameter tuning files.

* Objective function that accepts the number of training iterations as an argument.

* Changes to main file to incorporate hyperparameter tuning using Optuna:
1. Accepts the number of trials and the number of training iterations per trial as arguments (see the sketch just below this commit message).

* Changes to geometry file:
1. Accepts an is_optimized argument, set to True when hyperparameter optimization with Optuna is being used.
2. If is_optimized is True, the geometry module does not write out the test mesh and VTK file for each trial.
3. Backward compatibility: the default value of is_optimized is False, so existing code with a config file works as is.

* Parallel runs with the Optuna tuner:
1. Creates an SQLite database if it doesn't exist; it can be used to resume stalled runs or for parallel execution.
2. Lists the number of available GPUs and divides the jobs among them.

* Files for hyperparameter tuning tests

* Black formatting for main file.

* Black formatting.
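
As referenced above, here is a minimal sketch of how the --optimized entry point could be wired up in the main file. The main file itself is not expanded in this diff, so the argument names and the run_tuning helper are illustrative assumptions, not the committed code.

# Sketch only: the usual YAML-driven run vs. the --optimized Optuna run.
# Argument names and the run_tuning helper are hypothetical.
import argparse


def main():
    parser = argparse.ArgumentParser(description="FastVPINNs driver")
    parser.add_argument("config", nargs="?", default=None,
                        help="Path to a .yaml config file for the usual execution")
    parser.add_argument("--optimized", action="store_true",
                        help="Tune hyperparameters with Optuna (no input file needed)")
    parser.add_argument("--n_trials", type=int, default=100,
                        help="Number of Optuna trials")
    parser.add_argument("--num_epochs", type=int, default=5000,
                        help="Number of training iterations per trial")
    args = parser.parse_args()

    if args.optimized:
        # hypothetical helper from the tuning module added in this commit
        from fastvpinns.hyperparameter_tuning.optuna_tuner import run_tuning
        run_tuning(n_trials=args.n_trials, num_epochs=args.num_epochs)
    else:
        ...  # existing config-file-driven execution (unchanged by this commit)


if __name__ == "__main__":
    main()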
divijghose authored Sep 20, 2024
1 parent a41aadc commit a2fa2da
Showing 12 changed files with 918 additions and 29 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -30,6 +30,8 @@ examples/**/output/
examples/**/output_*
examples/**/__pycache__/

output/*


# pytest
*.vtk
@@ -41,4 +43,7 @@ sensor_points.png
# examples/**/*.png # as the README.md uses png image files
examples/**/sensor_points.png

# Optuna files
*.sqlite3
*.db

91 changes: 67 additions & 24 deletions docker_initialise.py
@@ -2,8 +2,9 @@
import subprocess
import re


def get_version_from_toml():
try:
try:
with open("pyproject.toml", "r") as file:
content = file.read()
version_match = re.search(r'version = "([^"]+)"', content)
@@ -12,19 +13,40 @@ def get_version_from_toml():
except Exception:
return "Unknown"


def check_tensorflow():
"""
To check tensorflow version and GPU Support and number of GPUs available
"""
tensor_flow_version = "Not Found"
gpu_support = "Not Found"
number_of_gpus = "Not Found"

tensorflow_version = subprocess.run(["python3", "-c", "import tensorflow as tf; print(tf.__version__)"], capture_output=True, text=True)
gpu_support = subprocess.run(["python3", "-c", "import tensorflow as tf; print(tf.test.is_gpu_available())"], capture_output=True, text=True)
number_of_gpus = subprocess.run(["python3", "-c", "import tensorflow as tf; print(len(tf.config.experimental.list_physical_devices('GPU')))"], capture_output=True, text=True)

return tensorflow_version.stdout.strip(), gpu_support.stdout.strip(), number_of_gpus.stdout.strip()
tensorflow_version = subprocess.run(
["python3", "-c", "import tensorflow as tf; print(tf.__version__)"],
capture_output=True,
text=True,
)
gpu_support = subprocess.run(
["python3", "-c", "import tensorflow as tf; print(tf.test.is_gpu_available())"],
capture_output=True,
text=True,
)
number_of_gpus = subprocess.run(
[
"python3",
"-c",
"import tensorflow as tf; print(len(tf.config.experimental.list_physical_devices('GPU')))",
],
capture_output=True,
text=True,
)

return (
tensorflow_version.stdout.strip(),
gpu_support.stdout.strip(),
number_of_gpus.stdout.strip(),
)


def get_cuda_cudnn_nvidia_versions():
@@ -34,65 +56,86 @@ def get_cuda_cudnn_nvidia_versions():
# Get CUDA version
try:
cuda_version = subprocess.run(['nvcc', '--version'], capture_output=True, text=True)
cuda_version = re.search(r'release (\d+\.\d+)', cuda_version.stdout).group(1) if cuda_version.stdout else 'Not found'
cuda_version = (
re.search(r'release (\d+\.\d+)', cuda_version.stdout).group(1)
if cuda_version.stdout
else 'Not found'
)
except Exception:
pass

# Get cuDNN version
try:
with open('/usr/local/cuda/include/cudnn_version.h', 'r') as f:
cudnn_version = f.read()
cudnn_version = re.search(r'#define CUDNN_MAJOR (\d+)\n#define CUDNN_MINOR (\d+)\n#define CUDNN_PATCHLEVEL (\d+)', cudnn_version)
cudnn_version = f"{cudnn_version.group(1)}.{cudnn_version.group(2)}.{cudnn_version.group(3)}" if cudnn_version else 'Not found'
cudnn_version = re.search(
r'#define CUDNN_MAJOR (\d+)\n#define CUDNN_MINOR (\d+)\n#define CUDNN_PATCHLEVEL (\d+)',
cudnn_version,
)
cudnn_version = (
f"{cudnn_version.group(1)}.{cudnn_version.group(2)}.{cudnn_version.group(3)}"
if cudnn_version
else 'Not found'
)
except Exception:
cudnn_version = 'Not found'

# Get NVIDIA driver version
nvidia_driver_version = 'Not found'
try:
nvidia_driver_version = subprocess.run(['nvidia-smi', '--query-gpu=driver_version', '--format=csv,noheader'], capture_output=True, text=True)
nvidia_driver_version = nvidia_driver_version.stdout.strip() if nvidia_driver_version.stdout else 'Not found'
nvidia_driver_version = subprocess.run(
['nvidia-smi', '--query-gpu=driver_version', '--format=csv,noheader'],
capture_output=True,
text=True,
)
nvidia_driver_version = (
nvidia_driver_version.stdout.strip() if nvidia_driver_version.stdout else 'Not found'
)
except Exception:
pass

return cuda_version, cudnn_version, nvidia_driver_version.split('\n')[0]



def main():
version = get_version_from_toml()
# run the ascii-image-converter in subprocess
subprocess.run(["ascii-image-converter", "Fastvpinns_logo.png", "--braille", "-d" , "70,10"])
subprocess.run(["ascii-image-converter", "Fastvpinns_logo.png", "--braille", "-d", "70,10"])
print("**********************************************************")
print(f"Official Docker Image for FastVPINNs - Version {version}")
print(f"URL: https://cmgcds.github.io/fastvpinns/")
print("Docker Image Author : Thivin Anandh")
print("**********************************************************\n")

# Execute any additional command passed to the Docker container
# Should be a security risk, so commented out
# if len(sys.argv) > 1:
# subprocess.run(sys.argv[1:])

# obtain the cuda versions
cuda_version, cudnn_version, nvidia_driver_version = get_cuda_cudnn_nvidia_versions()
if cuda_version != 'Not found' and nvidia_driver_version != 'Not found':
print(f"\033[92mGPU Checks Passed - GPU Acceleration is Available \033[0m")
else :
else:
print(f"\033[91mGPU Checks Failed - Execution is available on CPU only\033[0m")


# get tensorflow versions
tensor_flow_version, gpu_support, number_of_gpus = check_tensorflow()

column_width = 10
print("-----------------------------------------------------------------------------------------------------")
print(f"| CUDA Version: {cuda_version:<{column_width}} || cuDNN Version: {cudnn_version:<{column_width}} || NVIDIA Driver Version: {nvidia_driver_version:<{column_width}} |")
print(f"| Tensorflow Version: {tensor_flow_version:<{column_width}} || GPU Support: {gpu_support:<{column_width}} || Number of GPUs: {number_of_gpus:<{column_width}} |")
print("-----------------------------------------------------------------------------------------------------")

print(
"-----------------------------------------------------------------------------------------------------"
)
print(
f"| CUDA Version: {cuda_version:<{column_width}} || cuDNN Version: {cudnn_version:<{column_width}} || NVIDIA Driver Version: {nvidia_driver_version:<{column_width}} |"
)
print(
f"| Tensorflow Version: {tensor_flow_version:<{column_width}} || GPU Support: {gpu_support:<{column_width}} || Number of GPUs: {number_of_gpus:<{column_width}} |"
)
print(
"-----------------------------------------------------------------------------------------------------"
)



if __name__ == "__main__":
main()
main()
5 changes: 3 additions & 2 deletions docs/conf.py
@@ -26,6 +26,7 @@

import os
import sys

sys.path.insert(0, os.path.abspath('../fastvpinns'))
sys.path.insert(0, os.path.abspath('../../fastvpinns'))
import fastvpinns
@@ -46,7 +47,7 @@
'sphinx.ext.viewcode',
'sphinx.ext.mathjax',
'sphinx.ext.intersphinx',
'sphinx_copybutton',
'sphinx_copybutton',
]

# Add any paths that contain templates here, relative to this directory.
@@ -84,4 +85,4 @@
'logo_only': True,
'display_version': True,
'prev_next_buttons_location': 'bottom',
}
}
@@ -48,7 +48,7 @@ def rhs(x, y):
omegaX = 4.0 * np.pi
omegaY = 4.0 * np.pi
f_temp = -2.0 * (omegaX**2) * (np.sin(omegaX * x) * np.sin(omegaY * y))

return f_temp


@@ -57,7 +57,7 @@ def exact_solution(x, y):
This function will return the exact solution at a given point
"""
# If the exact Solution does not have an analytical expression, leave the value as 0(zero)
# it can be set using `np.ones_like(x) * 0.0` and then ignore the errors and the error plots generated.
# it can be set using `np.ones_like(x) * 0.0` and then ignore the errors and the error plots generated.

omegaX = 4.0 * np.pi
omegaY = 4.0 * np.pi
5 changes: 4 additions & 1 deletion fastvpinns/Geometry/geometry_2d.py
@@ -43,6 +43,7 @@ def __init__(
n_test_points_x: int,
n_test_points_y: int,
output_folder: str,
is_optimized: bool = False,
):
"""
Constructor for Geometry_2D class.
@@ -64,6 +65,7 @@ def __init__(
self.n_test_points_x = n_test_points_x
self.n_test_points_y = n_test_points_y
self.output_folder = output_folder
self.is_optimized = is_optimized

if self.mesh_generation_method not in ["internal", "external"]:
print(
@@ -338,7 +340,8 @@ def _temp_bd_func(start, end, num_pts):
self.bd_dict = bd_points

# generate vtk
self.generate_vtk_for_test()
if not self.is_optimized:
self.generate_vtk_for_test()

return self.cell_points, self.bd_dict
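
For reference, a brief usage sketch of the new flag; the first call mirrors the one in objective.py further below, and the output folder names are placeholders.

from fastvpinns.Geometry.geometry_2d import Geometry_2D

# During Optuna trials: skip writing the test-mesh VTK file for every trial.
domain = Geometry_2D("quadrilateral", "internal", 100, 100, "output_temp", is_optimized=True)

# Existing code is unaffected: is_optimized defaults to False,
# so the test-mesh VTK file is still generated as before.
domain = Geometry_2D("quadrilateral", "internal", 100, 100, "output")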

Empty file.
145 changes: 145 additions & 0 deletions fastvpinns/hyperparameter_tuning/objective.py
@@ -0,0 +1,145 @@
"""
This file contains the objective function for hyperparameter tuning of the FastVPINN model.
The objective function defines the search space for hyperparameters and evaluates the model's
performance using the suggested hyperparameter values. It sets up the geometry, finite element
space, data handler, and model based on the trial's suggestions. The model is then trained for
a fixed number of epochs, and its performance is evaluated using the relative L2 error.
Author: Divij Ghose
Changelog: 9/9/24 - Initial implementation of the objective function for hyperparameter tuning
Known issues: None
Dependencies: optuna, tensorflow, fastvpinns
"""

# objective.py
import optuna
import tensorflow as tf
import os

from fastvpinns.Geometry.geometry_2d import Geometry_2D
from fastvpinns.FE.fespace2d import Fespace2D
from fastvpinns.data.datahandler2d import DataHandler2D
from fastvpinns.model.model import DenseModel
from fastvpinns.physics.poisson2d import pde_loss_poisson
from fastvpinns.utils.compute_utils import compute_errors_combined
from sin_cos import * # Import your example-specific functions


def objective(trial, num_epochs):
# Suggest values for hyperparameters
config = {
"geometry": {
"internal_mesh_params": {
"n_cells_x": trial.suggest_int("n_cells_x", 2, 10),
"n_cells_y": trial.suggest_int("n_cells_y", 2, 10),
"n_boundary_points": trial.suggest_int("n_boundary_points", 100, 1000),
}
},
"fe": {
"fe_order": trial.suggest_int("fe_order", 2, 8),
"fe_type": trial.suggest_categorical("fe_type", ["legendre", "jacobi"]),
"quad_order": trial.suggest_int("quad_order", 3, 15),
"quad_type": trial.suggest_categorical("quad_type", ["gauss-legendre", "gauss-jacobi"]),
},
"model": {
"model_architecture": [2]
+ [
trial.suggest_int(f"layer_{i}", 10, 100)
for i in range(trial.suggest_int("n_layers", 1, 5))
]
+ [1],
"activation": "tanh",
"use_attention": False,
"learning_rate": {
"initial_learning_rate": trial.suggest_loguniform(
"initial_learning_rate", 1e-5, 1e-2
),
"use_lr_scheduler": True,
"decay_steps": trial.suggest_int("decay_steps", 1000, 10000),
"decay_rate": trial.suggest_uniform("decay_rate", 0.9, 0.99),
},
},
"pde": {"beta": 10},
}

# Set up your model and training process using the suggested hyperparameters

output_temp_dir = "output_temp"
if not os.path.exists(output_temp_dir):
os.makedirs(output_temp_dir)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
try:
tf.config.experimental.set_memory_growth(gpus[0], True)
except RuntimeError as e:
print(e)

domain = Geometry_2D("quadrilateral", "internal", 100, 100, output_temp_dir, is_optimized=True)
cells, boundary_points = domain.generate_quad_mesh_internal(
x_limits=[0, 1],
y_limits=[0, 1],
n_cells_x=config["geometry"]["internal_mesh_params"]["n_cells_x"],
n_cells_y=config["geometry"]["internal_mesh_params"]["n_cells_y"],
num_boundary_points=config["geometry"]["internal_mesh_params"]["n_boundary_points"],
)

fespace = Fespace2D(
mesh=domain.mesh,
cells=cells,
boundary_points=boundary_points,
cell_type=domain.mesh_type,
fe_order=config["fe"]["fe_order"],
fe_type=config["fe"]["fe_type"],
quad_order=config["fe"]["quad_order"],
quad_type=config["fe"]["quad_type"],
fe_transformation_type="bilinear",
bound_function_dict=get_boundary_function_dict(),
bound_condition_dict=get_bound_cond_dict(),
forcing_function=rhs,
output_path="output_temp",
generate_mesh_plot=False,
)

datahandler = DataHandler2D(fespace, domain, dtype=tf.float32)

params_dict = {"n_cells": fespace.n_cells}
train_dirichlet_input, train_dirichlet_output = datahandler.get_dirichlet_input()
bilinear_params_dict = datahandler.get_bilinear_params_dict_as_tensors(get_bilinear_params_dict)

model = DenseModel(
layer_dims=config["model"]["model_architecture"],
learning_rate_dict=config["model"]["learning_rate"],
params_dict=params_dict,
loss_function=pde_loss_poisson,
input_tensors_list=[datahandler.x_pde_list, train_dirichlet_input, train_dirichlet_output],
orig_factor_matrices=[
datahandler.shape_val_mat_list,
datahandler.grad_x_mat_list,
datahandler.grad_y_mat_list,
],
force_function_list=datahandler.forcing_function_list,
tensor_dtype=tf.float32,
use_attention=config["model"]["use_attention"],
activation=config["model"]["activation"],
hessian=False,
)

# Train the model for a fixed number of epochs
beta = tf.constant(config["pde"]["beta"], dtype=tf.float32)

for epoch in range(num_epochs):
loss = model.train_step(beta=beta, bilinear_params_dict=bilinear_params_dict)
# remove output_temp directory using os
test_points = domain.get_test_points()
y_exact = exact_solution(test_points[:, 0], test_points[:, 1])
y_pred = model(test_points).numpy().reshape(-1)

_, _, l2_error_relative, _, _, _ = compute_errors_combined(y_exact, y_pred)

return l2_error_relative # Return the relative L2 error as the objective to minimize
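
The commit message describes optuna_tuner.py as creating an SQLite database (reusable for stalled or parallel runs) and dividing trials across the available GPUs. That file is not expanded in this view, so the following is only a sketch of how this objective could be driven with standard Optuna and TensorFlow calls; the study name, database path, trial count, and epoch count are assumptions rather than values from the commit.

# Sketch only: driving the objective above with an Optuna study backed by SQLite.
# Names and counts are illustrative assumptions, not the committed optuna_tuner.py.
import optuna
import tensorflow as tf

from fastvpinns.hyperparameter_tuning.objective import objective

NUM_EPOCHS = 5000  # per-trial training iterations; the commit exposes this as an argument

# SQLite storage is created if it does not exist, so a stalled study can be resumed
# and several workers can share the same set of trials.
study = optuna.create_study(
    study_name="fastvpinns_tuning",
    storage="sqlite:///fastvpinns_tuning.db",
    load_if_exists=True,
    direction="minimize",  # the objective returns the relative L2 error
)

# One simple way to divide the jobs: one worker per visible GPU (at least one overall).
n_gpus = len(tf.config.list_physical_devices("GPU"))
study.optimize(
    lambda trial: objective(trial, NUM_EPOCHS),
    n_trials=100,
    n_jobs=max(n_gpus, 1),
)

print("Best trial parameters:", study.best_trial.params)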