# Refactor finetune (#247)
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Finetuning is now integrated directly into the workflow through run-time
    parameters, without creating a separate finetune step.

- **Bug Fixes**
  - Revised the model-initialization logic to improve clarity and reduce
    complexity.

- **Refactor**
  - Removed the outdated classes and methods for modifying training scripts,
    simplifying the `PrepRunDPTrain` class.
  - Simplified branching by eliminating redundant finetune-related conditions
    across several functions.
  - Updated test cases and configurations to drop references to the removed
    "finetune" step.

- **Chores**
  - Cleaned up the handling of "finetune" modes in utility functions for
    better maintainability.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: zjgemi <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
zjgemi and pre-commit-ci[bot] committed Aug 5, 2024
1 parent f5c5d95 commit 777c7fa
Showing 10 changed files with 40 additions and 450 deletions.
14 changes: 6 additions & 8 deletions dpgen2/entrypoint/args.py
@@ -374,14 +374,12 @@ def input_args():
doc_mass_map = "The mass map. e.g. [27., 24.]. Al and Mg will be set with mass 27. and 24. amu, respectively."
doc_mixed_type = "Use `deepmd/npy/mixed` format for storing training data."
doc_do_finetune = (
"Finetune the pretrained model before the first iteration. If it is set to True, then an additional step, finetune-step, "
'which is based on a branch of "PrepRunDPTrain," will be added before the dpgen_step. In the '
'finetune-step, the internal flag finetune_mode is set to "finetune," which means SuperOP "PrepRunDPTrain" '
'is now used as the "Finetune." In this step, we finetune the pretrained model in the train step and modify '
'the template after training. After that, in the normal dpgen-step, the flag do_finetune is set as "train-init," '
'which means we use `--init-frz-model` to train based on models from the previous iteration. The "do_finetune" flag '
'is set to False by default, while the internal flag finetune_mode is set to "no," which means anything related '
"to finetuning will not be done."
"Finetune the pretrained model during the first iteration. If it is set to True, then in the first iteration, "
'the internal flag finetune_mode is set to "finetune". In this step, we finetune the pretrained model in the '
'train step. After that, in the following iterations, init_model_policy is forced to be "yes", the flag '
'finetune_mode is set as "no", which means we use `--init-frz-model` or `--init-model` to train based on '
'models from the previous iteration. The "do_finetune" flag is set to False by default, while the internal '
'flag finetune_mode is set to "no", which means anything related to finetuning will not be done.'
)
doc_do_finetune = textwrap.dedent(doc_do_finetune)
doc_init_data_prefix = "The prefix of initial data systems"
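For orientation, here is a hedged sketch of where the `do_finetune` switch documented in the hunk above sits in an input config. Only the keys referenced by this commit (`inputs.mixed_type`, `inputs.do_finetune`, `train.config.init_model_policy`) are taken from the diff; everything else is a placeholder, not the full dpgen2 input schema.

```python
# Illustrative slice of an input config, assuming only the keys touched by this
# commit; it is NOT a complete dpgen2 input.
config = {
    "inputs": {
        "mixed_type": False,
        "do_finetune": True,  # finetune the pretrained model in the first iteration
        # ... other input keys (mass_map, init_data_prefix, ...) omitted
    },
    "train": {
        "config": {
            "init_model_policy": "no",  # forced to "yes" when do_finetune is True
        },
    },
}
```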
87 changes: 7 additions & 80 deletions dpgen2/entrypoint/submit.py
@@ -414,53 +414,6 @@ def make_optional_parameter(
return {"data_mixed_type": mixed_type, "finetune_mode": finetune_mode}


def make_finetune_step(
config,
prep_train_config,
run_train_config,
upload_python_packages,
numb_models,
template_script,
train_config,
init_models,
init_data,
iter_data,
valid_data=None,
):
finetune_optional_parameter = {
"mixed_type": config["inputs"]["mixed_type"],
"finetune_mode": "finetune",
}

finetune_op = PrepRunDPTrain(
"finetune",
PrepDPTrain,
RunDPTrain,
prep_config=prep_train_config,
run_config=run_train_config,
upload_python_packages=upload_python_packages,
finetune=True,
valid_data=valid_data,
)
finetune_step = Step(
"finetune-step",
template=finetune_op,
parameters={
"block_id": "finetune",
"numb_models": numb_models,
"template_script": template_script,
"train_config": train_config,
"run_optional_parameter": finetune_optional_parameter,
},
artifacts={
"init_models": init_models,
"init_data": init_data,
"iter_data": iter_data,
},
)
return finetune_step


def get_systems_from_data(data, data_prefix=None):
data = [data] if isinstance(data, str) else data
assert isinstance(data, list)
@@ -472,7 +425,7 @@ def get_systems_from_data(data, data_prefix=None):

def workflow_concurrent_learning(
config: Dict,
) -> Tuple[Step, Optional[Step]]:
) -> Step:
default_config = config["default_step_config"]

train_config = config["train"]["config"]
@@ -614,32 +567,17 @@ def workflow_concurrent_learning(
else:
init_models = None

finetune_step = None
optional_parameter = make_optional_parameter(
config["inputs"]["mixed_type"],
)

if config["inputs"].get("do_finetune", False):
finetune_step = make_finetune_step(
config,
prep_train_config,
run_train_config,
upload_python_packages,
numb_models,
template_script,
train_config,
init_models,
init_data,
iter_data,
valid_data=valid_data,
)

init_models = finetune_step.outputs.artifacts["models"]
template_script = finetune_step.outputs.parameters["template_script"]

if train_config["init_model_policy"] != "yes":
logging.warning("In finetune mode, init_model_policy is forced to be 'yes'")
train_config["init_model_policy"] = "yes"
optional_parameter = make_optional_parameter(
config["inputs"]["mixed_type"],
finetune_mode="train-init",
finetune_mode="finetune",
)

# here the scheduler is passed as input parameter to the concurrent_learning_op
@@ -662,7 +600,7 @@ def workflow_concurrent_learning(
"iter_data": iter_data,
},
)
return dpgen_step, finetune_step
return dpgen_step


def get_scheduler_ids(
@@ -747,9 +685,7 @@ def submit_concurrent_learning(

global_config_workflow(wf_config)

dpgen_step, finetune_step = workflow_concurrent_learning(
wf_config,
)
dpgen_step = workflow_concurrent_learning(wf_config)

if reuse_step is not None and replace_scheduler:
scheduler_new = copy.deepcopy(
@@ -785,17 +721,9 @@ def submit_concurrent_learning(
"conf_selector",
selector,
)
# the modify-train-script step will be added as reuse step.
# the following hack is not needed anymore.
# wf_config["inputs"]["do_finetune"] = False
# finetune will not be done again if the old process is reused.

wf = Workflow(name=wf_config["name"], parallelism=wf_config["parallelism"])

if wf_config["inputs"].get("do_finetune", False):
assert finetune_step is not None
wf.add(finetune_step)

wf.add(dpgen_step)

# for debug purpose, we may not really submit the wf
@@ -889,7 +817,6 @@ def get_resubmit_keys(
"prep-run-train",
"prep-train",
"run-train",
"modify-train-script",
"prep-caly-input",
"prep-caly-model-devi",
"run-caly-model-devi",
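To make the net effect of the `submit.py` changes easier to follow, here is a hedged, self-contained sketch of the new control flow. `resolve_train_settings` is an illustrative name, not a function in dpgen2; only `make_optional_parameter` mirrors the real helper shown in the diff above.

```python
# A minimal sketch (not the real dpgen2 call path) of how the refactor toggles
# finetuning purely through the run-time optional parameter, mirroring the new
# branch in workflow_concurrent_learning.
import logging


def make_optional_parameter(mixed_type, finetune_mode="no"):
    # same shape as dpgen2.entrypoint.submit.make_optional_parameter
    return {"data_mixed_type": mixed_type, "finetune_mode": finetune_mode}


def resolve_train_settings(config, train_config):
    optional_parameter = make_optional_parameter(config["inputs"]["mixed_type"])
    if config["inputs"].get("do_finetune", False):
        # finetune no longer builds a dedicated step; it only forces the
        # init-model policy and flips the finetune_mode flag
        if train_config["init_model_policy"] != "yes":
            logging.warning("In finetune mode, init_model_policy is forced to be 'yes'")
            train_config["init_model_policy"] = "yes"
        optional_parameter = make_optional_parameter(
            config["inputs"]["mixed_type"], finetune_mode="finetune"
        )
    return optional_parameter, train_config


# Toy usage
cfg = {"inputs": {"mixed_type": False, "do_finetune": True}}
print(resolve_train_settings(cfg, {"init_model_policy": "no"}))
# ({'data_mixed_type': False, 'finetune_mode': 'finetune'}, {'init_model_policy': 'yes'})
```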
11 changes: 10 additions & 1 deletion dpgen2/flow/dpgen_loop.py
@@ -77,6 +77,13 @@ def make_block_optional_parameter(cl_optional_parameter):
}


def make_next_optional_parameter(optional_parameter):
return {
"data_mixed_type": optional_parameter["data_mixed_type"],
"finetune_mode": "no", # not to do finetune for `next` loop
}


class SchedulerWrapper(OP):
@classmethod
def get_input_sign(cls):
@@ -426,7 +433,9 @@ def _loop(
"exploration_scheduler": scheduler_step.outputs.parameters[
"exploration_scheduler"
],
"optional_parameter": steps.inputs.parameters["optional_parameter"],
"optional_parameter": make_next_optional_parameter(
steps.inputs.parameters["optional_parameter"]
),
"expl_task_grp": scheduler_step.outputs.parameters["expl_task_grp"],
}
next_step = Step(
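As a quick standalone illustration of the new helper (its body is copied from the hunk above; the toy input is made up), the `next` loop always clears the finetune flag, so finetuning only ever happens in the first iteration:

```python
def make_next_optional_parameter(optional_parameter):
    return {
        "data_mixed_type": optional_parameter["data_mixed_type"],
        "finetune_mode": "no",  # not to do finetune for `next` loop
    }


# Toy input: what the first iteration passes when do_finetune is enabled.
first_iter = {"data_mixed_type": False, "finetune_mode": "finetune"}
print(make_next_optional_parameter(first_iter))
# {'data_mixed_type': False, 'finetune_mode': 'no'}
```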
68 changes: 2 additions & 66 deletions dpgen2/op/run_dp_train.py
@@ -71,10 +71,9 @@ def _make_train_command(
return command
# case of init model and finetune
assert checkpoint is None
do_init_model_or_train_init = do_init_model or finetune_mode == "train-init"
case_init_model = do_init_model_or_train_init and (not init_model_with_finetune)
case_init_model = do_init_model and (not init_model_with_finetune)
case_finetune = finetune_mode == "finetune" or (
do_init_model_or_train_init and init_model_with_finetune
do_init_model and init_model_with_finetune
)
if case_init_model:
init_flag = "--init-frz-model" if impl == "tensorflow" else "--init-model"
@@ -101,69 +100,6 @@ def _make_train_command(
return command


def _make_train_command_old(
dp_command,
train_script_name,
impl,
do_init_model,
init_model,
finetune_mode,
finetune_args,
init_model_with_finetune,
):
if impl == "tensorflow" and os.path.isfile("checkpoint"):
command = dp_command + [
"train",
"--restart",
"model.ckpt",
train_script_name,
]
elif impl == "pytorch" and len(glob.glob("model.ckpt-[0-9]*.pt")) > 0:
checkpoint = "model.ckpt-%s.pt" % max(
[int(f[11:-3]) for f in glob.glob("model.ckpt-[0-9]*.pt")]
)
command = dp_command + [
"train",
"--restart",
checkpoint,
train_script_name,
]
elif (
do_init_model or finetune_mode == "train-init"
) and not init_model_with_finetune:
if impl == "pytorch":
command = dp_command + [
"train",
"--init-model",
str(init_model),
train_script_name,
]
else:
command = dp_command + [
"train",
"--init-frz-model",
str(init_model),
train_script_name,
]
elif finetune_mode == "finetune" or (
(do_init_model or finetune_mode == "train-init") and init_model_with_finetune
):
command = (
dp_command
+ [
"train",
train_script_name,
"--finetune",
str(init_model),
]
+ finetune_args.split()
)
else:
command = dp_command + ["train", train_script_name]

return command


class RunDPTrain(OP):
r"""Execute a DP training task. Train and freeze a DP model.
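Finally, a condensed, hedged sketch of the simplified branching in `_make_train_command`: the restart cases are omitted, and the finetune command form is taken from the removed `_make_train_command_old` above on the assumption that the new function keeps it.

```python
# Simplified reading aid, not the real dpgen2/op/run_dp_train.py function:
# only the init-model vs. finetune vs. plain-train branches are shown.
def sketch_train_command(
    dp_command, train_script_name, impl,
    do_init_model, init_model, finetune_mode,
    finetune_args, init_model_with_finetune,
):
    case_init_model = do_init_model and not init_model_with_finetune
    case_finetune = finetune_mode == "finetune" or (
        do_init_model and init_model_with_finetune
    )
    if case_init_model:
        init_flag = "--init-frz-model" if impl == "tensorflow" else "--init-model"
        return dp_command + ["train", init_flag, str(init_model), train_script_name]
    if case_finetune:
        return (
            dp_command
            + ["train", train_script_name, "--finetune", str(init_model)]
            + finetune_args.split()
        )
    return dp_command + ["train", train_script_name]


print(sketch_train_command(
    ["dp"], "input.json", "pytorch",
    False, "pretrained.pt", "finetune", "", False,
))
# ['dp', 'train', 'input.json', '--finetune', 'pretrained.pt']
```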
[Diffs for the remaining changed files are not shown.]
