From 2404a4e78946cc222bbabae630ec40af3f63953d Mon Sep 17 00:00:00 2001 From: Rick Staa Date: Sat, 24 Feb 2024 20:20:37 +0100 Subject: [PATCH] docs: improve polyak description (#417) This commit improves the polyak parameter description to prevent confusion with papers that use the soft replacement factor. --- stable_learning_control/algos/pytorch/lac/lac.py | 11 +++++++---- stable_learning_control/algos/pytorch/sac/sac.py | 11 +++++++---- stable_learning_control/algos/tf2/lac/lac.py | 11 +++++++---- stable_learning_control/algos/tf2/sac/sac.py | 11 +++++++---- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/stable_learning_control/algos/pytorch/lac/lac.py b/stable_learning_control/algos/pytorch/lac/lac.py index af3f05f6..e4ed5da2 100644 --- a/stable_learning_control/algos/pytorch/lac/lac.py +++ b/stable_learning_control/algos/pytorch/lac/lac.py @@ -188,8 +188,10 @@ def __init__( .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually - close to 1.). Defaults to ``0.995``. + where :math:`\\rho` is polyak (always between 0 and 1, usually close to + 1). In some papers :math:`\\rho` is defined as (1 - :math:`\\tau`) + where :math:`\\tau` is the soft replacement factor. Defaults to + ``0.995``. target_entropy (float, optional): Initial target entropy used while learning the entropy temperature (alpha). Defaults to the maximum information (bits) contained in action space. This can be @@ -991,8 +993,9 @@ def lac( .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually - close to 1.). Defaults to ``0.995``. + where :math:`\\rho` is polyak (always between 0 and 1, usually close to 1). + In some papers :math:`\\rho` is defined as (1 - :math:`\\tau`) where + :math:`\\tau` is the soft replacement factor. Defaults to ``0.995``. 
target_entropy (float, optional): Initial target entropy used while learning the entropy temperature (alpha). Defaults to the maximum information (bits) contained in action space. This can be diff --git a/stable_learning_control/algos/pytorch/sac/sac.py b/stable_learning_control/algos/pytorch/sac/sac.py index aaa15fc4..def453b9 100644 --- a/stable_learning_control/algos/pytorch/sac/sac.py +++ b/stable_learning_control/algos/pytorch/sac/sac.py @@ -172,8 +172,10 @@ def __init__( .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually - close to 1.). Defaults to ``0.995``. + where :math:`\\rho` is polyak (always between 0 and 1, usually close to + 1). In some papers :math:`\\rho` is defined as (1 - :math:`\\tau`) + where :math:`\\tau` is the soft replacement factor. Defaults to + ``0.995``. target_entropy (float, optional): Initial target entropy used while learning the entropy temperature (alpha). Defaults to the maximum information (bits) contained in action space. This can be @@ -856,8 +858,9 @@ def sac( .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually - close to 1.). Defaults to ``0.995``. + where :math:`\\rho` is polyak (always between 0 and 1, usually close to 1). + In some papers :math:`\\rho` is defined as (1 - :math:`\\tau`) where + :math:`\\tau` is the soft replacement factor. Defaults to ``0.995``. target_entropy (float, optional): Initial target entropy used while learning the entropy temperature (alpha). Defaults to the maximum information (bits) contained in action space. 
This can be diff --git a/stable_learning_control/algos/tf2/lac/lac.py b/stable_learning_control/algos/tf2/lac/lac.py index 06e3d6d4..47c9a530 100644 --- a/stable_learning_control/algos/tf2/lac/lac.py +++ b/stable_learning_control/algos/tf2/lac/lac.py @@ -185,8 +185,10 @@ def __init__( .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually - close to 1.). Defaults to ``0.995``. + where :math:`\\rho` is polyak (always between 0 and 1, usually close to + 1). In some papers :math:`\\rho` is defined as (1 - :math:`\\tau`) + where :math:`\\tau` is the soft replacement factor. Defaults to + ``0.995``. target_entropy (float, optional): Initial target entropy used while learning the entropy temperature (alpha). Defaults to the maximum information (bits) contained in action space. This can be @@ -922,8 +924,9 @@ def lac( .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually - close to 1.). Defaults to ``0.995``. + where :math:`\\rho` is polyak (always between 0 and 1, usually close to 1). + In some papers :math:`\\rho` is defined as (1 - :math:`\\tau`) where + :math:`\\tau` is the soft replacement factor. Defaults to ``0.995``. target_entropy (float, optional): Initial target entropy used while learning the entropy temperature (alpha). Defaults to the maximum information (bits) contained in action space. This can be diff --git a/stable_learning_control/algos/tf2/sac/sac.py b/stable_learning_control/algos/tf2/sac/sac.py index 78f5c62c..da2fcb3a 100644 --- a/stable_learning_control/algos/tf2/sac/sac.py +++ b/stable_learning_control/algos/tf2/sac/sac.py @@ -165,8 +165,10 @@ def __init__( .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually - close to 1.). 
Defaults to ``0.995``. + where :math:`\\rho` is polyak (always between 0 and 1, usually close to + 1). In some papers :math:`\\rho` is defined as (1 - :math:`\\tau`) + where :math:`\\tau` is the soft replacement factor. Defaults to + ``0.995``. target_entropy (float, optional): Initial target entropy used while learning the entropy temperature (alpha). Defaults to the maximum information (bits) contained in action space. This can be @@ -787,8 +789,9 @@ def sac( .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta - where :math:`\\rho` is polyak. (Always between 0 and 1, usually - close to 1.). Defaults to ``0.995``. + where :math:`\\rho` is polyak (always between 0 and 1, usually close to 1). + In some papers :math:`\\rho` is defined as (1 - :math:`\\tau`) where + :math:`\\tau` is the soft replacement factor. Defaults to ``0.995``. target_entropy (float, optional): Initial target entropy used while learning the entropy temperature (alpha). Defaults to the maximum information (bits) contained in action space. This can be