From 2404a4e78946cc222bbabae630ec40af3f63953d Mon Sep 17 00:00:00 2001
From: Rick Staa <rick.staa@outlook.com>
Date: Sat, 24 Feb 2024 20:20:37 +0100
Subject: [PATCH] docs: improve polyak description (#417)

This commit improves the polyak parameter description to prevent
confusion with papers that use the soft replacement factor.
---
 stable_learning_control/algos/pytorch/lac/lac.py | 11 +++++++----
 stable_learning_control/algos/pytorch/sac/sac.py | 11 +++++++----
 stable_learning_control/algos/tf2/lac/lac.py     | 11 +++++++----
 stable_learning_control/algos/tf2/sac/sac.py     | 11 +++++++----
 4 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/stable_learning_control/algos/pytorch/lac/lac.py b/stable_learning_control/algos/pytorch/lac/lac.py
index af3f05f6..e4ed5da2 100644
--- a/stable_learning_control/algos/pytorch/lac/lac.py
+++ b/stable_learning_control/algos/pytorch/lac/lac.py
@@ -188,8 +188,10 @@ def __init__(
                 .. math:: \\theta_{\\text{targ}} \\leftarrow
                     \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
 
-                where :math:`\\rho` is polyak. (Always between 0 and 1, usually
-                close to 1.). Defaults to ``0.995``.
+                where :math:`\\rho` is polyak (always between 0 and 1, usually close to
+                1). In some papers :math:`\\rho` is defined as :math:`(1 - \\tau)`,
+                where :math:`\\tau` is the soft replacement factor. Defaults to
+                ``0.995``.
             target_entropy (float, optional): Initial target entropy used while learning
                 the entropy temperature (alpha). Defaults to the
                 maximum information (bits) contained in action space. This can be
@@ -991,8 +993,9 @@ def lac(
             .. math:: \\theta_{\\text{targ}} \\leftarrow
                 \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
 
-            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
-            close to 1.). Defaults to ``0.995``.
+            where :math:`\\rho` is polyak (always between 0 and 1, usually close to 1).
+            In some papers :math:`\\rho` is defined as :math:`(1 - \\tau)`, where
+            :math:`\\tau` is the soft replacement factor. Defaults to ``0.995``.
         target_entropy (float, optional): Initial target entropy used while learning
             the entropy temperature (alpha). Defaults to the
             maximum information (bits) contained in action space. This can be
diff --git a/stable_learning_control/algos/pytorch/sac/sac.py b/stable_learning_control/algos/pytorch/sac/sac.py
index aaa15fc4..def453b9 100644
--- a/stable_learning_control/algos/pytorch/sac/sac.py
+++ b/stable_learning_control/algos/pytorch/sac/sac.py
@@ -172,8 +172,10 @@ def __init__(
                 .. math:: \\theta_{\\text{targ}} \\leftarrow
                     \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
 
-                where :math:`\\rho` is polyak. (Always between 0 and 1, usually
-                close to 1.). Defaults to ``0.995``.
+                where :math:`\\rho` is polyak (always between 0 and 1, usually close to
+                1). In some papers :math:`\\rho` is defined as :math:`(1 - \\tau)`,
+                where :math:`\\tau` is the soft replacement factor. Defaults to
+                ``0.995``.
             target_entropy (float, optional): Initial target entropy used while learning
                 the entropy temperature (alpha). Defaults to the
                 maximum information (bits) contained in action space. This can be
@@ -856,8 +858,9 @@ def sac(
             .. math:: \\theta_{\\text{targ}} \\leftarrow
                 \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
 
-            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
-            close to 1.). Defaults to ``0.995``.
+            where :math:`\\rho` is polyak (always between 0 and 1, usually close to 1).
+            In some papers :math:`\\rho` is defined as :math:`(1 - \\tau)`, where
+            :math:`\\tau` is the soft replacement factor. Defaults to ``0.995``.
         target_entropy (float, optional): Initial target entropy used while learning
             the entropy temperature (alpha). Defaults to the
             maximum information (bits) contained in action space. This can be
diff --git a/stable_learning_control/algos/tf2/lac/lac.py b/stable_learning_control/algos/tf2/lac/lac.py
index 06e3d6d4..47c9a530 100644
--- a/stable_learning_control/algos/tf2/lac/lac.py
+++ b/stable_learning_control/algos/tf2/lac/lac.py
@@ -185,8 +185,10 @@ def __init__(
                 .. math:: \\theta_{\\text{targ}} \\leftarrow
                     \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
 
-                where :math:`\\rho` is polyak. (Always between 0 and 1, usually
-                close to 1.). Defaults to ``0.995``.
+                where :math:`\\rho` is polyak (always between 0 and 1, usually close to
+                1). In some papers :math:`\\rho` is defined as :math:`(1 - \\tau)`,
+                where :math:`\\tau` is the soft replacement factor. Defaults to
+                ``0.995``.
             target_entropy (float, optional): Initial target entropy used while learning
                 the entropy temperature (alpha). Defaults to the
                 maximum information (bits) contained in action space. This can be
@@ -922,8 +924,9 @@ def lac(
             .. math:: \\theta_{\\text{targ}} \\leftarrow
                 \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
 
-            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
-            close to 1.). Defaults to ``0.995``.
+            where :math:`\\rho` is polyak (always between 0 and 1, usually close to 1).
+            In some papers :math:`\\rho` is defined as :math:`(1 - \\tau)`, where
+            :math:`\\tau` is the soft replacement factor. Defaults to ``0.995``.
         target_entropy (float, optional): Initial target entropy used while learning
             the entropy temperature (alpha). Defaults to the
             maximum information (bits) contained in action space. This can be
diff --git a/stable_learning_control/algos/tf2/sac/sac.py b/stable_learning_control/algos/tf2/sac/sac.py
index 78f5c62c..da2fcb3a 100644
--- a/stable_learning_control/algos/tf2/sac/sac.py
+++ b/stable_learning_control/algos/tf2/sac/sac.py
@@ -165,8 +165,10 @@ def __init__(
                 .. math:: \\theta_{\\text{targ}} \\leftarrow
                     \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
 
-                where :math:`\\rho` is polyak. (Always between 0 and 1, usually
-                close to 1.). Defaults to ``0.995``.
+                where :math:`\\rho` is polyak (always between 0 and 1, usually close to
+                1). In some papers :math:`\\rho` is defined as :math:`(1 - \\tau)`,
+                where :math:`\\tau` is the soft replacement factor. Defaults to
+                ``0.995``.
             target_entropy (float, optional): Initial target entropy used while learning
                 the entropy temperature (alpha). Defaults to the
                 maximum information (bits) contained in action space. This can be
@@ -787,8 +789,9 @@ def sac(
             .. math:: \\theta_{\\text{targ}} \\leftarrow
                 \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
 
-            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
-            close to 1.). Defaults to ``0.995``.
+            where :math:`\\rho` is polyak (always between 0 and 1, usually close to 1).
+            In some papers :math:`\\rho` is defined as :math:`(1 - \\tau)`, where
+            :math:`\\tau` is the soft replacement factor. Defaults to ``0.995``.
         target_entropy (float, optional): Initial target entropy used while learning
             the entropy temperature (alpha). Defaults to the
             maximum information (bits) contained in action space. This can be