From 51d1d95ec0ee8b42a5d8dd39a5087d762ec58b81 Mon Sep 17 00:00:00 2001 From: Dimitri Date: Sun, 11 Feb 2024 16:37:32 -0300 Subject: [PATCH 01/25] Autocast --- src/Native/LibTorchSharp/THSTorch.cpp | 112 +++++++++++++++++- src/Native/LibTorchSharp/THSTorch.h | 34 +++++- .../PInvoke/LibTorchSharp.THSTorch.cs | 40 +++++++ src/TorchSharp/Tensor/torch.Autocast.cs | 79 ++++++++++++ 4 files changed, 263 insertions(+), 2 deletions(-) create mode 100644 src/TorchSharp/Tensor/torch.Autocast.cs diff --git a/src/Native/LibTorchSharp/THSTorch.cpp b/src/Native/LibTorchSharp/THSTorch.cpp index b846557bc..1a170913c 100644 --- a/src/Native/LibTorchSharp/THSTorch.cpp +++ b/src/Native/LibTorchSharp/THSTorch.cpp @@ -323,4 +323,114 @@ double THSSpecial_erf_scalar(const double x) double THSSpecial_erfc_scalar(const double x) { return erfc(x); -} \ No newline at end of file +} + +bool THSTorch_is_torch_function_mode_enabled() +{ + return at::impl::torch_function_mode_enabled(); //https://github.com/pytorch/pytorch/blob/2c91e13afc6edcfe0a0e6189a88aae4ecbbf3516/torch/csrc/autograd/init.cpp#L911 +} + +bool THSTorch_is_autocast_cache_enabled() +{ + return at::autocast::is_autocast_cache_enabled(); +} + +bool THSTorch_is_autocast_cpu_enabled() +{ + return at::autocast::is_cpu_enabled(); //https://github.com/pytorch/pytorch/blob/2c91e13afc6edcfe0a0e6189a88aae4ecbbf3516/torch/csrc/autograd/init.cpp#L523 +} + +bool THSTorch_is_autocast_gpu_enabled() +{ + return at::autocast::is_enabled(); //https://github.com/pytorch/pytorch/blob/2c91e13afc6edcfe0a0e6189a88aae4ecbbf3516/torch/amp/autocast_mode.py#L363 +} +bool THSTorch_is_autocast_xpu_enabled() +{ + return at::autocast::is_xpu_enabled(); +} +bool THSTorch_is_autocast_hpu_enabled() +{ + return at::autocast::is_hpu_enabled(); +} + +#if (TORCH_VERSION_MAJOR ==2 && TORCH_VERSION_MINOR > 0) +bool THSTorch_is_autocast_ipu_enabled() +{ + return at::autocast::is_ipu_enabled(); +} + +bool THSTorch_is_autocast_xla_enabled() +{ + return at::autocast::is_xla_enabled(); +} + +#endif + +int8_t THSTorch_get_autocast_cpu_dtype() +{ + return (int8_t)at::autocast::get_autocast_cpu_dtype(); +} + +int8_t THSTorch_get_autocast_gpu_dtype() +{ + //TODO: Implement AUTOCAST AMP AND GRADSCALER + + //INFO: Enter/Exit function of autocast_mode not need to do in C/C++ only in C# with Disposable C# Can handle all of that function (if exists) + //https://github.com/pytorch/pytorch/blob/main/torch/amp/autocast_mode.py + + + //https://github.com/pytorch/pytorch/blob/2c91e13afc6edcfe0a0e6189a88aae4ecbbf3516/torch/csrc/autograd/init.cpp#L629 + //https://github.com/pytorch/pytorch/blob/2c91e13afc6edcfe0a0e6189a88aae4ecbbf3516/aten/src/ATen/autocast_mode.h#L20 + return (int8_t)at::autocast::get_autocast_gpu_dtype(); +} + +int8_t THSTorch_get_autocast_xpu_dtype() +{ + return (int8_t)at::autocast::get_autocast_xpu_dtype(); +} + + +int THSTorch_autocast_increment_nesting() +{ + return at::autocast::increment_nesting(); +} + +int THSTorch_autocast_decremental_nesting() +{ + return at::autocast::decrement_nesting(); +} + +void THSTorch_set_autocast_enabled(bool enabled) +{ + at::autocast::set_enabled(enabled); +} + +void THSTorch_set_autocast_cache_enabled(bool enabled) +{ + at::autocast::set_autocast_cache_enabled(enabled); +} + +void THSTorch_set_autocast_cpu_dtype(int8_t dtype) +{ + at::autocast::set_autocast_cpu_dtype((c10::ScalarType)dtype); +} + +void THSTorch_set_autocast_gpu_dtype(int8_t dtype) +{ + at::autocast::set_autocast_gpu_dtype((c10::ScalarType)dtype); +} + +void 
THSTorch_set_autocast_xpu_dtype(int8_t dtype)
{
    at::autocast::set_autocast_xpu_dtype((c10::ScalarType)dtype);
}

void THSTorch_clear_autocast_cache()
{
    at::autocast::clear_cache();
}

/*bool THSTorch_jit_is_scripting()
{

}*/
\ No newline at end of file
diff --git a/src/Native/LibTorchSharp/THSTorch.h b/src/Native/LibTorchSharp/THSTorch.h
index 9ab80e828..dd9483f5f 100644
--- a/src/Native/LibTorchSharp/THSTorch.h
+++ b/src/Native/LibTorchSharp/THSTorch.h
@@ -4,7 +4,8 @@
 #include "../Stdafx.h"
 #include "Utils.h"
-
+#include
+//#include
 // API.

 // Sets manually the seed.
@@ -91,3 +92,34 @@ EXPORT_API(void) THSTorch_dispose_scalar(Scalar scalar);

 EXPORT_API(double) THSSpecial_erf_scalar(const double x);
 EXPORT_API(double) THSSpecial_erfc_scalar(const double x);
+
+EXPORT_API(bool) THSTorch_is_torch_function_mode_enabled();
+
+// NOTE: it may be cleaner to expose a single THSTorch_is_autocast_enabled(device),
+// taking the C# device enum as an int8_t, instead of one entry point per device type.
+EXPORT_API(bool) THSTorch_is_autocast_cache_enabled();
+EXPORT_API(bool) THSTorch_is_autocast_cpu_enabled();
+EXPORT_API(bool) THSTorch_is_autocast_gpu_enabled();
+EXPORT_API(bool) THSTorch_is_autocast_xpu_enabled();
+EXPORT_API(bool) THSTorch_is_autocast_hpu_enabled();
+
+#if (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR > 0)
+EXPORT_API(bool) THSTorch_is_autocast_ipu_enabled();
+EXPORT_API(bool) THSTorch_is_autocast_xla_enabled();
+#endif
+
+EXPORT_API(int8_t) THSTorch_get_autocast_cpu_dtype();
+EXPORT_API(int8_t) THSTorch_get_autocast_gpu_dtype();
+EXPORT_API(int8_t) THSTorch_get_autocast_xpu_dtype();
+
+EXPORT_API(int) THSTorch_autocast_increment_nesting();
+EXPORT_API(int) THSTorch_autocast_decrement_nesting();
+
+EXPORT_API(void) THSTorch_set_autocast_enabled(bool enabled);
+EXPORT_API(void) THSTorch_set_autocast_cache_enabled(bool enabled);
+EXPORT_API(void) THSTorch_set_autocast_cpu_dtype(int8_t dtype);
+EXPORT_API(void) THSTorch_set_autocast_gpu_dtype(int8_t dtype);
+EXPORT_API(void) THSTorch_set_autocast_xpu_dtype(int8_t dtype);
+
+EXPORT_API(void) THSTorch_clear_autocast_cache();
+
+//EXPORT_API(bool) THSTorch_jit_is_scripting();
\ No newline at end of file
diff --git a/src/TorchSharp/PInvoke/LibTorchSharp.THSTorch.cs b/src/TorchSharp/PInvoke/LibTorchSharp.THSTorch.cs
index 3d3919ee3..fb609e286 100644
--- a/src/TorchSharp/PInvoke/LibTorchSharp.THSTorch.cs
+++ b/src/TorchSharp/PInvoke/LibTorchSharp.THSTorch.cs
@@ -108,5 +108,45 @@ internal static partial class NativeMethods

         [DllImport("LibTorchSharp")]
         internal static extern void THSTorch_set_num_interop_threads(int threads);
+
+        [DllImport("LibTorchSharp")]
+        internal static extern bool THSTorch_is_torch_function_mode_enabled();
+
+        [DllImport("LibTorchSharp")]
+        internal static extern bool THSTorch_is_autocast_cache_enabled();
+        [DllImport("LibTorchSharp")]
+        internal static extern bool THSTorch_is_autocast_cpu_enabled();
+        [DllImport("LibTorchSharp")]
+        internal static extern bool THSTorch_is_autocast_gpu_enabled();
+        [DllImport("LibTorchSharp")]
+        internal static extern bool THSTorch_is_autocast_xpu_enabled();
+        [DllImport("LibTorchSharp")]
+        internal static extern bool THSTorch_is_autocast_hpu_enabled();
+
+        [DllImport("LibTorchSharp")]
+        internal static extern sbyte THSTorch_get_autocast_cpu_dtype();
+        [DllImport("LibTorchSharp")]
+        internal static extern sbyte THSTorch_get_autocast_gpu_dtype();
+        [DllImport("LibTorchSharp")]
+        internal static extern sbyte THSTorch_get_autocast_xpu_dtype();
+
+        [DllImport("LibTorchSharp")]
+        internal static extern int THSTorch_autocast_increment_nesting();
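Taken together, the entry points above are enough for C# to drive an autocast region by hand. A minimal sketch of the intended call sequence — hedged, since it relies only on the managed wrappers that torch.Autocast.cs adds in this same patch, and AutocastGuard is a hypothetical helper, not part of the change:

    // Hypothetical guard built purely on the exported nesting counters.
    public sealed class AutocastGuard : IDisposable
    {
        public AutocastGuard()
        {
            torch.autocast_increment_nesting();   // enter one more autocast region
            torch.set_autocast_enabled(true);
        }

        public void Dispose()
        {
            // When the outermost region exits, drop the cached weight casts.
            if (torch.autocast_decrement_nesting() == 0)
                torch.clear_autocast_cache();
            torch.set_autocast_enabled(false);
        }
    }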
[DllImport("LibTorchSharp")] + internal static extern int THSTorch_autocast_decrement_nesting(); + + [DllImport("LibTorchSharp")] + internal static extern void THSTorch_set_autocast_enabled(bool enabled); + [DllImport("LibTorchSharp")] + internal static extern void THSTorch_set_autocast_cache_enabled(bool enabled); + [DllImport("LibTorchSharp")] + internal static extern void THSTorch_set_autocast_cpu_dtype(sbyte dtype); + [DllImport("LibTorchSharp")] + internal static extern void THSTorch_set_autocast_gpu_dtype(sbyte dtype); + [DllImport("LibTorchSharp")] + internal static extern void THSTorch_set_autocast_xpu_dtype(sbyte dtype); + + [DllImport("LibTorchSharp")] + internal static extern void THSTorch_clear_autocast_cache(); } } diff --git a/src/TorchSharp/Tensor/torch.Autocast.cs b/src/TorchSharp/Tensor/torch.Autocast.cs new file mode 100644 index 000000000..6745133be --- /dev/null +++ b/src/TorchSharp/Tensor/torch.Autocast.cs @@ -0,0 +1,79 @@ +using System; +using static TorchSharp.PInvoke.NativeMethods; + +namespace TorchSharp +{ + public static partial class torch + { + public static bool is_autocast_cache_enabled() + { + return THSTorch_is_autocast_cache_enabled(); + } + public static bool is_autocast_cpu_enabled() + { + return THSTorch_is_autocast_cpu_enabled(); + } + public static bool is_autocast_gpu_enabled() + { + return THSTorch_is_autocast_gpu_enabled(); + } + public static bool is_autocast_xpu_enabled() + { + return THSTorch_is_autocast_xpu_enabled(); + } + public static bool is_autocast_hpu_enabled() + { + return THSTorch_is_autocast_hpu_enabled(); + } + + public static ScalarType get_autocast_cpu_dtype() + { + return (ScalarType)THSTorch_get_autocast_cpu_dtype(); + } + public static ScalarType get_autocast_gpu_dtype() + { + return (ScalarType)THSTorch_get_autocast_gpu_dtype(); + } + public static ScalarType get_autocast_xpu_dtype() + { + return (ScalarType)THSTorch_get_autocast_xpu_dtype(); + } + + public static int autocast_increment_nesting() + { + return THSTorch_autocast_increment_nesting(); + } + + public static int autocast_decrement_nesting() + { + return THSTorch_autocast_decrement_nesting(); + } + + public static void set_autocast_enabled(bool enabled) + { + THSTorch_set_autocast_enabled(enabled); + } + public static void set_autocast_cache_enabled(bool enabled) + { + THSTorch_set_autocast_cache_enabled(enabled); + } + + public static void set_autocast_cpu_dtype(ScalarType dtype) + { + THSTorch_set_autocast_cpu_dtype((sbyte)dtype); + } + public static void set_autocast_gpu_dtype(ScalarType dtype) + { + THSTorch_set_autocast_gpu_dtype((sbyte)dtype); + } + public static void set_autocast_xpu_dtype(ScalarType dtype) + { + THSTorch_set_autocast_xpu_dtype((sbyte)dtype); + } + + public static void clear_autocast_cache() + { + THSTorch_clear_autocast_cache(); + } + } +} \ No newline at end of file From 29b490026f9e600ec75b022cbc9dadab5330c46e Mon Sep 17 00:00:00 2001 From: Dimitri Date: Sat, 17 Feb 2024 19:17:16 -0300 Subject: [PATCH 02/25] Added some features --- .gitignore | 1 + src/Native/CMakeSettings.json | 16 ++-- src/Native/LibTorchSharp/CMakeLists.txt | 2 +- src/Native/LibTorchSharp/THSTensor.cpp | 15 ++++ src/Native/LibTorchSharp/THSTensor.h | 4 + src/TorchSharp/Amp/AutocastMode.cs | 54 +++++++++++++ src/TorchSharp/Amp/GradScaler.cs | 66 ++++++++++++++++ .../PInvoke/LibTorchSharp.THSTensor.cs | 2 + src/TorchSharp/Tensor/Tensor.cs | 9 +++ src/TorchSharp/Torch.cs | 25 +++++- src/TorchSharp/TorchSharp.csproj | 78 ------------------- 11 files changed, 187 
insertions(+), 85 deletions(-) create mode 100644 src/TorchSharp/Amp/AutocastMode.cs create mode 100644 src/TorchSharp/Amp/GradScaler.cs delete mode 100644 src/TorchSharp/TorchSharp.csproj diff --git a/.gitignore b/.gitignore index bab8676e1..f34d405aa 100644 --- a/.gitignore +++ b/.gitignore @@ -272,3 +272,4 @@ packages/ *.code-workspace /.idea /test/TorchSharpTest/exportsd.py +/src/TorchSharp/TorchSharp.csproj diff --git a/src/Native/CMakeSettings.json b/src/Native/CMakeSettings.json index 9204f06eb..f47283578 100644 --- a/src/Native/CMakeSettings.json +++ b/src/Native/CMakeSettings.json @@ -1,15 +1,21 @@ -{ +{ "configurations": [ { "name": "x64-Debug", - "generator": "Ninja", + "generator": "Visual Studio 17 2022 Win64", "configurationType": "Debug", "inheritEnvironments": [ "msvc_x64_x64" ], "buildRoot": "${projectDir}\\out\\build\\${name}", "installRoot": "${projectDir}\\out\\install\\${name}", - "cmakeCommandArgs": "", - "buildCommandArgs": "", - "ctestCommandArgs": "" + "cmakeCommandArgs": "-DCMAKE_PREFIX_PATH=\"K:\\FrameworksForC\\LibTorch\\libtorch-win-shared-with-deps-debug-2.0.1+cu117\"", + "ctestCommandArgs": "", + "variables": [ + { + "name": "Torch_DIR", + "value": "K:/FrameworksForC/LibTorch/libtorch-win-shared-with-deps-debug-2.0.1+cu117", + "type": "PATH" + } + ] } ] } \ No newline at end of file diff --git a/src/Native/LibTorchSharp/CMakeLists.txt b/src/Native/LibTorchSharp/CMakeLists.txt index 17c2b7fcf..544ac3e22 100644 --- a/src/Native/LibTorchSharp/CMakeLists.txt +++ b/src/Native/LibTorchSharp/CMakeLists.txt @@ -64,7 +64,7 @@ add_library(LibTorchSharp SHARED ${SOURCES} ${RESOURCES}) target_link_libraries(LibTorchSharp ${TORCH_LIBRARIES}) -set_property(TARGET LibTorchSharp PROPERTY CXX_STANDARD 14) +set_property(TARGET LibTorchSharp PROPERTY CXX_STANDARD 17) if(APPLE) set_target_properties(LibTorchSharp PROPERTIES INSTALL_RPATH "@loader_path;@executable_path;") diff --git a/src/Native/LibTorchSharp/THSTensor.cpp b/src/Native/LibTorchSharp/THSTensor.cpp index 2bdc96a83..f4617b5f7 100644 --- a/src/Native/LibTorchSharp/THSTensor.cpp +++ b/src/Native/LibTorchSharp/THSTensor.cpp @@ -1836,6 +1836,21 @@ Tensor THSTensor_to_type_and_device(const Tensor tensor, int8_t scalar_type, con ); } +/*Tensor THSTensor_device_and_non_blocking(const Tensor tensor, const int device_type, const int device_index, const bool non_blocking) +{ + CATCH_RETURN_Tensor( + auto device = c10::Device((c10::DeviceType)device_type, (c10::DeviceIndex)device_index); + res = ResultTensor(tensor->to(device, non_blocking, at::ScalarType(scalar_type), false)); + ); +}*/ +Tensor THSTensor_to_type_and_device_and_non_blocking(const Tensor tensor, int8_t scalar_type, const int device_type, const int device_index,const bool non_blocking) +{ + CATCH_RETURN_Tensor( + auto device = c10::Device((c10::DeviceType)device_type, (c10::DeviceIndex)device_index); + res = ResultTensor(tensor->to(device, non_blocking, at::ScalarType(scalar_type), false)); + ); +} + Tensor THSTensor_triu(const Tensor tensor, const int64_t diagonal, const bool inplace) { CATCH_TENSOR(inplace ? 
tensor->triu_(diagonal) : tensor->triu(diagonal));
diff --git a/src/Native/LibTorchSharp/THSTensor.h b/src/Native/LibTorchSharp/THSTensor.h
index 6af55912b..63bb976d7 100644
--- a/src/Native/LibTorchSharp/THSTensor.h
+++ b/src/Native/LibTorchSharp/THSTensor.h
@@ -1333,6 +1333,10 @@ EXPORT_API(Tensor) THSTensor_to_type(const Tensor tensor, int8_t scalar_type, co

 EXPORT_API(Tensor) THSTensor_to_type_and_device(const Tensor tensor, int8_t scalar_type, const int device_type, const int device_index, const bool copy);

+//EXPORT_API(Tensor) THSTensor_device_and_non_blocking(const Tensor tensor, const int device_type, const int device_index, const bool non_blocking);
+
+EXPORT_API(Tensor) THSTensor_to_type_and_device_and_non_blocking(const Tensor tensor, int8_t scalar_type, const int device_type, const int device_index, const bool non_blocking);
+
 EXPORT_API(void) THSTensor_topk(const Tensor tensor, Tensor* (*allocator)(size_t length), const int k, const int64_t dim, const bool largest, const bool sorted);

 EXPORT_API(Tensor) THSTensor_trunc(const Tensor tensor);
diff --git a/src/TorchSharp/Amp/AutocastMode.cs b/src/TorchSharp/Amp/AutocastMode.cs
new file mode 100644
index 000000000..7b9af69eb
--- /dev/null
+++ b/src/TorchSharp/Amp/AutocastMode.cs
@@ -0,0 +1,54 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace TorchSharp.Amp
+{
+    public class AutocastMode : IDisposable
+    {
+        private bool Enabled, Prev;
+        private torch.ScalarType Dtype;
+        private torch.ScalarType fast_dtype;
+        private torch.Device Device;
+        public AutocastMode(torch.Device dev, torch.ScalarType? dtype = null, bool enabled = true, bool? cache_enabled = null)
+        {
+            fast_dtype = dtype.Value;
+            if (dev.type == DeviceType.CUDA)
+                fast_dtype = torch.get_autocast_gpu_dtype();
+            if (dev.type == DeviceType.CPU)
+                fast_dtype = torch.get_autocast_cpu_dtype();
+
+            bool _cache_enabled = torch.is_autocast_cache_enabled();
+            if (!torch.cuda.is_available() && dev.type == DeviceType.CUDA) // CUDA requested but not available, so autocast cannot be enabled
+                Enabled = false;
+            if (dtype.HasValue)
+                fast_dtype = dtype.Value;
+            if (cache_enabled.HasValue)
+                _cache_enabled = cache_enabled.Value;
+
+            if (dev.type == DeviceType.CUDA) {
+                if (enabled && fast_dtype == torch.ScalarType.BFloat16 && !torch.cuda.is_bf16_supported())
+                    throw new Exception("Current CUDA Device does not support bfloat16. Please switch dtype to float16.");
+            }
+            this.Enabled = enabled;
+
+            this.Prev = torch.is_autocast_cpu_enabled();
+            if (dev.type == DeviceType.CUDA) {
+                this.Prev = torch.is_autocast_gpu_enabled();
+            }
+            throw new NotImplementedException();
+        }
+        public void Dispose()
+        {
+            if (Device.type == DeviceType.CUDA) {
+                if (torch.autocast_decrement_nesting() == 0)
+                    torch.clear_autocast_cache();
+                torch.set_autocast_gpu_dtype(this.fast_dtype);
+                torch.set_autocast_enabled(this.Prev);
+            }
+            throw new NotImplementedException();
+        }
+    }
+}
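Before the GradScaler sources that follow, a rough sketch of the training-loop shape this pair of classes is aiming for. This is hedged: both classes still throw NotImplementedException at this stage, `model`, `optimizer`, `loss_fn`, `input` and `target` are assumed to exist, and `step`/`update` counterparts are not implemented yet:

    // Assumed usage, mirroring PyTorch's torch.amp autocast + GradScaler pattern.
    var scaler = new GradScaler(torch.CUDA);
    using (var ac = new AutocastMode(torch.CUDA))   // enter the autocast region
    {
        var output = model.call(input);             // ops run in the fast dtype
        var loss = loss_fn.call(output, target);
        scaler.scale(loss).backward();              // scale to avoid fp16 gradient underflow
    }
    scaler.unscale(optimizer);                      // recover the true gradients
    // scaler.step(optimizer) / scaler.update() would follow, once implemented.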
diff --git a/src/TorchSharp/Amp/GradScaler.cs b/src/TorchSharp/Amp/GradScaler.cs
new file mode 100644
index 000000000..6da7a9dab
--- /dev/null
+++ b/src/TorchSharp/Amp/GradScaler.cs
@@ -0,0 +1,66 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace TorchSharp.Amp
+{
+    public class GradScaler
+    {
+        private bool Enabled;
+
+        private torch.Tensor _scale, _growth_tracker;
+
+        private float InitScale, GrowthFactor, BackoffFactor, GrowthInterval, InitGrowthTracker;
+
+        //https://github.com/pytorch/pytorch/blob/main/torch/amp/grad_scaler.py
+        public GradScaler(torch.Device dev, float init_scale = 2.0e16f, float growth_factor = 2.0f,
+            float backoff_factor = 0.5f, int growth_interval = 2000, bool enabled = true)
+        {
+            Debug.Assert(dev == torch.CPU || dev == torch.CUDA);
+            this.Enabled = enabled;
+            this.InitScale = init_scale;
+            this.GrowthFactor = growth_factor;
+            this.BackoffFactor = backoff_factor;
+            this.GrowthInterval = growth_interval;
+            this.InitGrowthTracker = 0.0f;
+            throw new NotImplementedException();
+        }
+
+        private void LazyInitScaleGrowthTracker(torch.Device dev)
+        {
+            // A single-element tensor; full(0, ...) would create an empty tensor and numel() would stay 0.
+            this._scale = torch.full(1, this.InitScale, torch.ScalarType.Float32, device: dev);
+            this._growth_tracker = torch.full(1, this.InitGrowthTracker, torch.ScalarType.Float32, device: dev);
+        }
+
+        //private check_scale_growth_tracker
+        public torch.Tensor scale(torch.Tensor output)
+        {
+            if (!Enabled)
+                return output;
+            if (_scale.numel() == 0)
+                this.LazyInitScaleGrowthTracker(output.device);
+            return output * this._scale.to(output.device, output.dtype, true);
+        }
+
+        public torch.Tensor unscale_grads(torch.optim.Optimizer optimizer, torch.Tensor inv_scale, torch.Tensor found_inf, bool allow_fp16)
+        {
+            // TODO: port _unscale_grads_ from PyTorch; a bool return does not fit a Tensor-returning method.
+            throw new NotImplementedException();
+        }
+
+        public void unscale(torch.optim.Optimizer optimizer)
+        {
+            if (!Enabled)
+                return;
+
+
+        }
+        /*public IList<torch.Tensor> scale(IList<torch.Tensor> outputs)
+        {
+
+
+        }*/
+    }
+}
\ No newline at end of file
diff --git a/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs b/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs
index c82b659a3..28b3b6f2f 100644
--- a/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs
+++ b/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs
@@ -293,6 +293,8 @@ internal static extern IntPtr THSTensor_upsample_nearest3d(IntPtr input,

         [DllImport("LibTorchSharp")]
         internal static extern IntPtr THSTensor_to_type_and_device(IntPtr handle, sbyte scalar_type, int device_type, int device_index, [MarshalAs(UnmanagedType.U1)] bool copy);
+        [DllImport("LibTorchSharp")]
+        internal static extern IntPtr THSTensor_to_type_and_device_and_non_blocking(IntPtr handle, sbyte scalar_type, int device_type, int device_index, [MarshalAs(UnmanagedType.U1)] bool non_blocking);

         [DllImport("LibTorchSharp")]
         internal static extern void THSTensor_set_(IntPtr tensor, IntPtr source);
diff --git a/src/TorchSharp/Tensor/Tensor.cs b/src/TorchSharp/Tensor/Tensor.cs
index b8b457063..83924753e 100644
--- 
a/src/TorchSharp/Tensor/Tensor.cs +++ b/src/TorchSharp/Tensor/Tensor.cs @@ -794,6 +794,15 @@ public Tensor to(ScalarType type, torch.Device device, bool copy = false, bool d return new Tensor(res); } + public Tensor to(torch.Device device, ScalarType type, bool non_blocking) + { + torch.InitializeDevice(device); + var res = NativeMethods.THSTensor_to_type_and_device_and_non_blocking(Handle, (sbyte)type, (int)device.type, device.index, non_blocking); + if (res == IntPtr.Zero) + CheckForErrors(); + return new Tensor(res); + } + /// /// Cast the tensor to the given element type. /// diff --git a/src/TorchSharp/Torch.cs b/src/TorchSharp/Torch.cs index 9028d2bdb..5523c8e53 100644 --- a/src/TorchSharp/Torch.cs +++ b/src/TorchSharp/Torch.cs @@ -406,7 +406,6 @@ public static void vector_to_parameters(Tensor vec, IEnumerable= 11) + return true; + } + + return check_bf16_tensor_supported(torch.CUDA); + } + + private static bool check_bf16_tensor_supported(torch.Device dev) + { + try { + var va = torch.tensor(new float[] { 1.0f }, dtype: torch.bfloat16, device: dev); + return true; + } catch { + return false; + } + } } /// diff --git a/src/TorchSharp/TorchSharp.csproj b/src/TorchSharp/TorchSharp.csproj deleted file mode 100644 index 5a102f34e..000000000 --- a/src/TorchSharp/TorchSharp.csproj +++ /dev/null @@ -1,78 +0,0 @@ - - - - - - net6.0;netstandard2.0 - 9.0 - TorchSharp - true - false - false - false - $(DefineConstants);LIBTORCH_$(LibTorchPackageVersion.Replace('.', '_'));CUDA_$(CudaVersionDot.Replace('.', '_')) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - True - True - TensorTyped.tt - - - - - - - $(PackDependsOn); - RealPack - - True - ..\..\build\TorchSharp.snk - - - - - - - - - - - - - - - - - - - - - From defd582da252fe90d5f43f90a963e5797cdb6ea5 Mon Sep 17 00:00:00 2001 From: Dimitri Date: Sun, 18 Feb 2024 13:32:16 -0300 Subject: [PATCH 03/25] Fix mistake gitignore --- .gitignore | 1 - src/Native/LibTorchSharp/THSTensor.cpp | 2 +- src/TorchSharp/Amp/AutocastMode.cs | 6 +- src/TorchSharp/TorchSharp.csproj | 88 ++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 src/TorchSharp/TorchSharp.csproj diff --git a/.gitignore b/.gitignore index f34d405aa..bab8676e1 100644 --- a/.gitignore +++ b/.gitignore @@ -272,4 +272,3 @@ packages/ *.code-workspace /.idea /test/TorchSharpTest/exportsd.py -/src/TorchSharp/TorchSharp.csproj diff --git a/src/Native/LibTorchSharp/THSTensor.cpp b/src/Native/LibTorchSharp/THSTensor.cpp index f4617b5f7..97499ab42 100644 --- a/src/Native/LibTorchSharp/THSTensor.cpp +++ b/src/Native/LibTorchSharp/THSTensor.cpp @@ -1847,7 +1847,7 @@ Tensor THSTensor_to_type_and_device_and_non_blocking(const Tensor tensor, int8_t { CATCH_RETURN_Tensor( auto device = c10::Device((c10::DeviceType)device_type, (c10::DeviceIndex)device_index); - res = ResultTensor(tensor->to(device, non_blocking, at::ScalarType(scalar_type), false)); + res = ResultTensor(tensor->to(device, at::ScalarType(scalar_type),non_blocking, false)); ); } diff --git a/src/TorchSharp/Amp/AutocastMode.cs b/src/TorchSharp/Amp/AutocastMode.cs index 7b9af69eb..c7fdaa857 100644 --- a/src/TorchSharp/Amp/AutocastMode.cs +++ b/src/TorchSharp/Amp/AutocastMode.cs @@ -9,9 +9,9 @@ namespace TorchSharp.Amp public class AutocastMode : IDisposable { private bool Enabled, Prev; - private torch.ScalarType Dtype; - private torch.ScalarType fast_dtype; - private torch.Device Device; + //private torch.ScalarType Dtype = torch.ScalarType.Float32; + private torch.ScalarType fast_dtype 
= torch.ScalarType.Float32; + private torch.Device Device = new torch.Device(DeviceType.CUDA); public AutocastMode(torch.Device dev, torch.ScalarType? dtype = null, bool enabled=true, bool? cache_enabled = null) { fast_dtype = dtype.Value; diff --git a/src/TorchSharp/TorchSharp.csproj b/src/TorchSharp/TorchSharp.csproj new file mode 100644 index 000000000..ef6d6ff94 --- /dev/null +++ b/src/TorchSharp/TorchSharp.csproj @@ -0,0 +1,88 @@ + + + + + + netstandard2.0 + 9.0 + TorchSharp + true + false + false + false + $(DefineConstants);LIBTORCH_$(LibTorchPackageVersion.Replace('.', '_'));CUDA_$(CudaVersionDot.Replace('.', '_')) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + True + True + TensorTyped.tt + + + + + + + $(PackDependsOn); + RealPack + + True + ..\..\build\TorchSharp.snk + + + + + 4 + + + + + 4 + + + + + + + + + + + + + + + + + + + + + From d5324020a35dccd93e67f890131d34fd9f352652 Mon Sep 17 00:00:00 2001 From: Dimitri Date: Sun, 18 Feb 2024 15:37:17 -0300 Subject: [PATCH 04/25] AMP --- src/Native/LibTorchSharp/THSTorch.cpp | 4 +- src/Native/LibTorchSharp/Utils.h | 17 ++++- src/TorchSharp/Amp/AutocastMode.cs | 68 +++++++++++++++++-- src/TorchSharp/NN/Module.cs | 25 ++++++- .../Tensor/Factories/Tensor.Factories.cs | 6 ++ .../Tensor/Factories/tensor_float.cs | 10 ++- src/TorchSharp/Tensor/torch.Autocast.cs | 17 +++++ 7 files changed, 134 insertions(+), 13 deletions(-) diff --git a/src/Native/LibTorchSharp/THSTorch.cpp b/src/Native/LibTorchSharp/THSTorch.cpp index 1a170913c..93f550de6 100644 --- a/src/Native/LibTorchSharp/THSTorch.cpp +++ b/src/Native/LibTorchSharp/THSTorch.cpp @@ -375,7 +375,7 @@ int8_t THSTorch_get_autocast_gpu_dtype() { //TODO: Implement AUTOCAST AMP AND GRADSCALER - //INFO: Enter/Exit function of autocast_mode not need to do in C/C++ only in C# with Disposable C# Can handle all of that function (if exists) + //INFO: Enter/Exit function of autocast_mode not need to do in C/C++ only in C# with Disposable can handle all of that function (if exists) //https://github.com/pytorch/pytorch/blob/main/torch/amp/autocast_mode.py @@ -395,7 +395,7 @@ int THSTorch_autocast_increment_nesting() return at::autocast::increment_nesting(); } -int THSTorch_autocast_decremental_nesting() +int THSTorch_autocast_decrement_nesting() { return at::autocast::decrement_nesting(); } diff --git a/src/Native/LibTorchSharp/Utils.h b/src/Native/LibTorchSharp/Utils.h index 4c3606491..cc0242af1 100644 --- a/src/Native/LibTorchSharp/Utils.h +++ b/src/Native/LibTorchSharp/Utils.h @@ -4,7 +4,7 @@ #include #include "torch/torch.h" - +#include extern thread_local char *torch_last_err; typedef torch::Tensor *Tensor; @@ -59,8 +59,21 @@ struct TensorArray { // Return undefined tensors as nullptr to C# inline Tensor ResultTensor(const at::Tensor & res) { - if (res.defined()) + if (res.defined()) { + /*at::Tensor* resT = new torch::Tensor(res); + if (at::autocast::is_autocast_cache_enabled()){ + if (res.is_cuda()) { + ::std::cout << "IS CUDA" << std::endl; + resT->to(at::autocast::get_autocast_gpu_dtype()); + } + if (res.is_cpu()) { + ::std::cout << "IS CPU" << std::endl; + resT->to(at::autocast::get_autocast_cpu_dtype()); + } + } + return resT;*/ return new torch::Tensor(res); + } else return nullptr; } diff --git a/src/TorchSharp/Amp/AutocastMode.cs b/src/TorchSharp/Amp/AutocastMode.cs index c7fdaa857..43d3805fa 100644 --- a/src/TorchSharp/Amp/AutocastMode.cs +++ b/src/TorchSharp/Amp/AutocastMode.cs @@ -6,20 +6,42 @@ namespace TorchSharp.Amp { - public class AutocastMode : IDisposable + public 
static class Autocast
+    {
+        public static torch.Tensor AutoCast(this torch.Tensor input)
+        {
+            return AutocastMode.GetInstance().CastTensor(input);
+        }
+    }
+    // TODO: make this a true singleton that is entered/exited through IDisposable.
+    public sealed class AutocastMode : IDisposable
     {
         private bool Enabled, Prev;
         //private torch.ScalarType Dtype = torch.ScalarType.Float32;
         private torch.ScalarType fast_dtype = torch.ScalarType.Float32;
         private torch.Device Device = new torch.Device(DeviceType.CUDA);
+        private static AutocastMode instance;
+        /*public static AutocastMode GetInstance(torch.Device dev, torch.ScalarType? dtype = null, bool enabled = true, bool? cache_enabled = null)
+        {
+            if (instance == null)
+                instance = new AutocastMode(dev, dtype, enabled, cache_enabled);
+            return instance;
+        }*/
+        public static AutocastMode GetInstance()
+        {
+            return instance ?? (instance = new AutocastMode(torch.CUDA, cache_enabled: true));
+        }
+
+        private AutocastMode(torch.Device dev, torch.ScalarType? dtype = null, bool enabled = true, bool? cache_enabled = null)
         {
+            //var la = torch.tensor(9);
+            fast_dtype = dtype ?? torch.ScalarType.Float32;
             if (dev.type == DeviceType.CUDA)
                 fast_dtype = torch.get_autocast_gpu_dtype();
             if (dev.type == DeviceType.CPU)
                 fast_dtype = torch.get_autocast_cpu_dtype();
+            IntPtr ptr = IntPtr.Zero;

             bool _cache_enabled = torch.is_autocast_cache_enabled();
             if (!torch.cuda.is_available() && dev.type == DeviceType.CUDA) // CUDA requested but not available, so autocast cannot be enabled
                 Enabled = false;
             if (dtype.HasValue)
                 fast_dtype = dtype.Value;
             if (cache_enabled.HasValue)
                 _cache_enabled = cache_enabled.Value;

             if (dev.type == DeviceType.CUDA) {
                 if (enabled && fast_dtype == torch.ScalarType.BFloat16 && !torch.cuda.is_bf16_supported())
                     throw new Exception("Current CUDA Device does not support bfloat16. Please switch dtype to float16.");
             }
             this.Enabled = enabled;

             this.Prev = torch.is_autocast_cpu_enabled();
             if (dev.type == DeviceType.CUDA) {
                 this.Prev = torch.is_autocast_gpu_enabled();
             }
-            throw new NotImplementedException();
+
+            torch.set_autocast_cache_enabled(_cache_enabled);
+            torch.set_autocast_enabled(this.Enabled);
+            //throw new NotImplementedException();
         }
+
+        /*internal void Cast(torch.Tensor tensor)
+        {
+            tensor.to(fast_dtype, tensor.device);
+        }*/
+
+        internal torch.Tensor CastTensor(torch.Tensor tensor)
+        {
+            if (!Enabled)
+                return tensor;
+            return tensor.to(fast_dtype, tensor.device);
+        }
+        /*public IDisposable Enter()
+        {
+
+            return this;
+        }*/
         public void Dispose()
         {
+            this.Enabled = false;
             if (Device.type == DeviceType.CUDA) {
                 if (torch.autocast_decrement_nesting() == 0)
                     torch.clear_autocast_cache();
                 torch.set_autocast_gpu_dtype(this.fast_dtype);
-                torch.set_autocast_enabled(this.Prev);
+                //torch.set_autocast_enabled(this.Prev);
+                torch.set_autocast_enabled(false);
+                torch.set_autocast_cache_enabled(false);
             }
-            throw new NotImplementedException();
+
+            if (Device.type == DeviceType.CPU) {
+                if (torch.autocast_decrement_nesting() == 0)
+                    torch.clear_autocast_cache();
+                //torch.set_autocast_enabled(this.Prev);
+                torch.set_autocast_cpu_dtype(this.fast_dtype);
+                torch.set_autocast_enabled(false);
+                torch.set_autocast_cache_enabled(false);
+            }
+            //throw new NotImplementedException();
         }
     }
 }
diff --git a/src/TorchSharp/NN/Module.cs b/src/TorchSharp/NN/Module.cs
index 4ca8a3258..911f29fd9 100644
--- a/src/TorchSharp/NN/Module.cs
+++ b/src/TorchSharp/NN/Module.cs
@@ -681,6 +681,8 @@ public virtual void register_buffer(string name, Tensor tensor, bool persistent

             if (!_internal_buffers.TryAdd(name, (tensor, persistent)))
                 throw new InvalidOperationException($"Tensor {name} is already registered.");
+
+
         }

         ///
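What the registration-time casts in this file are reaching for, as a hedged illustration (Sequential/Linear and Float16 are arbitrary choices, and the behavior assumes the casting paths sketched in these hunks are active):

    torch.set_autocast_gpu_dtype(torch.ScalarType.Float16);
    using (var ac = AutocastMode.GetInstance())
    {
        // register_module runs for "lin" here; with autocast active, the
        // submodule is converted to the autocast dtype at registration time,
        // so its weight would end up Float16 rather than the usual Float32.
        var net = torch.nn.Sequential(("lin", torch.nn.Linear(16, 4)));
    }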
@@ -700,6 +702,13 @@ public virtual void register_parameter(string name, Parameter param)

             if (!_internal_params.TryAdd(name, param))
                 throw new InvalidOperationException($"Parameter {name} is already registered.");
+
+            /*if (is_autocast_cache_enabled()) {
+                if (is_autocast_gpu_enabled())
+                    param = param.to(get_autocast_dtype(CUDA)).AsParameter();
+                if (is_autocast_cpu_enabled())
+                    param = param.to(get_autocast_dtype(CPU)).AsParameter();
+            }*/
         }

         ///
@@ -740,7 +749,15 @@ public virtual void register_module(string name, Module submodule)
                 }

                 submodule.RegisterComponents();
-
+                if (!is_autocast_cache_enabled()) {
+                    _internal_submodules.Add(name, submodule);
+                    return;
+                }
+                if (is_autocast_gpu_enabled())
+                    submodule = submodule.to(get_autocast_dtype(CUDA));
+                if (is_autocast_cpu_enabled())
+                    submodule = submodule.to(get_autocast_dtype(CPU));
+                _internal_submodules.Add(name, submodule);
             }
         }
@@ -1042,6 +1059,8 @@ protected virtual void RegisterComponents()
             _areComponentsRegistered = true;
         }

+
+
         protected static (Device device, ScalarType dtype) GetDefaultDeviceAndType(Device device = null, ScalarType? dtype = null)
         {
             if (!dtype.HasValue)
@@ -1295,6 +1314,10 @@ public TResult call(T input)
                     input = modified;
                 }

+                /*if (is_autocast_cache_enabled()) { // Should the input be cast here, for better management?
+                    if(input is Tensor)
+                }*/
+
                 var result = forward(input);

                 // Call post-hooks, if available.
diff --git a/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs b/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs
index 9bc1c562f..899342207 100644
--- a/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs
+++ b/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs
@@ -179,6 +179,12 @@ private static Tensor _tensor_generic(Array rawArray, ReadOnlySpan<long> dimensi
                 tensor.rename_(names);
             }

+            if (!is_autocast_cache_enabled())
+                return tensor;
+            if (is_autocast_gpu_enabled())
+                tensor = tensor.to(get_autocast_gpu_dtype());
+            if (is_autocast_cpu_enabled())
+                tensor = tensor.to(get_autocast_cpu_dtype());
             return tensor;
         }
     }
diff --git a/src/TorchSharp/Tensor/Factories/tensor_float.cs b/src/TorchSharp/Tensor/Factories/tensor_float.cs
index 562c826f2..f33d1b90a 100644
--- a/src/TorchSharp/Tensor/Factories/tensor_float.cs
+++ b/src/TorchSharp/Tensor/Factories/tensor_float.cs
@@ -3,6 +3,7 @@
 using System.Collections.Generic;
 using System.Diagnostics.Contracts;
 using System.Linq;
+using TorchSharp.Amp;
 using static TorchSharp.PInvoke.NativeMethods;

 #nullable enable
@@ -18,7 +19,14 @@ public static Tensor tensor(float scalar, Device? device = null, bool requires_g
             device = InitializeDevice(device);
             var handle = THSTensor_newFloat32Scalar(scalar, (int)device.type, device.index, requires_grad);
             if (handle == IntPtr.Zero) { CheckForErrors(); }
-            return new Tensor(handle);
+
+
+            var t = new Tensor(handle).AutoCast();
+            /*if (is_autocast_cache_enabled()) {
+                if (is_autocast_gpu_enabled())
+                    return t.to(get_autocast_gpu_dtype()); // this works, but it should be applied in all tensor factories...
+            }*/
+            return t;
         }

         ///
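The effect these factory changes are sketching, as a hedged example (assuming a CUDA device and the AutoCast() call above remaining in place):

    torch.set_autocast_gpu_dtype(torch.ScalarType.Float16);
    using (var ac = AutocastMode.GetInstance())
    {
        var t = torch.tensor(3.14f, device: torch.CUDA);
        // With AutoCast() applied inside the factory, t.dtype would be
        // Float16 here instead of the usual Float32.
    }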
diff --git a/src/TorchSharp/Tensor/torch.Autocast.cs b/src/TorchSharp/Tensor/torch.Autocast.cs
index 6745133be..e3fc33f52 100644
--- a/src/TorchSharp/Tensor/torch.Autocast.cs
+++ b/src/TorchSharp/Tensor/torch.Autocast.cs
@@ -9,6 +9,15 @@ public static bool is_autocast_cache_enabled()
         {
             return THSTorch_is_autocast_cache_enabled();
         }
+
+        public static bool is_autocast_enabled(Device device)
+        {
+            if (device.type == DeviceType.CPU)
+                return THSTorch_is_autocast_cpu_enabled();
+            if (device.type == DeviceType.CUDA)
+                return THSTorch_is_autocast_gpu_enabled();
+            return THSTorch_is_autocast_cache_enabled();
+        }
         public static bool is_autocast_cpu_enabled()
         {
             return THSTorch_is_autocast_cpu_enabled();
@@ -26,6 +35,14 @@ public static bool is_autocast_hpu_enabled()
             return THSTorch_is_autocast_hpu_enabled();
         }

+        public static ScalarType get_autocast_dtype(Device device)
+        {
+            if (device.type == DeviceType.CPU)
+                return get_autocast_cpu_dtype();
+            if (device.type == DeviceType.CUDA)
+                return get_autocast_gpu_dtype();
+            return ScalarType.Float32;
+        }
         public static ScalarType get_autocast_cpu_dtype()
         {
             return (ScalarType)THSTorch_get_autocast_cpu_dtype();
From 0b839dbbb5bff741162ddd14ac270660325f3fca Mon Sep 17 00:00:00 2001
From: Dimitri
Date: Sun, 18 Feb 2024 21:21:49 -0300
Subject: [PATCH 05/25] Add Print Modules (still in progress)

---
 src/Native/LibTorchSharp/THSConvolution.cpp   |  8 ++++++++
 src/Native/LibTorchSharp/THSNN.cpp            | 12 ++++++++++++
 src/Native/LibTorchSharp/THSNN.h              |  5 +++++
 src/Native/LibTorchSharp/Utils.h              |  1 -
 src/TorchSharp/PInvoke/LibTorchSharp.THSNN.cs |  3 +++
 src/TorchSharp/Tensor/torch.Utilities.cs      |  6 ++++++
 6 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/src/Native/LibTorchSharp/THSConvolution.cpp b/src/Native/LibTorchSharp/THSConvolution.cpp
index e1500d939..27e2e62a7 100644
--- a/src/Native/LibTorchSharp/THSConvolution.cpp
+++ b/src/Native/LibTorchSharp/THSConvolution.cpp
@@ -683,6 +683,7 @@ void THSNN_Conv1d_set_weight(const NNModule module, const Tensor weight)
     set_weight(module, weight);
 }

+
 NNModule THSNN_Conv2d_ctor(const int64_t inputChannel, const int64_t outputChannel,
     const int64_t kernelSize, const int64_t stride, const int64_t padding,
     const int64_t dilation, const int64_t paddingMode, const int64_t groups, const bool bias,
@@ -757,6 +758,13 @@ void THSNN_Conv2d_set_weight(const NNModule module, const Tensor weight)
     set_weight(module, weight);
 }

+/*void THSNN_Conv2d_print_options(const NNModule module) {
+    auto opt = (*module)->as<torch::nn::Conv2d>()->options;
+    ::std::cout << "Conv2d (" << std::to_string(opt.in_channels()) << "," << std::to_string(opt.out_channels()) << ")" << std::endl;
+}*/
+
+
+
 NNModule THSNN_Conv3d_ctor(const int64_t inputChannel, const int64_t outputChannel,
     const int64_t kernelSize, const int64_t stride, const int64_t padding,
     const int64_t dilation, const int64_t paddingMode, const int64_t groups, const bool bias,
diff --git a/src/Native/LibTorchSharp/THSNN.cpp b/src/Native/LibTorchSharp/THSNN.cpp
index 12b6a461a..a164f0f67 100644
--- a/src/Native/LibTorchSharp/THSNN.cpp
+++ b/src/Native/LibTorchSharp/THSNN.cpp
@@ -1334,4 +1334,16 @@ Tensor THSNN_scaled_dot_product_attention(const Tensor query, const Tensor key,
     auto mask = attention_mask == nullptr ? 
c10::nullopt : c10::optional<at::Tensor>(*attention_mask);
     CATCH_TENSOR(torch::scaled_dot_product_attention(*query, *key, *value, mask, p, casual));
+}
+
+void THSNN_Print_Module(const NNModule module) {
+    if (auto* conv = (*module)->as<torch::nn::Conv2d>())
+    {
+        auto opt = conv->options;
+        ::std::cout << conv->name() << "(" << opt.in_channels() << "," << opt.out_channels() << ", K=" << opt.kernel_size() << ", S=" << opt.stride() << ")" << std::endl; //TODO: add padding
+    }
+    if (auto* bn = (*module)->as<torch::nn::BatchNorm2d>()) {
+        auto opt = bn->options;
+        ::std::cout << bn->name() << "(" << opt.num_features() << ", Eps=" << opt.eps() << ", M=" << (opt.momentum().has_value() ? opt.momentum().value() : 0) << ")" << std::endl; //TODO: add more fields
+    }
 }
\ No newline at end of file
diff --git a/src/Native/LibTorchSharp/THSNN.h b/src/Native/LibTorchSharp/THSNN.h
index 07d247d87..49d293113 100644
--- a/src/Native/LibTorchSharp/THSNN.h
+++ b/src/Native/LibTorchSharp/THSNN.h
@@ -145,6 +145,7 @@ EXPORT_API(Tensor) THSNN_Conv2d_weight(const NNModule module);
 EXPORT_API(void) THSNN_Conv2d_set_weight(const NNModule module, const Tensor weight);
 EXPORT_API(Tensor) THSNN_Conv2d_bias(const NNModule module);
 EXPORT_API(void) THSNN_Conv2d_set_bias(const NNModule module, const Tensor bias);
+//EXPORT_API(void) THSNN_Conv2d_print_options(const NNModule module);
 EXPORT_API(NNModule) THSNN_Conv3d_ctor(const int64_t inputChannel, const int64_t outputChannel, const int64_t kernelSize, const int64_t stride, const int64_t padding, const int64_t dilation, const int64_t paddingMode, const int64_t groups, const bool bias, NNAnyModule* outAsAnyModule);
 EXPORT_API(NNModule) THSNN_Conv3d_ctor_1(const int64_t inputChannel, const int64_t outputChannel, const int64_t kernelX, const int64_t kernelY, const int64_t kernelZ, const int64_t strideX, const int64_t strideY, const int64_t strideZ, const int64_t paddingX, const int64_t paddingY, const int64_t paddingZ, const int64_t dilationX, const int64_t dilationY, const int64_t dilationZ, const int64_t paddingMode, const int64_t groups, const bool bias, NNAnyModule* outAsAnyModule);
 EXPORT_API(Tensor) THSNN_Conv3d_forward(const NNModule module, const Tensor tensor);
@@ -592,3 +593,7 @@ EXPORT_API(PackedSequence) THSNN_pack_padded_sequence(Tensor input, Tensor lengt
 EXPORT_API(void) THSNN_pad_packed_sequence(PackedSequence sequence, bool batch_first, double padding_value, int64_t total_length, Tensor* res1, Tensor* res2);
 EXPORT_API(Tensor) THSNN_pad_sequence(const Tensor* sequences, const int sequences_len, bool batch_first, double padding_value);
 EXPORT_API(PackedSequence) THSNN_pack_sequence(const Tensor* sequences, int sequences_len, bool enforce_sorted);
+
+
+// Printer Modules
+EXPORT_API(void) THSNN_Print_Module(const NNModule module);
diff --git a/src/Native/LibTorchSharp/Utils.h b/src/Native/LibTorchSharp/Utils.h
index cc0242af1..892e0e2ec 100644
--- a/src/Native/LibTorchSharp/Utils.h
+++ b/src/Native/LibTorchSharp/Utils.h
@@ -2,7 +2,6 @@
 #pragma once

 #include
-
 #include "torch/torch.h"
 #include
 extern thread_local char *torch_last_err;
diff --git a/src/TorchSharp/PInvoke/LibTorchSharp.THSNN.cs b/src/TorchSharp/PInvoke/LibTorchSharp.THSNN.cs
index 8bef36230..870e4e647 100644
--- a/src/TorchSharp/PInvoke/LibTorchSharp.THSNN.cs
+++ b/src/TorchSharp/PInvoke/LibTorchSharp.THSNN.cs
@@ -1318,6 +1318,9 @@ internal static extern IntPtr THSNN_custom_module(

         [DllImport("LibTorchSharp")]
         internal static extern IntPtr THSNN_MaxUnpool2d_ctor(IntPtr pkernelSize, int kernelSizeLength, IntPtr pstrides, int stridesLength, IntPtr 
pPadding, int paddingLength, out IntPtr pBoxedModule);
+
+        [DllImport("LibTorchSharp")]
+        internal static extern void THSNN_Print_Module(torch.nn.Module.HType module);
     }
 #pragma warning restore CA2101
 }
diff --git a/src/TorchSharp/Tensor/torch.Utilities.cs b/src/TorchSharp/Tensor/torch.Utilities.cs
index 42745a786..91d79539a 100644
--- a/src/TorchSharp/Tensor/torch.Utilities.cs
+++ b/src/TorchSharp/Tensor/torch.Utilities.cs
@@ -2,6 +2,7 @@
 #nullable enable
 using System;
 using System.Diagnostics.Contracts;
+using TorchSharp.PInvoke;
 using static TorchSharp.PInvoke.NativeMethods;

 namespace TorchSharp
@@ -79,5 +80,10 @@ public static ScalarType promote_types(ScalarType type1, ScalarType type2)

         [Obsolete("not implemented", true)]
         public static void _assert(Func condition, string message) => throw new NotImplementedException();
+
+        public static void PrintModule(torch.nn.Module module)
+        {
+            NativeMethods.THSNN_Print_Module(module.handle);
+        }
     }
 }
\ No newline at end of file
From 98cabfa4496b1a9bb1bbc996cbf931dd73fd2961 Mon Sep 17 00:00:00 2001
From: Dimitri
Date: Sun, 18 Feb 2024 22:49:43 -0300
Subject: [PATCH 06/25] Add printing for more module types

---
 src/Native/LibTorchSharp/THSNN.cpp           | 47 +++++++++++++++++---
 src/TorchSharp/NN/Dropout2d.cs               |  4 +-
 src/TorchSharp/NN/Normalization/LayerNorm.cs |  4 +-
 src/TorchSharp/Tensor/torch.Utilities.cs     | 14 ++++++
 4 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/src/Native/LibTorchSharp/THSNN.cpp b/src/Native/LibTorchSharp/THSNN.cpp
index a164f0f67..430c17f5e 100644
--- a/src/Native/LibTorchSharp/THSNN.cpp
+++ b/src/Native/LibTorchSharp/THSNN.cpp
@@ -1337,13 +1337,48 @@ Tensor THSNN_scaled_dot_product_attention(const Tensor query, const Tensor key,
 }

 void THSNN_Print_Module(const NNModule module) {
+    std::ostringstream oss;
+    const std::string name = module->get()->name();
+    oss << name << "(";
+    if (auto* conv2 = (*module)->as<torch::nn::Conv2d>())
+    {
+        const auto opt = &conv2->options;
+        oss << opt->in_channels() << "," << opt->out_channels() << ", K=" << opt->kernel_size();
+        oss << ", S=" << opt->stride() << ", P=" << opt->padding().index() << ", D=" << opt->dilation();
+        oss << ", G=" << opt->groups() << ", B=" << opt->bias();
+    }
+    if (auto* bn2 = (*module)->as<torch::nn::BatchNorm2d>()) {
+        const auto opt = &bn2->options;
+        oss << opt->num_features() << ", Eps=" << opt->eps() << ", M=" << (opt->momentum().has_value() ? std::to_string(opt->momentum().value()) : "NaN");
+        oss << ", A=" << opt->affine() << ", T=" << opt->track_running_stats();
+    }
+    if (auto* ln = (*module)->as<torch::nn::LayerNorm>()) // Not printed yet: TorchSharp does not construct LayerNorm through this C++ module type
+    {
+        const auto opt = ln->options;
+        oss << opt.eps() << ", Elem=" << opt.elementwise_affine() << ", N=[";
+        for (int64_t i = 0; i < static_cast<int64_t>(opt.normalized_shape().size()); i++)
+            oss << opt.normalized_shape()[i] << ((i == static_cast<int64_t>(opt.normalized_shape().size() - 1)) ? "]" : ",");
+    }
+    if (const auto* d2 = (*module)->as<torch::nn::Dropout2d>()) // Not printed yet: TorchSharp does not construct Dropout2d through this C++ module type
+    {
+        auto opt = d2->options;
+        oss << opt.p() << ", Inplace=" << opt.inplace();
+    }
+    if (auto* avp2 = (*module)->as<torch::nn::AdaptiveAvgPool2d>())
+    {
+        const auto opt = &avp2->options;
+        oss << "[";
+        for (int64_t i = 0; i < opt->output_size().size(); i++)
+            oss << opt->output_size()->at(i).value() << ((i == opt->output_size().size() - 1) ? "]" : ",");
+    }
+    if (auto* amp2 = (*module)->as<torch::nn::AdaptiveMaxPool2d>())
+    {
+        const auto opt = &amp2->options;
+        oss << "[";
+        for (int64_t i = 0; i < opt->output_size().size(); i++)
+            oss << opt->output_size()->at(i).value() << ((i == opt->output_size().size() - 1) ? "]" : ",");
+    }
+
+    oss << ")";
+    std::cout << oss.str() << std::endl;
 }
\ No newline at end of file
diff --git a/src/TorchSharp/NN/Dropout2d.cs b/src/TorchSharp/NN/Dropout2d.cs
index 363cb40d5..49db468d7 100644
--- a/src/TorchSharp/NN/Dropout2d.cs
+++ b/src/TorchSharp/NN/Dropout2d.cs
@@ -33,8 +33,8 @@ public override Tensor forward(Tensor input)
         protected internal override nn.Module _to(DeviceType deviceType, int deviceIndex = -1) => this;
         protected internal override nn.Module _to(ScalarType dtype) => this;

-        private bool inplace;
-        private double p;
+        internal bool inplace; // internal so that PrintModule can access it
+        internal double p; // internal so that PrintModule can access it
     }
 }
diff --git a/src/TorchSharp/NN/Normalization/LayerNorm.cs b/src/TorchSharp/NN/Normalization/LayerNorm.cs
index 7010e754e..6ed8dae45 100644
--- a/src/TorchSharp/NN/Normalization/LayerNorm.cs
+++ b/src/TorchSharp/NN/Normalization/LayerNorm.cs
@@ -18,8 +18,8 @@ namespace Modules
     ///
     public sealed class LayerNorm : torch.nn.Module<Tensor, Tensor>
     {
-        private long[] _normalized_shape;
-        private double _eps;
+        internal long[] _normalized_shape;
+        internal double _eps;

         internal LayerNorm(long[] normalized_shape, double eps, bool elementwise_affine, bool bias, Device? device, ScalarType? dtype) : base(nameof(LayerNorm))
         {
diff --git a/src/TorchSharp/Tensor/torch.Utilities.cs b/src/TorchSharp/Tensor/torch.Utilities.cs
index 91d79539a..7525ea6c9 100644
--- a/src/TorchSharp/Tensor/torch.Utilities.cs
+++ b/src/TorchSharp/Tensor/torch.Utilities.cs
@@ -2,6 +2,7 @@
 #nullable enable
 using System;
 using System.Diagnostics.Contracts;
+using TorchSharp.Modules;
 using TorchSharp.PInvoke;
 using static TorchSharp.PInvoke.NativeMethods;

@@ -83,6 +84,19 @@ public static ScalarType promote_types(ScalarType type1, ScalarType type2)

         public static void PrintModule(torch.nn.Module module)
         {
+            if (module is Dropout2d drop2d) {
+                Console.WriteLine($"{module.GetName()}({drop2d.p}, {drop2d.inplace})");
+                return;
+            }
+
+            if (module is LayerNorm ln) {
+                string str = "[";
+                for (int i = 0; i < ln._normalized_shape.Length; i++)
+                    str += ln._normalized_shape[i] + ",";
+                str = str.TrimEnd(',') + "]";
+                Console.WriteLine($"{module.GetName()}({ln._eps}, {str})");
+                return;
+            }
             NativeMethods.THSNN_Print_Module(module.handle);
         }
     }
From 669b4facd7eac6dcd6ba01c25c2be0831c9ffe67 Mon Sep 17 00:00:00 2001
From: Dimitri
Date: Tue, 20 Feb 2024 16:08:27 -0300
Subject: [PATCH 07/25] Fix some dotnet build issues. Tests still need fixing.

---
 .gitignore                                 |  22 +++
 .../FileRestitcher.Tests.csproj            |   2 +-
 .../FileRestitcher/FileRestitcher.csproj   |   6 +-
 src/Examples.Utils/Examples.Utils.csproj   |   3 +-
 src/Examples.Utils/Vocab.cs                |   9 +-
 src/Examples/Examples.csproj               |   2 +-
 src/FSharp.Examples/FSharp.Examples.fsproj |   2 +-
 src/Native/build.cmd                       | 151 ------------------
 src/TorchSharp/TorchSharp.csproj           |  28 ++--
 9 files changed, 51 insertions(+), 174 deletions(-)
 delete mode 100644 src/Native/build.cmd

diff --git a/.gitignore b/.gitignore
index bab8676e1..a17061b33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -272,3 +272,25 @@ packages/
 *.code-workspace
 /.idea
 /test/TorchSharpTest/exportsd.py
+/src/Native/CMakeFiles
+/src/Native/LibTorchSharp/CMakeFiles
+/src/Native/ALL_BUILD.vcxproj
+/src/Native/ALL_BUILD.vcxproj.filters
+/src/Native/build.cmd
+/src/Native/CMakeCache.txt
+/src/Native/cmake_install.cmake
+/src/Native/INSTALL.vcxproj
+/src/Native/INSTALL.vcxproj.filters
+/src/Native/install_manifest.txt
+/src/Native/LibTorchSharp/ALL_BUILD.vcxproj
+/src/Native/LibTorchSharp/ALL_BUILD.vcxproj.filters
+/src/Native/LibTorchSharp/cmake_install.cmake
+/src/Native/LibTorchSharp/INSTALL.vcxproj
+/src/Native/LibTorchSharp/INSTALL.vcxproj.filters
+/src/Native/LibTorchSharp/LibTorchSharp.sln
+/src/Native/LibTorchSharp/LibTorchSharp.vcxproj
+/src/Native/LibTorchSharp/LibTorchSharp.vcxproj.filters
+/src/Native/Project.sln
+/src/Native/ZERO_CHECK.vcxproj
+/src/Native/ZERO_CHECK.vcxproj.filters
+/src/FSharp.Examples/FSharp.Examples.fsproj
diff --git a/pkg/FileRestitcher/FileRestitcher.Tests/FileRestitcher.Tests.csproj b/pkg/FileRestitcher/FileRestitcher.Tests/FileRestitcher.Tests.csproj
index e76338122..bc96dbe96 100644
--- a/pkg/FileRestitcher/FileRestitcher.Tests/FileRestitcher.Tests.csproj
+++ b/pkg/FileRestitcher/FileRestitcher.Tests/FileRestitcher.Tests.csproj
@@ -3,7 +3,7 @@
     false
-
+    net472;netstandard2.0;$(TargetFrameworks)
     net6.0
     net472;$(TargetFrameworks)
diff --git a/pkg/FileRestitcher/FileRestitcher/FileRestitcher.csproj b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.csproj
index bbfbab0cc..3b4d8b200 100644
--- a/pkg/FileRestitcher/FileRestitcher/FileRestitcher.csproj
+++ b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.csproj
@@ -1,11 +1,11 @@
-
+
     false
     Library
-    netstandard2.0
+    netstandard2.0;net6.0
     false
     x64
-
+
diff --git a/src/Examples.Utils/Examples.Utils.csproj b/src/Examples.Utils/Examples.Utils.csproj
index 1f6d5a081..6a5a09eeb 100644
--- a/src/Examples.Utils/Examples.Utils.csproj
+++ b/src/Examples.Utils/Examples.Utils.csproj
@@ -5,7 +5,8 @@
     9.0
     net6.0
-    net472;$(TargetFrameworks)
+    net472;$(TargetFrameworks);netstandard2.0
+    net6.0
diff --git a/src/Examples.Utils/Vocab.cs b/src/Examples.Utils/Vocab.cs
index 743e4c55c..7a1deb298 100644
--- a/src/Examples.Utils/Vocab.cs
+++ b/src/Examples.Utils/Vocab.cs
@@ -88,12 +88,17 @@ public void Add(KeyValuePair item)
         {
             Add(item.Key, item.Value);
         }
-
+#if NETSTANDARD2_0
+        public bool TryGetValue(string key, out int value)
+        {
+            return _dict.TryGetValue(key, out value);
+        }
+#else
         public bool TryGetValue(string key, [MaybeNullWhen(false)] out int value)
         {
             return _dict.TryGetValue(key, out value);
         }
-
+#endif
         private Dictionary<string, int> _dict = new Dictionary<string, int>();
         private int _last = 0;
     }
diff --git a/src/Examples/Examples.csproj b/src/Examples/Examples.csproj
index f6fe32680..79c448399 100644
--- a/src/Examples/Examples.csproj
+++ b/src/Examples/Examples.csproj
@@ -5,7 +5,7 @@
     true
     true
-
+    net472;netstandard2.0;$(TargetFrameworks)
     9.0
     net6.0
     net472;$(TargetFrameworks)
diff --git 
a/src/FSharp.Examples/FSharp.Examples.fsproj b/src/FSharp.Examples/FSharp.Examples.fsproj index 900e25caa..a6ecbb723 100644 --- a/src/FSharp.Examples/FSharp.Examples.fsproj +++ b/src/FSharp.Examples/FSharp.Examples.fsproj @@ -6,7 +6,7 @@ true net6.0 - net472;$(TargetFrameworks) + net472;netstandard2.0;$(TargetFrameworks) net6.0 true Examples diff --git a/src/Native/build.cmd b/src/Native/build.cmd deleted file mode 100644 index c805b2608..000000000 --- a/src/Native/build.cmd +++ /dev/null @@ -1,151 +0,0 @@ -@if not defined _echo @echo off -setlocal - -:: Store current script directory before %~dp0 gets affected by another process later. -set __currentScriptDir=%~dp0 - -:SetupArgs -:: Initialize the args that will be passed to cmake -set __binDir=%__currentScriptDir%..\..\bin -set __rootDir=%__currentScriptDir%..\.. -set __CMakeBinDir="" -set __IntermediatesDir="" -set __BuildArch=x64 -set __VCBuildArch=x86_amd64 -set CMAKE_BUILD_TYPE=Debug -set LIBTORCH_PATH="" - -:Arg_Loop -if [%1] == [] goto :ToolsVersion -if /i [%1] == [Release] ( set CMAKE_BUILD_TYPE=Release&&shift&goto Arg_Loop) -if /i [%1] == [Debug] ( set CMAKE_BUILD_TYPE=Debug&&shift&goto Arg_Loop) - -if /i [%1] == [x86] ( set __BuildArch=x86&&set __VCBuildArch=x86&&shift&goto Arg_Loop) -if /i [%1] == [x64] ( set __BuildArch=x64&&set __VCBuildArch=x86_amd64&&shift&goto Arg_Loop) -if /i [%1] == [amd64] ( set __BuildArch=x64&&set __VCBuildArch=x86_amd64&&shift&goto Arg_Loop) - -if /i [%1] == [--libtorchpath] ( set LIBTORCH_PATH=%2&&shift&goto Arg_Loop) - -shift -goto :Arg_Loop - -:ToolsVersion -if defined VisualStudioVersion goto :RunVCVars - -set _VSWHERE="%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -if exist %_VSWHERE% ( - for /f "usebackq tokens=*" %%i in (`%_VSWHERE% -latest -prerelease -property installationPath`) do set _VSCOMNTOOLS=%%i\Common7\Tools -) -if not exist "%_VSCOMNTOOLS%" set _VSCOMNTOOLS=%VS140COMNTOOLS% -if not exist "%_VSCOMNTOOLS%" goto :MissingVersion - - -set "VSCMD_START_DIR=%__currentScriptDir%" -call "%_VSCOMNTOOLS%\VsDevCmd.bat" - -:RunVCVars -if "%VisualStudioVersion%"=="17.0" ( - goto :VS2022 -) else if "%VisualStudioVersion%"=="16.0" ( - goto :VS2019 -) else if "%VisualStudioVersion%"=="15.0" ( - goto :VS2017 -) else if "%VisualStudioVersion%"=="14.0" ( - goto :VS2015 -) - -:MissingVersion -:: Can't find VS 2015, 2017 or 2019 -echo Error: Visual Studio 2015, 2017 or 2019 required -echo Please see https://github.com/dotnet/machinelearning/tree/master/Documentation for build instructions. 
-exit /b 1 - -:VS2022 -:: Setup vars for VS2022 -set __PlatformToolset=v143 -set __VSVersion=17 2022 -if NOT "%__BuildArch%" == "arm64" ( - :: Set the environment for the native build - call "%VS160COMNTOOLS%..\..\VC\Auxiliary\Build\vcvarsall.bat" %__VCBuildArch% -) -goto :SetupDirs - -:VS2019 -:: Setup vars for VS2019 -set __PlatformToolset=v142 -set __VSVersion=16 2019 -if NOT "%__BuildArch%" == "arm64" ( - :: Set the environment for the native build - call "%VS160COMNTOOLS%..\..\VC\Auxiliary\Build\vcvarsall.bat" %__VCBuildArch% -) -goto :SetupDirs - -:VS2017 -:: Setup vars for VS2017 -set __PlatformToolset=v141 -set __VSVersion=15 2017 -if NOT "%__BuildArch%" == "arm64" ( - :: Set the environment for the native build - call "%VS150COMNTOOLS%..\..\VC\Auxiliary\Build\vcvarsall.bat" %__VCBuildArch% -) -goto :SetupDirs - -:VS2015 -:: Setup vars for VS2015build -set __PlatformToolset=v140 -set __VSVersion=14 2015 -if NOT "%__BuildArch%" == "arm64" ( - :: Set the environment for the native build - call "%VS140COMNTOOLS%..\..\VC\vcvarsall.bat" %__VCBuildArch% -) - -:SetupDirs -:: Setup to cmake the native components -echo Commencing native build of dotnet/machinelearning -echo. - -if %__CMakeBinDir% == "" ( - set "__CMakeBinDir=%__binDir%\%__BuildArch%.%CMAKE_BUILD_TYPE%\Native" -) -if %__IntermediatesDir% == "" ( - set "__IntermediatesDir=%__binDir%\obj\%__BuildArch%.%CMAKE_BUILD_TYPE%\Native" -) -set "__CMakeBinDir=%__CMakeBinDir:\=/%" -set "__IntermediatesDir=%__IntermediatesDir:\=/%" - -:: Check that the intermediate directory exists so we can place our cmake build tree there -if not exist "%__IntermediatesDir%" md "%__IntermediatesDir%" - -:: Regenerate the VS solution - -set "__gen-buildsys-win-path=%__currentScriptDir%\gen-buildsys-win.bat" -set "__source-code-path=%__currentScriptDir%" - -echo Calling "%__gen-buildsys-win-path%" "%__source-code-path%" "%__VSVersion%" %__BuildArch% -pushd "%__IntermediatesDir%" -call "%__gen-buildsys-win-path%" "%__source-code-path%" "%__VSVersion%" %__BuildArch% -popd - -:CheckForProj -:: Check that the project created by Cmake exists -if exist "%__IntermediatesDir%\INSTALL.vcxproj" goto BuildNativeProj -goto :Failure - -:BuildNativeProj -:: Build the project created by Cmake -set __msbuildArgs=/p:Platform=%__BuildArch% /p:PlatformToolset="%__PlatformToolset%" - -cd %__rootDir% - -echo msbuild "%__IntermediatesDir%\INSTALL.vcxproj" /t:build /p:Configuration=%CMAKE_BUILD_TYPE% %__msbuildArgs% -call msbuild "%__IntermediatesDir%\INSTALL.vcxproj" /t:build /p:Configuration=%CMAKE_BUILD_TYPE% %__msbuildArgs% -IF ERRORLEVEL 1 ( - goto :Failure -) -echo Done building Native components -exit /B 0 - -:Failure -:: Build failed -echo Failed to generate native component build project! 
-exit /b 1 diff --git a/src/TorchSharp/TorchSharp.csproj b/src/TorchSharp/TorchSharp.csproj index ef6d6ff94..054f5c18a 100644 --- a/src/TorchSharp/TorchSharp.csproj +++ b/src/TorchSharp/TorchSharp.csproj @@ -3,14 +3,14 @@ - netstandard2.0 - 9.0 - TorchSharp - true - false - false - false - $(DefineConstants);LIBTORCH_$(LibTorchPackageVersion.Replace('.', '_'));CUDA_$(CudaVersionDot.Replace('.', '_')) + netstandard2.0;net6.0 + 9.0 + TorchSharp + true + false + false + false + $(DefineConstants);LIBTORCH_$(LibTorchPackageVersion.Replace('.', '_'));CUDA_$(CudaVersionDot.Replace('.', '_')) @@ -49,12 +49,12 @@ - - $(PackDependsOn); - RealPack - - True - ..\..\build\TorchSharp.snk + + $(PackDependsOn); + RealPack + + True + ..\..\build\TorchSharp.snk From 394041426e75864e182b0e4bcb0ceb2289351f2f Mon Sep 17 00:00:00 2001 From: Dimitri Date: Sun, 30 Jun 2024 19:39:43 -0300 Subject: [PATCH 08/25] Fast tensor accessor for ToArray() --- src/Examples.Utils/Examples.Utils.csproj | 8 +- src/TorchSharp/Amp/AutocastDisposedManager.cs | 10 +++ src/TorchSharp/Amp/AutocastDisposedScope.cs | 10 +++ .../Tensor/Factories/tensor_float.cs | 3 +- src/TorchSharp/Utils/TensorAccessor.cs | 79 ++++++++++++++++--- 5 files changed, 97 insertions(+), 13 deletions(-) create mode 100644 src/TorchSharp/Amp/AutocastDisposedManager.cs create mode 100644 src/TorchSharp/Amp/AutocastDisposedScope.cs diff --git a/src/Examples.Utils/Examples.Utils.csproj b/src/Examples.Utils/Examples.Utils.csproj index 6a5a09eeb..d8ce3a24a 100644 --- a/src/Examples.Utils/Examples.Utils.csproj +++ b/src/Examples.Utils/Examples.Utils.csproj @@ -21,7 +21,13 @@ - + + + + + + + diff --git a/src/TorchSharp/Amp/AutocastDisposedManager.cs b/src/TorchSharp/Amp/AutocastDisposedManager.cs new file mode 100644 index 000000000..d4ec1ccd7 --- /dev/null +++ b/src/TorchSharp/Amp/AutocastDisposedManager.cs @@ -0,0 +1,10 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace TorchSharp.Amp +{ + class AutocastDisposedManager + { + } +} diff --git a/src/TorchSharp/Amp/AutocastDisposedScope.cs b/src/TorchSharp/Amp/AutocastDisposedScope.cs new file mode 100644 index 000000000..7c771d16f --- /dev/null +++ b/src/TorchSharp/Amp/AutocastDisposedScope.cs @@ -0,0 +1,10 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace TorchSharp.Amp +{ + class AutocastDisposedScope + { + } +} diff --git a/src/TorchSharp/Tensor/Factories/tensor_float.cs b/src/TorchSharp/Tensor/Factories/tensor_float.cs index f33d1b90a..e50943689 100644 --- a/src/TorchSharp/Tensor/Factories/tensor_float.cs +++ b/src/TorchSharp/Tensor/Factories/tensor_float.cs @@ -21,7 +21,8 @@ public static Tensor tensor(float scalar, Device? device = null, bool requires_g if (handle == IntPtr.Zero) { CheckForErrors(); } - var t = new Tensor(handle).AutoCast(); + //var t = new Tensor(handle).AutoCast(); + var t = new Tensor(handle); /*if (is_autocast_cache_enabled()) { if (is_autocast_gpu_enabled()) return t.to(get_autocast_gpu_dtype()); //this work, but should put that on all tensor factorie... diff --git a/src/TorchSharp/Utils/TensorAccessor.cs b/src/TorchSharp/Utils/TensorAccessor.cs index 9514003f2..ab9846eec 100644 --- a/src/TorchSharp/Utils/TensorAccessor.cs +++ b/src/TorchSharp/Utils/TensorAccessor.cs @@ -38,16 +38,28 @@ internal TensorAccessor(torch.Tensor tensor) _tensor = tensor; // Keep the tensor alive now that everything is alright. } + /// + /// This is important for performance because only called with CopyTo, CopyFrom. 
It is not necessary to call tensor.numel() on every invocation, because that call is CPU-intensive.
+ /// This cached count avoids that CPU cost; the property behaves like a method call.
+ /// If the tensor is, for example, 640*640*3 = 1,228,800 elements, the property would otherwise be invoked over a million times.
+ /// When all we want to do is copy, there is no need to call that method so many times.
+ ///
+ private long TempCount = -1;
 public long Count => (_tensor is not null ? _tensor.numel() : 0);
 public bool IsReadOnly => false;
+
 public T[] ToArray()
 {
 if (_tensor.ndim < 2)
 return (T[])ToNDArray();
- var result = new T[Count];
+ var shps = _tensor.shape;
+ TempCount = 1;
+ for (int i = 0; i < shps.Length; i++)
+ TempCount *= shps[i];
+ var result = new T[TempCount];
 CopyTo(result);
 return result;
 }
+ public void CopyTo(Span<T> array, int arrayIndex = 0, long tensorIndex = 0)
+ {
+ int idx = arrayIndex;
+ foreach (int offset in GetSubsequentIndices(tensorIndex)) {
+ if (idx >= array.Length) break;
+ unsafe { array[idx] = ((T*)_tensor_data_ptr)[offset]; }
+ idx += 1;
+ }
+ }
+
 public void CopyFrom(T[] array, int arrayIndex = 0, long tensorIndex = 0)
@@ -251,6 +273,16 @@ public void CopyFrom(T[] array, int arrayIndex = 0, long tensorIndex = 0)
 }
 }
+ public void CopyFrom(ReadOnlySpan<T> array, int arrayIndex = 0, long tensorIndex = 0)
+ {
+ int idx = arrayIndex;
+ foreach (int offset in GetSubsequentIndices(tensorIndex)) {
+ if (idx >= array.Length) break;
+ unsafe { ((T*)_tensor_data_ptr)[offset] = array[idx]; }
+ idx += 1;
+ }
+ }
+
 ///
 /// Translates a linear index within the span represented by the accessor to a linear index
 /// used by the underlying tensor. The two should only be different if the tensor is a view
@@ -274,7 +306,27 @@ private static long TranslateIndex(long idx, torch.Tensor tensor)
 return result;
 }
+ ///
+ /// WARNING: for test purposes only; do not use in production.
+ ///
+ private long TranslateIndexNonStatic(long idx, torch.Tensor tensor)
+ {
+ if (idx >= TempCount || idx < 0)
+ throw new ArgumentOutOfRangeException($"{idx} in a collection of {tensor.numel()} elements.");
+
+ if (tensor.is_contiguous() || idx == 0) return idx;
+ long result = 0;
+ var shape = tensor.shape;
+ var strides = tensor.stride();
+
+ for (var i = shape.Length - 1; i >= 0; i--) {
+ idx = Math.DivRem(idx, shape[i], out long s);
+ result += s * strides[i];
+ }
+
+ return result;
+ }
 private static long TranslateIndex(long[] idx, torch.Tensor tensor)
 {
 long result = 0;
@@ -347,15 +399,18 @@ internal static T ReadItemAt(torch.Tensor tensor, long index)
 private IEnumerable<long> GetSubsequentIndices(long startingIndex)
 {
- if (startingIndex < 0 || startingIndex >= Count)
+ TempCount = Count;
+
+ if (startingIndex < 0 || startingIndex >= TempCount)
 throw new ArgumentOutOfRangeException(nameof(startingIndex));
- if (Count <= 1) {
- if (Count == 0) {
+ if (TempCount <= 1) {
+ if (TempCount == 0) {
 return Enumerable.Empty<long>();
 }
- return (new long[] { 0 }).AsEnumerable();
+ return new List<long>() { 0 };
+ //return (new long[] { 0 }).AsEnumerable();
 }
 if (_tensor.is_contiguous()) {
@@ -371,7 +426,6 @@ private IEnumerable<long> GetSubsequentIndices(long startingIndex)
 return MultiDimensionIndices(startingIndex);
 }
-
 private IEnumerable<long> MultiDimensionIndices(long startingIndex)
 {
 long[] shape = _tensor.shape;
@@ -379,7 +433,8 @@ private IEnumerable<long> MultiDimensionIndices(long startingIndex)
 long[] inds = new long[stride.Length];
 long index = startingIndex;
- long offset = TranslateIndex(startingIndex, _tensor);
+ //long offset = TranslateIndex(startingIndex, _tensor);
+ long offset = TranslateIndexNonStatic(startingIndex, _tensor); //WARNING: test purposes only; do not use in production
 while (true) {
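A quick worked example of the stride arithmetic TranslateIndexNonStatic implements (an illustrative sketch, not part of the patch; it only assumes the existing TorchSharp APIs torch.zeros, Tensor.T and Tensor.data<T>()):

 // A 3x2 tensor transposed to shape [2,3] has strides [1,2] and is not contiguous.
 var t = torch.zeros(3, 2).T;
 var acc = t.data<float>();
 // Linear index 4 is position (1,1): the DivRem walk gives 4 = 1*3 + 1, so s = [1,1],
 // and the storage offset is 1*strides[0] + 1*strides[1] = 1*1 + 1*2 = 3.

@@ -387,7 +442,7 @@ private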
IEnumerable MultiDimensionIndices(long startingIndex) yield return offset; - if (index >= Count) break; + if (index >= TempCount) break; for (int i = inds.Length - 1; ; i--) { Debug.Assert(i >= 0); @@ -408,21 +463,23 @@ private IEnumerable MultiDimensionIndices(long startingIndex) private IEnumerable SimpleIndices(long startingIndex, long stride) { long index = startingIndex; - long offset = TranslateIndex(startingIndex, _tensor); + //long offset = TranslateIndex(startingIndex, _tensor); + long offset = TranslateIndexNonStatic(startingIndex, _tensor); //WARNING: Test purpose not use in production - while (index < Count) { + while (index < TempCount) { yield return offset; offset += stride; index += 1; } } + private IEnumerable ContiguousIndices(long startingIndex) { // If there was an overload for Enumerable.Range that // produced long integers, we wouldn't need this implementation. long index = startingIndex; - while (index < Count) { + while (index < TempCount) { yield return index; index += 1; } From 5062339fe0cc4989f286bcd5812c00b4f920bc4a Mon Sep 17 00:00:00 2001 From: Dimitri Date: Sun, 30 Jun 2024 20:02:32 -0300 Subject: [PATCH 09/25] fix local build dotnet --- src/Examples/AdversarialExampleGeneration.cs | 2 ++ src/Examples/SequenceToSequence.cs | 7 +++++++ src/Examples/TextClassification.cs | 2 ++ src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs | 6 +++--- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/Examples/AdversarialExampleGeneration.cs b/src/Examples/AdversarialExampleGeneration.cs index 7bfc174b2..49bd10956 100644 --- a/src/Examples/AdversarialExampleGeneration.cs +++ b/src/Examples/AdversarialExampleGeneration.cs @@ -34,6 +34,8 @@ public class AdversarialExampleGeneration { #if NET472_OR_GREATER private readonly static string _dataLocation = NSPath.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "mnist"); +#elif NETSTANDARD2_0 + private readonly static string _dataLocation = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "mnist"); #else private readonly static string _dataLocation = Path.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "mnist"); #endif // NET472_OR_GREATER diff --git a/src/Examples/SequenceToSequence.cs b/src/Examples/SequenceToSequence.cs index 436c05a67..8ff2c6dc5 100644 --- a/src/Examples/SequenceToSequence.cs +++ b/src/Examples/SequenceToSequence.cs @@ -6,6 +6,7 @@ using System.Diagnostics; using static TorchSharp.torch; using static TorchSharp.torch.nn; +using System.Text.RegularExpressions; namespace TorchSharp.Examples { @@ -26,6 +27,8 @@ public class SequenceToSequence // This path assumes that you're running this on Windows. 
#if NET472_OR_GREATER private readonly static string _dataLocation = NSPath.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "wikitext-2-v1"); +#elif NETSTANDARD2_0 + private readonly static string _dataLocation = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "wikitext-2-v1"); #else private readonly static string _dataLocation = Path.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "wikitext-2-v1"); #endif // NET472_OR_GREATER @@ -251,7 +254,11 @@ private void InitWeights() public override Tensor forward(Tensor t, Tensor mask) { +#if !NETSTANDARD2_0 var src = pos_encoder.call(encoder.call(t) * MathF.Sqrt(ninputs)); +#else + var src = pos_encoder.call(encoder.call(t) * (float)Math.Sqrt(ninputs)); +#endif var enc = transformer_encoder.call(src, mask); return decoder.call(enc); } diff --git a/src/Examples/TextClassification.cs b/src/Examples/TextClassification.cs index 8fb175718..4cdc79bc1 100644 --- a/src/Examples/TextClassification.cs +++ b/src/Examples/TextClassification.cs @@ -36,6 +36,8 @@ public class TextClassification // This path assumes that you're running this on Windows. #if NET472_OR_GREATER private readonly static string _dataLocation = NSPath.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "AG_NEWS"); +#elif NETSTANDARD2_0 + private readonly static string _dataLocation = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "AG_NEWS"); #else private readonly static string _dataLocation = Path.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "AG_NEWS"); #endif // NET472_OR_GREATER diff --git a/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs b/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs index 4b38f5655..173ccd48a 100644 --- a/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs +++ b/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs @@ -288,12 +288,12 @@ internal static extern IntPtr THSTensor_upsample_nearest3d(IntPtr input, [DllImport("LibTorchSharp")] internal static extern IntPtr THSTensor_to_device(IntPtr handle, int device_type, int device_index, [MarshalAs(UnmanagedType.U1)] bool copy, [MarshalAs(UnmanagedType.U1)] bool non_blocking); + [DllImport("LibTorchSharp")] + //internal static extern IntPtr THSTensor_to_type_and_device(IntPtr handle, sbyte scalar_type, int device_type, int device_index, [MarshalAs(UnmanagedType.U1)] bool copy); + internal static extern IntPtr THSTensor_to_type_and_device(IntPtr handle, sbyte scalar_type, int device_type, int device_index, [MarshalAs(UnmanagedType.U1)] bool copy, [MarshalAs(UnmanagedType.U1)] bool non_blocking); [DllImport("LibTorchSharp")] internal static extern IntPtr THSTensor_to_type(IntPtr handle, sbyte scalar_type, [MarshalAs(UnmanagedType.U1)] bool copy, [MarshalAs(UnmanagedType.U1)] bool non_blocking); - [DllImport("LibTorchSharp")] - internal static extern IntPtr THSTensor_to_type_and_device(IntPtr handle, sbyte scalar_type, int device_type, int device_index, [MarshalAs(UnmanagedType.U1)] bool copy, [MarshalAs(UnmanagedType.U1)] bool non_blocking); - internal static extern IntPtr THSTensor_to_type_and_device(IntPtr handle, sbyte scalar_type, int device_type, int device_index, [MarshalAs(UnmanagedType.U1)] bool copy); [DllImport("LibTorchSharp")] internal static extern IntPtr THSTensor_to_type_and_device_and_non_blocking(IntPtr 
handle, sbyte scalar_type, int device_type, int device_index, [MarshalAs(UnmanagedType.U1)] bool non_blocking); From 3a467af99a1afc640d780e52510ecf82c97e5c5a Mon Sep 17 00:00:00 2001 From: Dimitri Date: Tue, 2 Jul 2024 18:16:42 -0300 Subject: [PATCH 10/25] Fast ToArray() TensorAccessor --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a17061b33..875954e1a 100644 --- a/.gitignore +++ b/.gitignore @@ -294,3 +294,4 @@ packages/ /src/Native/ZERO_CHECK.vcxproj /src/Native/ZERO_CHECK.vcxproj.filters /src/FSharp.Examples/FSharp.Examples.fsproj +/pkg/FileRestitcher From 18c7528a50173ac26e21a5ec4d833c84510608be Mon Sep 17 00:00:00 2001 From: Dimitri Date: Tue, 2 Jul 2024 18:28:45 -0300 Subject: [PATCH 11/25] Fast tensor accesor --- Directory.Build.props | 9 +++- src/Native/LibTorchSharp/Utils.h | 3 ++ src/TorchSharp/Amp/AutocastDisposeManager.cs | 29 ++++++++++++ src/TorchSharp/Amp/AutocastDisposeScope.cs | 23 ++++++++++ src/TorchSharp/Amp/AutocastDisposedManager.cs | 10 ----- src/TorchSharp/Amp/AutocastDisposedScope.cs | 10 ----- src/TorchSharp/Amp/AutocastMode.cs | 5 ++- src/TorchSharp/Tensor/Tensor.cs | 18 +++++++- src/TorchSharp/Utils/TensorAccessor.cs | 44 +++++++++++++++---- 9 files changed, 118 insertions(+), 33 deletions(-) create mode 100644 src/TorchSharp/Amp/AutocastDisposeManager.cs create mode 100644 src/TorchSharp/Amp/AutocastDisposeScope.cs delete mode 100644 src/TorchSharp/Amp/AutocastDisposedManager.cs delete mode 100644 src/TorchSharp/Amp/AutocastDisposedScope.cs diff --git a/Directory.Build.props b/Directory.Build.props index 1321ec4ff..aad7547a9 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -5,6 +5,10 @@ + + true + $(LibTorch)libtorch-win-shared-with-deps-2.3.1+cpu\libtorch + $(LibTorch)libtorch-win-shared-with-deps-2.3.1+cu121\libtorch Debug Debug;Release <_DefaultArchitecture>$([System.Runtime.InteropServices.RuntimeInformation]::OSArchitecture.ToString().ToLower()) @@ -133,7 +137,7 @@ .dylib.dwarf - + pytorch conda osx-arm64 @@ -152,6 +156,9 @@ $(LibTorchArchiveCoreName)-$(LibTorchVersion)$(LibTorchCudaLocalNameSuffix) $(IntermediateOutputRootPath)libtorch-cpu\$(LibTorchCpuLocalBase)\libtorch\share\cmake\Torch + + $(LibTorchPathCPU)\share\cmake\Torch + diff --git a/src/Native/LibTorchSharp/Utils.h b/src/Native/LibTorchSharp/Utils.h index 892e0e2ec..42573753b 100644 --- a/src/Native/LibTorchSharp/Utils.h +++ b/src/Native/LibTorchSharp/Utils.h @@ -59,6 +59,9 @@ struct TensorArray { inline Tensor ResultTensor(const at::Tensor & res) { if (res.defined()) { + + //TODO: Autocast here only if is INNER-SCOPE + /*at::Tensor* resT = new torch::Tensor(res); if (at::autocast::is_autocast_cache_enabled()){ if (res.is_cuda()) { diff --git a/src/TorchSharp/Amp/AutocastDisposeManager.cs b/src/TorchSharp/Amp/AutocastDisposeManager.cs new file mode 100644 index 000000000..83c31f335 --- /dev/null +++ b/src/TorchSharp/Amp/AutocastDisposeManager.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace TorchSharp.Amp +{ + public class AutocastDisposeManager + { + + /*[ThreadStatic] private static AutocastDisposeManager _threadAutocastSingleton; + + internal static AutocastDisposeManager ThreadAutocastSingleton => _threadAutocastSingleton ??= new AutocastDisposeManager(); + + internal AutocastDisposeScope CurrentAutocastDispose; + //internal HashSet Modules = new List(); + public AutocastDisposeManager() + { + CurrentAutocastDispose = new AutocastDisposeScope(this); + } + internal 
AutocastDisposeScope RegisterTensorAutocastScope(torch.Tensor t) + { + if (CurrentAutocastDispose == null) + return null; + CurrentAutocastDispose.Tensors.Add(t); + return CurrentAutocastDispose; + }*/ + + } +} diff --git a/src/TorchSharp/Amp/AutocastDisposeScope.cs b/src/TorchSharp/Amp/AutocastDisposeScope.cs new file mode 100644 index 000000000..8f5df9490 --- /dev/null +++ b/src/TorchSharp/Amp/AutocastDisposeScope.cs @@ -0,0 +1,23 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace TorchSharp.Amp +{ + public sealed class AutocastDisposeScope : IDisposable + { + //private AutocastDisposeManager autocastDisposeManager; + public bool IsEnabled; + /*internal AutocastMode autocastMode = AutocastMode.GetInstance(); + internal HashSet Tensors = new HashSet(); + public AutocastDisposeScope(AutocastDisposeManager autocastDisposeManager) + { + this.autocastDisposeManager = autocastDisposeManager; + IsEnabled = true; + }*/ + public void Dispose() + { + IsEnabled = false; + } + } +} diff --git a/src/TorchSharp/Amp/AutocastDisposedManager.cs b/src/TorchSharp/Amp/AutocastDisposedManager.cs deleted file mode 100644 index d4ec1ccd7..000000000 --- a/src/TorchSharp/Amp/AutocastDisposedManager.cs +++ /dev/null @@ -1,10 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace TorchSharp.Amp -{ - class AutocastDisposedManager - { - } -} diff --git a/src/TorchSharp/Amp/AutocastDisposedScope.cs b/src/TorchSharp/Amp/AutocastDisposedScope.cs deleted file mode 100644 index 7c771d16f..000000000 --- a/src/TorchSharp/Amp/AutocastDisposedScope.cs +++ /dev/null @@ -1,10 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace TorchSharp.Amp -{ - class AutocastDisposedScope - { - } -} diff --git a/src/TorchSharp/Amp/AutocastMode.cs b/src/TorchSharp/Amp/AutocastMode.cs index 43d3805fa..07c8149d2 100644 --- a/src/TorchSharp/Amp/AutocastMode.cs +++ b/src/TorchSharp/Amp/AutocastMode.cs @@ -16,6 +16,7 @@ public static torch.Tensor AutoCast(this torch.Tensor input) //TODO: Should make Singleton and IDisposable on ENTER public sealed class AutocastMode : IDisposable { + //NEED "Register" all tensor in scope for uncasting outer-scope private bool Enabled, Prev; //private torch.ScalarType Dtype = torch.ScalarType.Float32; private torch.ScalarType fast_dtype = torch.ScalarType.Float32; @@ -29,7 +30,7 @@ public sealed class AutocastMode : IDisposable }*/ public static AutocastMode GetInstance() { - return instance ?? (instance = new AutocastMode(torch.CUDA, cache_enabled:true)); + return instance ??= new AutocastMode(torch.CUDA, cache_enabled:true); } private AutocastMode(torch.Device dev, torch.ScalarType? dtype = null, bool enabled=true, bool? cache_enabled = null) @@ -40,7 +41,7 @@ private AutocastMode(torch.Device dev, torch.ScalarType? 
dtype = null, bool enab fast_dtype = torch.get_autocast_gpu_dtype(); if (dev.type == DeviceType.CPU) fast_dtype = torch.get_autocast_cpu_dtype(); - IntPtr ptr = IntPtr.Zero; + //IntPtr ptr = IntPtr.Zero; bool _cache_enabled = torch.is_autocast_cache_enabled(); if (!torch.cuda.is_available() && dev.type == DeviceType.CUDA) //Is not available for doing multicast diff --git a/src/TorchSharp/Tensor/Tensor.cs b/src/TorchSharp/Tensor/Tensor.cs index c2055d0ec..81f97cafa 100644 --- a/src/TorchSharp/Tensor/Tensor.cs +++ b/src/TorchSharp/Tensor/Tensor.cs @@ -9,6 +9,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Text; +using TorchSharp.Amp; using TorchSharp.PInvoke; #nullable enable @@ -33,13 +34,25 @@ public partial class Tensor : IDisposable static long _peakCount = 0; internal DisposeScope? OwningDisposeScope { get; set; } - + //internal AutocastDisposeScope? AutocastDisposeScope; internal Tensor(IntPtr handle) { this.handle = handle; + + /*if (_totalCount > 0) { + //have used + AutocastDisposeScope = AutocastDisposeManager.ThreadAutocastSingleton.RegisterTensorAutocastScope(this); + this = AutocastDisposeScope.autocastMode.CastTensor(this); //should cast when using INSIDE NOT WHERE CREATED + }*/ System.Threading.Interlocked.Increment(ref _totalCount); _peakCount = Math.Max(_totalCount, _peakCount); OwningDisposeScope = DisposeScopeManager.ThreadSingleton.RegisterOnCurrentDisposeScope(this); + + //TODO: Add Autocast/AMP ScopeManager, need improve this.. 1) is not threadsafe and may have big problem while casting and uncasting. + //DANGER: DONT USE THIS ON PRODUCTION + /*AutocastDisposeScope = AutocastDisposeManager.ThreadAutocastSingleton.RegisterTensorAutocastScope(this); + this = AutocastDisposeScope.autocastMode.CastTensor(this); //should cast when using INSIDE NOT WHERE CREATED*/ + //Should cast inner scope when get tensors for every each method? example prod, sum, div, reshape, etc??? } /// @@ -209,6 +222,9 @@ public IntPtr Handle { get { if (handle == IntPtr.Zero) throw new InvalidOperationException("Tensor invalid -- empty handle."); + + //AutocastDisposeScope.autocastMode.CastTensor(this); //This is wrong right??? + return handle; } } diff --git a/src/TorchSharp/Utils/TensorAccessor.cs b/src/TorchSharp/Utils/TensorAccessor.cs index ab9846eec..f0050c928 100644 --- a/src/TorchSharp/Utils/TensorAccessor.cs +++ b/src/TorchSharp/Utils/TensorAccessor.cs @@ -3,6 +3,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.Linq; +using System.Runtime.InteropServices; using static TorchSharp.PInvoke.NativeMethods; namespace TorchSharp.Utils @@ -43,13 +44,13 @@ internal TensorAccessor(torch.Tensor tensor) /// This temporary count avoid so much use CPU. The Property act as method. /// If tensor is for example 640*640*3 = 1.228.800, property invoke 1 millons times!!! /// If we only want copy is not necesary call that method so many times. + /// For some reason the method numel() use so much cpu. /// - private long TempCount = -1; - public long Count => (_tensor is not null ? _tensor.numel() : 0); + internal long TempCount = -1; + public long Count => _tensor?.numel() ?? 
0; public bool IsReadOnly => false; - public T[] ToArray() { if (_tensor.ndim < 2) @@ -59,6 +60,14 @@ public T[] ToArray() TempCount = 1; for(int i=0;i(_tensor_data_ptr.ToPointer(), Convert.ToInt32(TempCount)).ToArray(); + } + } + } var result = new T[TempCount]; CopyTo(result); return result; @@ -246,6 +255,18 @@ private void validate(long index) public void CopyTo(T[] array, int arrayIndex = 0, long tensorIndex = 0) { int idx = arrayIndex; + /*if (_tensor.is_contiguous()) { + if (typeof(T) == typeof(float)) { + float[] ff = new float[TempCount]; + Marshal.Copy(_tensor_data_ptr, ff, 0,ff.Length); + } + }*/ + //Because the contiguous cause arange from tensorIndex to Numel. So is not necesary "create" array of arange, i said "create" because in fact enumerable do not create itself. Very cool. + if (_tensor.is_contiguous()) { + for(long i= tensorIndex; i= array.Length) break; unsafe { array[idx] = ((T*)_tensor_data_ptr)[offset]; } @@ -399,7 +420,7 @@ internal static T ReadItemAt(torch.Tensor tensor, long index) private IEnumerable GetSubsequentIndices(long startingIndex) { - TempCount = Count; + //TempCount = Count; if (startingIndex < 0 || startingIndex >= TempCount) throw new ArgumentOutOfRangeException(nameof(startingIndex)); @@ -477,7 +498,7 @@ private IEnumerable ContiguousIndices(long startingIndex) { // If there was an overload for Enumerable.Range that // produced long integers, we wouldn't need this implementation. - + long index = startingIndex; while (index < TempCount) { yield return index; @@ -534,11 +555,16 @@ private void Dispose(bool disposing) #if true public IEnumerator GetEnumerator() { - if (Count <= 1) { - if (Count == 0) + if (TempCount <= 1) { + if (TempCount == 0) return Enumerable.Empty().GetEnumerator(); return new T[1] { this[0] }.AsEnumerable().GetEnumerator(); } + /*if (Count <= 1) { + if (Count == 0) + return Enumerable.Empty().GetEnumerator(); + return new T[1] { this[0] }.AsEnumerable().GetEnumerator(); + }*/ if (_tensor.is_contiguous()) { return new SimpleAtorImpl(this, 1); @@ -568,7 +594,7 @@ private class SimpleAtorImpl : IEnumerator public SimpleAtorImpl(TensorAccessor span, long stride) { _span = span; - _count = span.Count; + _count = span.TempCount; Debug.Assert(_count > 0); _stride = stride; Reset(); @@ -623,7 +649,7 @@ public GeneralAtorImpl(TensorAccessor span, long[] stride) { Debug.Assert(stride.Length > 1); _span = span; - _count = span.Count; + _count = span.TempCount; Debug.Assert(_count > 0); _shape = span._tensor.shape; Debug.Assert(_shape.Length == stride.Length); From 728c9fb7100eeb893d15af636783972a6ab1a6c7 Mon Sep 17 00:00:00 2001 From: Dimitri Date: Mon, 8 Jul 2024 22:22:43 -0300 Subject: [PATCH 12/25] fix accesor for every types --- Directory.Build.props | 2 +- TorchSharp.sln | 14 +++++++------- src/TorchSharp/Utils/TensorAccessor.cs | 8 +++----- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/Directory.Build.props b/Directory.Build.props index aad7547a9..1dbeae229 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -6,7 +6,7 @@ - true + false $(LibTorch)libtorch-win-shared-with-deps-2.3.1+cpu\libtorch $(LibTorch)libtorch-win-shared-with-deps-2.3.1+cu121\libtorch Debug diff --git a/TorchSharp.sln b/TorchSharp.sln index 8cec25c7d..054c07bb3 100644 --- a/TorchSharp.sln +++ b/TorchSharp.sln @@ -34,7 +34,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "TorchSharp", "TorchSharp", pkg\TorchSharp\TorchSharp.symbols.nupkgproj = pkg\TorchSharp\TorchSharp.symbols.nupkgproj EndProjectSection EndProject 
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LibTorchSharp", "bin\obj\x64.Debug\Native\LibTorchSharp\LibTorchSharp.vcxproj", "{2B359162-062E-3C52-91D3-027A8542A58C}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LibTorchSharp", "bin\obj\x64.Debug\Native\LibTorchSharp\LibTorchSharp.vcxproj", "{265C2E6F-04E6-37A8-B504-E3DD4A3FEE06}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LibTorchSharp", "bin\obj\x64.Release\Native\LibTorchSharp\LibTorchSharp.vcxproj", "{E4C0DBEE-0815-311B-9065-137BB50BD793}" EndProject @@ -66,9 +66,9 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution azure-pipelines.yml = azure-pipelines.yml build\BranchInfo.props = build\BranchInfo.props DEVGUIDE.md = DEVGUIDE.md + global.json = global.json README.md = README.md RELEASENOTES.md = RELEASENOTES.md - global.json = global.json EndProjectSection EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TorchVision", "src\TorchVision\TorchVision.csproj", "{DCF01EE5-6431-4115-85E0-1FC4C3DE86A2}" @@ -107,10 +107,10 @@ Global {42B45168-476D-4BFA-87B8-81A34E6295CD}.Release|Any CPU.Build.0 = Release|Any CPU {42B45168-476D-4BFA-87B8-81A34E6295CD}.Release|x64.ActiveCfg = Release|Any CPU {42B45168-476D-4BFA-87B8-81A34E6295CD}.Release|x64.Build.0 = Release|Any CPU - {2B359162-062E-3C52-91D3-027A8542A58C}.Debug|Any CPU.ActiveCfg = Debug|x64 - {2B359162-062E-3C52-91D3-027A8542A58C}.Debug|x64.ActiveCfg = Debug|x64 - {2B359162-062E-3C52-91D3-027A8542A58C}.Release|Any CPU.ActiveCfg = Release|x64 - {2B359162-062E-3C52-91D3-027A8542A58C}.Release|x64.ActiveCfg = Release|x64 + {265C2E6F-04E6-37A8-B504-E3DD4A3FEE06}.Debug|Any CPU.ActiveCfg = Debug|x64 + {265C2E6F-04E6-37A8-B504-E3DD4A3FEE06}.Debug|x64.ActiveCfg = Debug|x64 + {265C2E6F-04E6-37A8-B504-E3DD4A3FEE06}.Release|Any CPU.ActiveCfg = Release|x64 + {265C2E6F-04E6-37A8-B504-E3DD4A3FEE06}.Release|x64.ActiveCfg = Release|x64 {E4C0DBEE-0815-311B-9065-137BB50BD793}.Debug|Any CPU.ActiveCfg = Debug|x64 {E4C0DBEE-0815-311B-9065-137BB50BD793}.Debug|x64.ActiveCfg = Debug|x64 {E4C0DBEE-0815-311B-9065-137BB50BD793}.Release|Any CPU.ActiveCfg = Release|x64 @@ -181,7 +181,7 @@ Global {6C323B05-9028-4B09-911C-3C03AE058BEE} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} {42B45168-476D-4BFA-87B8-81A34E6295CD} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {567456AD-B026-4CB6-B98D-4FC930C90223} = {D3D38B03-B557-484D-8348-8BADEE4DF592} - {2B359162-062E-3C52-91D3-027A8542A58C} = {CF2C1A9E-3A8A-4329-8A6E-7880C15AAC3D} + {265C2E6F-04E6-37A8-B504-E3DD4A3FEE06} = {CF2C1A9E-3A8A-4329-8A6E-7880C15AAC3D} {E4C0DBEE-0815-311B-9065-137BB50BD793} = {4DB9E84D-324C-408F-87A6-246E86205540} {CF2C1A9E-3A8A-4329-8A6E-7880C15AAC3D} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {D8C60CD8-8429-45F2-A755-47B6CD10FDF8} = {09EADF06-BE25-4228-AB53-95AE3E15B530} diff --git a/src/TorchSharp/Utils/TensorAccessor.cs b/src/TorchSharp/Utils/TensorAccessor.cs index f0050c928..f7f825ffc 100644 --- a/src/TorchSharp/Utils/TensorAccessor.cs +++ b/src/TorchSharp/Utils/TensorAccessor.cs @@ -61,11 +61,9 @@ public T[] ToArray() for(int i=0;i(_tensor_data_ptr.ToPointer(), Convert.ToInt32(TempCount)).ToArray(); - } + if (_tensor.is_contiguous()) { //This is very fast. 
And works very well.
+ unsafe {
+ return new Span<T>(_tensor_data_ptr.ToPointer(), Convert.ToInt32(TempCount)).ToArray();
+ }
 }
 var result = new T[TempCount];
 CopyTo(result);
 return result;

From a9a611aeecfa85b75cc51021f2eeef0145493b5d Mon Sep 17 00:00:00 2001
From: Dimitri
Date: Fri, 12 Jul 2024 13:43:16 -0300
Subject: [PATCH 13/25] GradScaler
---
 src/Native/LibTorchSharp/CMakeLists.txt | 2 +
 src/Native/LibTorchSharp/THSAmp.cpp | 15 +++
 src/Native/LibTorchSharp/THSAmp.h | 13 ++
 src/Native/LibTorchSharp/THSTensor.cpp | 13 ++
 src/Native/LibTorchSharp/THSTensor.h | 3 +
 src/TorchSharp/Amp/GradScaler.cs | 121 +++++++++++++++---
 .../PInvoke/LibTorchSharp.THSAmp.cs | 15 +++
 .../PInvoke/LibTorchSharp.THSTensor.cs | 5 +
 .../PInvoke/LibTorchSharp.THSTorchCuda.cs | 2 +
 src/TorchSharp/Tensor/Tensor.cs | 29 +++++
 src/TorchSharp/Tensor/torch.Amp.cs | 17 +++
 11 files changed, 216 insertions(+), 19 deletions(-)
 create mode 100644 src/Native/LibTorchSharp/THSAmp.cpp
 create mode 100644 src/Native/LibTorchSharp/THSAmp.h
 create mode 100644 src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs
 create mode 100644 src/TorchSharp/Tensor/torch.Amp.cs

diff --git a/src/Native/LibTorchSharp/CMakeLists.txt b/src/Native/LibTorchSharp/CMakeLists.txt
index a592475ad..c0852a2a1 100644
--- a/src/Native/LibTorchSharp/CMakeLists.txt
+++ b/src/Native/LibTorchSharp/CMakeLists.txt
@@ -9,6 +9,7 @@ find_package(Torch REQUIRED PATHS ${LIBTORCH_PATH})
 set(SOURCES
 cifar10.h
 crc32c.h
+ THSAmp.h
 THSAutograd.h
 THSData.h
 THSJIT.h
@@ -21,6 +22,7 @@ set(SOURCES
 cifar10.cpp
 crc32c.c
 THSActivation.cpp
+ THSAmp.cpp
 THSAutograd.cpp
 THSConvolution.cpp
 THSData.cpp
diff --git a/src/Native/LibTorchSharp/THSAmp.cpp b/src/Native/LibTorchSharp/THSAmp.cpp
new file mode 100644
index 000000000..56ea1ac18
--- /dev/null
+++ b/src/Native/LibTorchSharp/THSAmp.cpp
@@ -0,0 +1,15 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+#include "THSAmp.h"
+
+#include
+#include
+
+/*void THSAmp_amp_foreach_non_finite_check_and_unscale_(const at::TensorList self, at::Tensor& found_inf, const at::Tensor& inv_scale)
+{
+ torch::_amp_foreach_non_finite_check_and_unscale_(self, found_inf, inv_scale);
+}*/
+
+void THSAmp_amp_foreach_non_finite_check_and_unscale_(Tensor* self, const int64_t tLength, at::Tensor& found_inf, const at::Tensor& inv_scale)
+{
+ torch::_amp_foreach_non_finite_check_and_unscale_(toTensors<at::Tensor>((torch::Tensor**)self, tLength), found_inf, inv_scale);
+}
diff --git a/src/Native/LibTorchSharp/THSAmp.h b/src/Native/LibTorchSharp/THSAmp.h
new file mode 100644
index 000000000..c85eb0609
--- /dev/null
+++ b/src/Native/LibTorchSharp/THSAmp.h
@@ -0,0 +1,13 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+#pragma once + +#include "../Stdafx.h" + +#include "torch/torch.h" + +#include "Utils.h" + +//https://github.com/pytorch/pytorch/blob/main/torch/_meta_registrations.py#L5957 +//EXPORT_API(void) THSAmp_amp_foreach_non_finite_check_and_unscale_(const at::TensorList self, at::Tensor& found_inf, const at::Tensor& inv_scale); + +EXPORT_API(void) THSAmp_amp_foreach_non_finite_check_and_unscale_(Tensor* self, const int64_t tLength, at::Tensor& found_inf, const at::Tensor& inv_scale); diff --git a/src/Native/LibTorchSharp/THSTensor.cpp b/src/Native/LibTorchSharp/THSTensor.cpp index 5a41bdca0..970dbdeb6 100644 --- a/src/Native/LibTorchSharp/THSTensor.cpp +++ b/src/Native/LibTorchSharp/THSTensor.cpp @@ -2226,3 +2226,16 @@ Tensor THSTensor_unflatten_names(Tensor tensor, const char** names, const int64_ return nullptr; } + +bool THSTensor_is_coalesce(Tensor tensor) +{ + return tensor->is_coalesced(); +} + +Tensor THSTensor_coalesce(Tensor tensor) +{ + CATCH( + return ResultTensor(tensor->coalesce()); + ); + return nullptr; +} \ No newline at end of file diff --git a/src/Native/LibTorchSharp/THSTensor.h b/src/Native/LibTorchSharp/THSTensor.h index 36468d995..b889ca055 100644 --- a/src/Native/LibTorchSharp/THSTensor.h +++ b/src/Native/LibTorchSharp/THSTensor.h @@ -1743,3 +1743,6 @@ EXPORT_API(Tensor) THSTensor_kaiser_window(const int64_t len, bool periodic, dou EXPORT_API(Tensor) THSTensor_stft(const Tensor x, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor window, bool normalized, int64_t onesided, bool return_complex); EXPORT_API(Tensor) THSTensor_istft(const Tensor x, int64_t n_fft, int64_t hop_length, int64_t win_length, const Tensor window, bool center, bool normalized, int64_t onesided, int64_t length, bool return_complex); + +EXPORT_API(Tensor) THSTensor_coalesce(const Tensor x); +EXPORT_API(bool) THSTensor_is_coalesce(const Tensor x); \ No newline at end of file diff --git a/src/TorchSharp/Amp/GradScaler.cs b/src/TorchSharp/Amp/GradScaler.cs index 6da7a9dab..ac10ef6ea 100644 --- a/src/TorchSharp/Amp/GradScaler.cs +++ b/src/TorchSharp/Amp/GradScaler.cs @@ -4,6 +4,7 @@ using System.Linq; using System.Text; using System.Threading.Tasks; +using TorchSharp.Modules; namespace TorchSharp.Amp { @@ -20,19 +21,19 @@ public GradScaler(torch.Device dev, float init_scale = 2.0e16f, float growth_fac float backoff_factor = 0.5f, int growth_interval = 2000, bool enabled = true) { Debug.Assert(dev == torch.CPU || dev == torch.CUDA); - this.Enabled = enabled; - this.InitScale = init_scale; - this.GrowthFactor = growth_factor; - this.BackoffFactor = backoff_factor; - this.GrowthInterval = growth_interval; - this.InitGrowthTracker = 0.0f; + Enabled = enabled; + InitScale = init_scale; + GrowthFactor = growth_factor; + BackoffFactor = backoff_factor; + GrowthInterval = growth_interval; + InitGrowthTracker = 0.0f; throw new NotImplementedException(); } private void LazyInitScaleGrowthTracker(torch.Device dev) { - this._scale = torch.full(0, this.InitScale, torch.ScalarType.Float32, device: dev); - this._growth_tracker = torch.full(0, this.InitGrowthTracker, torch.ScalarType.Float32, device: dev); + _scale = torch.full(0, InitScale, torch.ScalarType.Float32, device: dev); + _growth_tracker = torch.full(0, InitGrowthTracker, torch.ScalarType.Int32, device: dev); } //private check_scale_growth_tracker @@ -40,27 +41,109 @@ public torch.Tensor scale(torch.Tensor output) { if (!Enabled) return output; - if (_scale.numel() == 0) - this.LazyInitScaleGrowthTracker(output.device); - return output * 
this._scale.to(output.device, output.dtype, true);
+ if (_scale.is_null())
+ LazyInitScaleGrowthTracker(output.device);
+ return output * _scale.to(output.device, output.dtype, true);
 }
- public torch.Tensor unscale_grads(torch.optim.Optimizer optimizer, torch.Tensor inv_scale, torch.Tensor found_inf, bool allow_fp16)
+ public IList<torch.Tensor> scale(IList<torch.Tensor> outputs)
 {
- return false;
+ apply_scale(outputs);
+ return outputs;
 }
+ private class MultiDeviceReplicator
+ {
+ private readonly torch.Tensor master;
+
+ internal readonly Dictionary<torch.Device, torch.Tensor> per_device_tensors = new Dictionary<torch.Device, torch.Tensor>();
+ public MultiDeviceReplicator(torch.Tensor master_tensor)
+ {
+ master = master_tensor;
+ }
+
+ public torch.Tensor Get(torch.Device device)
+ {
+ // Cache one replica of the master tensor per device; return the cached copy on later calls.
+ if (!per_device_tensors.TryGetValue(device, out torch.Tensor retval)) {
+ retval = master.to(device, true, non_blocking: true);
+ per_device_tensors.Add(device, retval);
+ }
+ return retval;
+ }
+ }
+
+ private torch.Tensor apply_scale(torch.Tensor scale)
+ {
+ IList<MultiDeviceReplicator> stash = new List<MultiDeviceReplicator>();
+ if (stash.Count == 0) {
+ if (_scale.is_null()) {
+ LazyInitScaleGrowthTracker(scale.device);
+ }
+ stash.Add(new MultiDeviceReplicator(_scale));
+ }
+ return scale * stash[0].Get(scale.device);
+ }
-
+ private void apply_scale(IList<torch.Tensor> scales)
+ {
+ for (int i = 0; i < scales.Count; i++)
+ scales[i] = apply_scale(scales[i]);
 }
- /*public IList<torch.Tensor> scale(IList<torch.Tensor> outputs)
+ public Dictionary<torch.Device, torch.Tensor> unscale_grads(torch.optim.Optimizer optimizer, torch.Tensor inv_scale, torch.Tensor found_inf, bool allow_fp16)
 {
+ var per_device_inv_scale = new MultiDeviceReplicator(inv_scale);
+ var per_device_found_inf = new MultiDeviceReplicator(found_inf);
+ Dictionary<torch.Device, Dictionary<torch.ScalarType, List<torch.Tensor>>> per_device_and_dtype_grads = new Dictionary<torch.Device, Dictionary<torch.ScalarType, List<torch.Tensor>>>();
+
+ using (torch.no_grad()) {
+ if (optimizer is AdamW adamW) { // Some optimizers expose their parameter tensors directly; unscale_grads needs them.
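+ // The bucketing below mirrors PyTorch's GradScaler._unscale_grads_: FP16 grads are
+ // rejected outright (unscaling them in place would lose precision), sparse FP16 grads
+ // are coalesced first, and the rest are grouped per device and dtype so that a single
+ // fused _amp_foreach_non_finite_check_and_unscale_ call can handle each bucket.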
+ using (var enumer = adamW.parameters().GetEnumerator()) {
+ while (enumer.MoveNext()) {
+ var param = enumer.Current;
+ if (param.is_null())
+ continue;
+ if (!allow_fp16 && param.dtype == torch.ScalarType.Float16)
+ throw new Exception("Attempting to unscale FP16 gradients");
+ torch.Tensor to_unscale;
+ if (param.grad.is_sparse) {
+ // FP16 sparse values can overflow when summed, so coalesce before unscaling.
+ if (param.grad.dtype == torch.ScalarType.Float16) {
+ param.grad = param.grad.coalesce();
+ }
+ to_unscale = param.grad.SparseValues;
+ } else {
+ to_unscale = param.grad;
+ }
+ if (!per_device_and_dtype_grads.ContainsKey(to_unscale.device)) {
+ per_device_and_dtype_grads.Add(to_unscale.device, new Dictionary<torch.ScalarType, List<torch.Tensor>>());
+ per_device_and_dtype_grads[to_unscale.device].Add(to_unscale.dtype, new List<torch.Tensor>());
+ per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].Add(to_unscale);
+ } else {
+ if (!per_device_and_dtype_grads[to_unscale.device].ContainsKey(to_unscale.dtype))
+ per_device_and_dtype_grads[to_unscale.device].Add(to_unscale.dtype, new List<torch.Tensor>());
+ per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].Add(to_unscale);
+ }
+ }
+ }
+
+ foreach (var d in per_device_and_dtype_grads)
+ foreach (var g in d.Value)
+ torch._amp_foreach_non_finite_check_and_unscale_(g.Value, per_device_found_inf.Get(d.Key), per_device_inv_scale.Get(d.Key));
+ }
+
+ return per_device_found_inf.per_device_tensors;
+ }
+
+ public void unscale(torch.optim.Optimizer optimizer)
+ {
+ if (!Enabled)
+ return;
+ }
 }
}
\ No newline at end of file
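For orientation, the training-loop shape this GradScaler is working toward mirrors PyTorch's torch.cuda.amp.GradScaler. A sketch against the API in this patch (illustrative only: the constructor still throws NotImplementedException, there is no step()/update() yet, and model, loss_fn, batches and optimizer are stand-ins):

 var scaler = new GradScaler(torch.CUDA);
 foreach (var (input, target) in batches) {
 optimizer.zero_grad();
 var loss = loss_fn(model.call(input), target);
 scaler.scale(loss).backward(); // backward pass on the scaled loss
 scaler.unscale(optimizer); // divide grads by the scale, flag inf/NaN
 optimizer.step(); // would normally be scaler.step(optimizer)
 }

diff --git a/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs b/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs
new file mode 100644
index 000000000..5b1716bf3
--- /dev/null
+++ b/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs
@@ -0,0 +1,15 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.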
+#nullable enable +using System; +using System.Collections.Generic; +using System.Runtime.InteropServices; + +namespace TorchSharp.PInvoke +{ + internal static partial class NativeMethods + { + [DllImport("LibTorchSharp")] + internal static extern void THSAmp_amp_foreach_non_finite_check_and_unscale_(IntPtr tensors, long tLength, IntPtr found_inf, IntPtr inv_scale); + + } +} \ No newline at end of file diff --git a/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs b/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs index 173ccd48a..2428223d9 100644 --- a/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs +++ b/src/TorchSharp/PInvoke/LibTorchSharp.THSTensor.cs @@ -2110,6 +2110,11 @@ internal static extern IntPtr THSTensor_upsample_nearest3d(IntPtr input, internal static extern IntPtr THSTensor_histogram_out_t(IntPtr input, IntPtr bins, IntPtr weight, bool density, out IntPtr hist, out IntPtr bin_edges, out IntPtr r_bin_edges); [DllImport("LibTorchSharp")] internal static extern IntPtr THSTensor_histogram_out_i(IntPtr input, long bins, IntPtr range, int length, IntPtr weight, bool density, out IntPtr hist, out IntPtr bin_edges, out IntPtr r_bin_edges); + + [DllImport("LibTorchSharp")] + internal static extern IntPtr THSTensor_coalesce(IntPtr input); + [DllImport("LibTorchSharp")] + internal static extern bool THSTensor_is_coalesce(IntPtr input); } #pragma warning restore CA2101 } diff --git a/src/TorchSharp/PInvoke/LibTorchSharp.THSTorchCuda.cs b/src/TorchSharp/PInvoke/LibTorchSharp.THSTorchCuda.cs index fc67a88de..531b47d76 100644 --- a/src/TorchSharp/PInvoke/LibTorchSharp.THSTorchCuda.cs +++ b/src/TorchSharp/PInvoke/LibTorchSharp.THSTorchCuda.cs @@ -19,5 +19,7 @@ internal static partial class NativeMethods [DllImport("LibTorchSharp")] internal static extern void THSTorchCuda_synchronize(long device_index); + + } } diff --git a/src/TorchSharp/Tensor/Tensor.cs b/src/TorchSharp/Tensor/Tensor.cs index 81f97cafa..167fcb738 100644 --- a/src/TorchSharp/Tensor/Tensor.cs +++ b/src/TorchSharp/Tensor/Tensor.cs @@ -261,6 +261,7 @@ internal IntPtr MoveHandle() /// public long numel() => NumberOfElements; + public bool is_null() => handle == IntPtr.Zero; /// /// Get the size of each element in the tensor. /// @@ -294,6 +295,21 @@ public bool is_nonzero() return res != 0; } + public bool is_coalesce() + { + var res = NativeMethods.THSTensor_is_coalesce(Handle); + CheckForErrors(); + return res; + } + + public Tensor coalesce() + { + var res = NativeMethods.THSTensor_coalesce(Handle); + if(res == IntPtr.Zero) + CheckForErrors(); + return new Tensor(res); + } + public bool is_cuda => device.type == DeviceType.CUDA; public bool is_meta => device.type == DeviceType.META; @@ -716,6 +732,7 @@ public bool is_sparse { public void backward(IList? grad_tensors = null, bool create_graph = false, bool retain_graph = false, IList? inputs = null) => torch.autograd.backward(new[] { this }, grad_tensors, create_graph, retain_graph, inputs); + /// /// Creates a tensor by loading it from a file. 
///
@@ -7427,5 +7444,17 @@ public static Tensor WrappedTensorDisposeScope(Func<Tensor> expr)
 var result = expr();
 return result.MoveToOuterDisposeScope();
 }
+
+ public static void _amp_foreach_non_finite_check_and_unscale(Tensor found_inf, Tensor inv_scale)
+ {
+ if (found_inf.numel() != 1)
+ throw new Exception("found_inf must be a 1-element tensor.");
+ if (inv_scale.numel() != 1)
+ throw new Exception("inv_scale must be a 1-element tensor.");
+ }
 }
}
\ No newline at end of file
diff --git a/src/TorchSharp/Tensor/torch.Amp.cs b/src/TorchSharp/Tensor/torch.Amp.cs
new file mode 100644
index 000000000..dfa4245fd
--- /dev/null
+++ b/src/TorchSharp/Tensor/torch.Amp.cs
@@ -0,0 +1,17 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using static TorchSharp.PInvoke.NativeMethods;
+
+namespace TorchSharp
+{
+ public static partial class torch
+ {
+ public static void _amp_foreach_non_finite_check_and_unscale_(IList<Tensor> tensors, Tensor found_inf, Tensor inv_scale)
+ {
+ using var ts = new PinnedArray<IntPtr>();
+ IntPtr tens = ts.CreateArray(tensors.Select(x => x.Handle).ToArray());
+ THSAmp_amp_foreach_non_finite_check_and_unscale_(tens, ts.Array.Length, found_inf.Handle, inv_scale.Handle);
+ }
+ }
+}
From 4a406ece7e7b9a0119300cb2230c6c02b9712b2b Mon Sep 17 00:00:00 2001
From: Dimitri
Date: Sun, 14 Jul 2024 14:50:13 -0300
Subject: [PATCH 14/25] Trying fix build for azure
---
 .../FileRestitcher.Tests/FileRestitcher.Tests.csproj | 8 ++++++--
 src/Examples/Examples.csproj | 7 +++++--
 src/TorchSharp/Torch.cs | 2 +-
 src/TorchVision/models/VGG.cs | 6 +++---
 .../TorchSharpTest.WithCudaBinaries.csproj | 1 +
 test/TorchSharpTest/TorchSharpTest.csproj | 1 +
 6 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/pkg/FileRestitcher/FileRestitcher.Tests/FileRestitcher.Tests.csproj b/pkg/FileRestitcher/FileRestitcher.Tests/FileRestitcher.Tests.csproj
index 37f37a9bb..39dc54a1b 100644
--- a/pkg/FileRestitcher/FileRestitcher.Tests/FileRestitcher.Tests.csproj
+++ b/pkg/FileRestitcher/FileRestitcher.Tests/FileRestitcher.Tests.csproj
@@ -1,4 +1,4 @@ - + false @@ -14,7 +14,11 @@ - + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + runtime; build; native; contentfiles; analyzers; buildtransitive all
diff --git a/src/Examples/Examples.csproj b/src/Examples/Examples.csproj
index 10d6171e7..37ec4b75d 100644
--- a/src/Examples/Examples.csproj
+++ b/src/Examples/Examples.csproj
@@ -5,9 +5,12 @@ true true - net472;netstandard2.0;$(TargetFrameworks) + 9.0 - net6.0 + + net6.0 true false false
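For reference, the unscale primitive wrapped by torch.Amp.cs above has simple semantics in libtorch: it divides every gradient in the list by the given scale in place, and writes 1 into found_inf if any element was inf or NaN. A usage sketch (an assumption-laden example: grads stands for a list of gradient tensors, and CUDA tensors are assumed):

 var inv_scale = torch.full(new long[] { 1 }, 1.0f / 65536.0f, device: torch.CUDA);
 var found_inf = torch.zeros(new long[] { 1 }, device: torch.CUDA);
 torch._amp_foreach_non_finite_check_and_unscale_(grads, found_inf, inv_scale);
 bool overflow = found_inf.item<float>() != 0.0f; // if true, skip optimizer.step()

diff --git a/src/TorchSharp/Torch.cs b/src/TorchSharp/Torch.cs
index 6a6bbec0f..d10254a2c 100644
--- a/src/TorchSharp/Torch.cs
+++ b/src/TorchSharp/Torch.cs
@@ -158,7 +158,7 @@ private static void LoadNativeBackend(bool useCudaBackend, out StringBuilder?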
tr var torchsharpLoc = Path.GetDirectoryName(typeof(torch).Assembly.Location); var packagesDir = Path.GetFullPath(Path.Combine(torchsharpLoc!, "..", "..", "..", "..")); var torchsharpHome = Path.GetFullPath(Path.Combine(torchsharpLoc!, "..", "..")); - + //torchsharpLoc = @"K:\Proyects_Repos\TorchSharp"; trace.AppendLine($" torchsharpLoc = {torchsharpLoc}"); trace.AppendLine($" packagesDir = {packagesDir}"); trace.AppendLine($" torchsharpHome = {torchsharpHome}"); diff --git a/src/TorchVision/models/VGG.cs b/src/TorchVision/models/VGG.cs index e79f9ddec..cb6ff9f7f 100644 --- a/src/TorchVision/models/VGG.cs +++ b/src/TorchVision/models/VGG.cs @@ -332,9 +332,9 @@ public class VGG : Module { "VGG19", new long[] { 64, 64, 0, 128, 128, 0, 256, 256, 256, 256, 0, 512, 512, 512, 512, 0, 512, 512, 512, 512, 0 } } }; - private readonly Module features; - private readonly Module avgpool; - private readonly Module classifier; + public readonly Module features; + public readonly Module avgpool; + public readonly Module classifier; protected override void Dispose(bool disposing) { diff --git a/test/TorchSharpTest.WithCudaBinaries/TorchSharpTest.WithCudaBinaries.csproj b/test/TorchSharpTest.WithCudaBinaries/TorchSharpTest.WithCudaBinaries.csproj index 055fb9ffc..c7ef48fd8 100644 --- a/test/TorchSharpTest.WithCudaBinaries/TorchSharpTest.WithCudaBinaries.csproj +++ b/test/TorchSharpTest.WithCudaBinaries/TorchSharpTest.WithCudaBinaries.csproj @@ -12,6 +12,7 @@ false trx $(OutputPath) + Debug;Release;LibTorch2.3.1 diff --git a/test/TorchSharpTest/TorchSharpTest.csproj b/test/TorchSharpTest/TorchSharpTest.csproj index 2de45fe06..d0d7ace08 100644 --- a/test/TorchSharpTest/TorchSharpTest.csproj +++ b/test/TorchSharpTest/TorchSharpTest.csproj @@ -13,6 +13,7 @@ trx $(OutputPath) 10.0 + Debug;Release;LibTorch2.3.1 From 280c8d59df7db5990efc6fe27d1bd474f27abf1a Mon Sep 17 00:00:00 2001 From: Dimitri Date: Tue, 16 Jul 2024 23:03:16 -0300 Subject: [PATCH 15/25] Range sequential --- src/Examples/Examples.csproj | 4 ++-- src/TorchSharp/Amp/AutocastManager.cs | 11 +++++++++++ src/TorchSharp/Amp/GradScaler.cs | 19 ++++++++++++++++--- src/TorchSharp/NN/Sequential.cs | 7 ++++++- .../Tensor/Factories/Tensor.Factories.cs | 6 +++--- test/TorchSharpTest/TorchSharpTest.csproj | 3 +-- 6 files changed, 39 insertions(+), 11 deletions(-) create mode 100644 src/TorchSharp/Amp/AutocastManager.cs diff --git a/src/Examples/Examples.csproj b/src/Examples/Examples.csproj index 37ec4b75d..9b7a980b9 100644 --- a/src/Examples/Examples.csproj +++ b/src/Examples/Examples.csproj @@ -5,8 +5,8 @@ true true - + + net472;netstandard2.0;$(TargetFrameworks) 9.0 diff --git a/src/TorchSharp/Amp/AutocastManager.cs b/src/TorchSharp/Amp/AutocastManager.cs new file mode 100644 index 000000000..d1808d316 --- /dev/null +++ b/src/TorchSharp/Amp/AutocastManager.cs @@ -0,0 +1,11 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace TorchSharp.Amp +{ + public class AutocastManager + { + + } +} diff --git a/src/TorchSharp/Amp/GradScaler.cs b/src/TorchSharp/Amp/GradScaler.cs index ac10ef6ea..060ad64ee 100644 --- a/src/TorchSharp/Amp/GradScaler.cs +++ b/src/TorchSharp/Amp/GradScaler.cs @@ -11,11 +11,10 @@ namespace TorchSharp.Amp public class GradScaler { private bool Enabled; - private torch.Tensor _scale, _growth_tracker; - private float InitScale, GrowthFactor, BackoffFactor, GrowthInterval, InitGrowthTracker; + private Dictionary> _per_optimizer_states = new Dictionary>(); 
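+ // In PyTorch's grad_scaler.py this per-optimizer state holds a stage flag
+ // (READY / UNSCALED / STEPPED) plus the per-device found_inf tensors produced
+ // by unscale_(), keyed by optimizer; presumably the same shape is intended here.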
//https://github.com/pytorch/pytorch/blob/main/torch/amp/grad_scaler.py public GradScaler(torch.Device dev, float init_scale = 2.0e16f, float growth_factor = 2.0f, float backoff_factor = 0.5f, int growth_interval = 2000, bool enabled = true) @@ -27,7 +26,8 @@ public GradScaler(torch.Device dev, float init_scale = 2.0e16f, float growth_fac BackoffFactor = backoff_factor; GrowthInterval = growth_interval; InitGrowthTracker = 0.0f; - throw new NotImplementedException(); + + throw new NotImplementedException("This need to finish"); } private void LazyInitScaleGrowthTracker(torch.Device dev) @@ -35,6 +35,7 @@ private void LazyInitScaleGrowthTracker(torch.Device dev) _scale = torch.full(0, InitScale, torch.ScalarType.Float32, device: dev); _growth_tracker = torch.full(0, InitGrowthTracker, torch.ScalarType.Int32, device: dev); } + //private Dictionary //private check_scale_growth_tracker public torch.Tensor scale(torch.Tensor output) @@ -140,10 +141,22 @@ private void apply_scale(IList scales) return per_device_found_inf.per_device_tensors; } + private Tuple check_scale_growth_tracker(string name) + { + var fix = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration."; + Debug.Assert(_scale.is_null(), $"Attempted {name} but {nameof(_scale)} is None {fix}"); + Debug.Assert(_growth_tracker.is_null(), $"Attempted {name} but {nameof(_growth_tracker)} is None {fix}"); + return new Tuple(_scale, _growth_tracker); + } + public void unscale(torch.optim.Optimizer optimizer) { if (!Enabled) return; + + check_scale_growth_tracker(nameof(unscale)); + + } } } \ No newline at end of file diff --git a/src/TorchSharp/NN/Sequential.cs b/src/TorchSharp/NN/Sequential.cs index 711be65d1..2796aa913 100644 --- a/src/TorchSharp/NN/Sequential.cs +++ b/src/TorchSharp/NN/Sequential.cs @@ -31,7 +31,6 @@ public Sequential append(string name, torch.nn.IModule module) Add(name, module); return this; } - internal void Add(string name, torch.nn.IModule sm) { var submodule = (torch.nn.Module)sm; @@ -51,6 +50,12 @@ public Sequential append(torch.nn.IModule module) return this; } + public Sequential append(IList> modules) + { + for (int i = 0; i < modules.Count; i++) + Add(_modules.Count.ToString(), modules[i]); + return this; + } internal void Add(torch.nn.IModule module) { var name = _modules.Count.ToString(); diff --git a/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs b/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs index 67c28bd10..eee072261 100644 --- a/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs +++ b/src/TorchSharp/Tensor/Factories/Tensor.Factories.cs @@ -165,7 +165,7 @@ private static Tensor _tensor_generic(Array rawArray, ReadOnlySpan dimensi unsafe { void *ptr = null; - IntPtr iPtr = (IntPtr)ptr; + IntPtr iPtr = (IntPtr)ptr; //Warning: Unused variable fixed (long* shape = dimensions) { var handle = THSTensor_new(dataArrayAddr, deleter, (IntPtr)shape, dimensions.Length, origType, (sbyte)dtype.Value, (int)device.type, device.index, requires_grad); @@ -224,8 +224,8 @@ private static Tensor _tensor_generic(Memory rawArray, ReadOnlySpan deleters.TryAdd(deleter, deleter); // keep the delegate alive void *ptr = null; - IntPtr iPtr = (IntPtr)ptr; - + IntPtr iPtr = (IntPtr)ptr; //Warning: Unused variable + fixed (long* shape = dimensions) { var handle = THSTensor_new(dataArrayAddr, deleter, (IntPtr)shape, dimensions.Length, origType, (sbyte)dtype.Value, (int)device.type, device.index, requires_grad); diff --git a/test/TorchSharpTest/TorchSharpTest.csproj 
b/test/TorchSharpTest/TorchSharpTest.csproj
index d0d7ace08..808aa1ccf 100644
--- a/test/TorchSharpTest/TorchSharpTest.csproj
+++ b/test/TorchSharpTest/TorchSharpTest.csproj
@@ -114,7 +114,7 @@ - + @@ -123,7 +123,6 @@ - true true
From 3c42a87bf4770d04fda2f67fc7ce1bca826b5598 Mon Sep 17 00:00:00 2001
From: Dimitri
Date: Fri, 19 Jul 2024 17:00:57 -0300
Subject: [PATCH 16/25] AMPManager
---
 src/TorchSharp/Amp/AMPManager.cs | 89 ++++++++++++++
 src/TorchSharp/Amp/AutocastDisposeManager.cs | 29 ------
 src/TorchSharp/Amp/AutocastDisposeScope.cs | 23 -----
 src/TorchSharp/Amp/AutocastManager.cs | 11 ---
 src/TorchSharp/Amp/AutocastMode.cs | 97 ++++++++++++------
 src/TorchSharp/Amp/GradScaler.cs | 7 +-
 src/TorchSharp/NN/Convolution/Conv1D.cs | 28 +++++-
 src/TorchSharp/NN/Convolution/Conv2D.cs | 60 +++++++++++-
 src/TorchSharp/NN/Module.cs | 10 ++
 src/TorchSharp/NN/Parameter.cs | 13 +++
 src/TorchSharp/Tensor/Tensor.cs | 13 ++-
 src/TorchSharp/Utils/ModuleInfo.cs | 46 ++++++++++
 src/TorchSharp/Utils/UnorderedMap.cs | 55 +++++++++++
 13 files changed, 376 insertions(+), 105 deletions(-)
 create mode 100644 src/TorchSharp/Amp/AMPManager.cs
 delete mode 100644 src/TorchSharp/Amp/AutocastDisposeManager.cs
 delete mode 100644 src/TorchSharp/Amp/AutocastDisposeScope.cs
 delete mode 100644 src/TorchSharp/Amp/AutocastManager.cs
 create mode 100644 src/TorchSharp/Utils/ModuleInfo.cs
 create mode 100644 src/TorchSharp/Utils/UnorderedMap.cs

diff --git a/src/TorchSharp/Amp/AMPManager.cs b/src/TorchSharp/Amp/AMPManager.cs
new file mode 100644
index 000000000..1ac24476a
--- /dev/null
+++ b/src/TorchSharp/Amp/AMPManager.cs
@@ -0,0 +1,89 @@
+using System;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
+using System.Text;
+using Google.Protobuf.WellKnownTypes;
+using TorchSharp.PInvoke;
+using TorchSharp.Utils;
+
+namespace TorchSharp.Amp
+{
+ public class AMPManager : IDisposable
+ {
+ //TODO: make this singleton thread-safe
+ public UnorderedMap<IntPtr, torch.ScalarType> TensorPtrs;
+ private readonly AutocastMode autocastMode = AutocastMode.GetInstance();
+
+ private AMPManager() { }
+
+ public bool IsEnabled => autocastMode.Enabled;
+ private static AMPManager Instance;
+ //bool disposedValue;
+
+ public static AMPManager GetInstance()
+ {
+ return Instance ??= new AMPManager();
+ }
+
+ private void To(IntPtr ptr, torch.ScalarType type)
+ {
+ var res = NativeMethods.THSTensor_to_type(ptr, (sbyte)type, false, false); // copy: false, non_blocking: false
+ if (res == IntPtr.Zero)
+ torch.CheckForErrors();
+ }
+ private void Revert()
+ {
+ using (var enumer = TensorPtrs.GetEnumerator())
+ while (enumer.MoveNext())
+ To(enumer.Current.Key, enumer.Current.Value);
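+ // At this point every registered tensor has been cast back to the dtype recorded
+ // by Add() below; Add() stores the original dtype and then moves the tensor to the
+ // autocast fast dtype, so Revert() is its inverse.
+ TensorPtrs.Clear(); //Or should use Stack for POP??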
May better performance and better ram usage + } + + public void Add(IntPtr ptr) + { + if (!autocastMode.Enabled) { + + if (TensorPtrs.ContainsKey(ptr)) + To(ptr, TensorPtrs[ptr]); + return; + } + + TensorPtrs[ptr] = (torch.ScalarType)NativeMethods.THSTensor_type(ptr); + To(ptr, autocastMode.GetFastType()); //TODO: Set scalar autocast + } + + public IDisposable Enter() + { + return null; + } + protected virtual void Dispose(bool disposing) + { + Revert(); + autocastMode.Dispose(); + /*if (!disposedValue) { + if (disposing) { + + + // TODO: dispose managed state (managed objects) + } + + // TODO: free unmanaged resources (unmanaged objects) and override finalizer + // TODO: set large fields to null + disposedValue = true; + }*/ + } + + // // TODO: override finalizer only if 'Dispose(bool disposing)' has code to free unmanaged resources + ~AMPManager() + { + Dispose(false); + } + + public void Dispose() + { + // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method + Dispose(disposing: true); + GC.SuppressFinalize(this); + } + } +} diff --git a/src/TorchSharp/Amp/AutocastDisposeManager.cs b/src/TorchSharp/Amp/AutocastDisposeManager.cs deleted file mode 100644 index 83c31f335..000000000 --- a/src/TorchSharp/Amp/AutocastDisposeManager.cs +++ /dev/null @@ -1,29 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace TorchSharp.Amp -{ - public class AutocastDisposeManager - { - - /*[ThreadStatic] private static AutocastDisposeManager _threadAutocastSingleton; - - internal static AutocastDisposeManager ThreadAutocastSingleton => _threadAutocastSingleton ??= new AutocastDisposeManager(); - - internal AutocastDisposeScope CurrentAutocastDispose; - //internal HashSet Modules = new List(); - public AutocastDisposeManager() - { - CurrentAutocastDispose = new AutocastDisposeScope(this); - } - internal AutocastDisposeScope RegisterTensorAutocastScope(torch.Tensor t) - { - if (CurrentAutocastDispose == null) - return null; - CurrentAutocastDispose.Tensors.Add(t); - return CurrentAutocastDispose; - }*/ - - } -} diff --git a/src/TorchSharp/Amp/AutocastDisposeScope.cs b/src/TorchSharp/Amp/AutocastDisposeScope.cs deleted file mode 100644 index 8f5df9490..000000000 --- a/src/TorchSharp/Amp/AutocastDisposeScope.cs +++ /dev/null @@ -1,23 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace TorchSharp.Amp -{ - public sealed class AutocastDisposeScope : IDisposable - { - //private AutocastDisposeManager autocastDisposeManager; - public bool IsEnabled; - /*internal AutocastMode autocastMode = AutocastMode.GetInstance(); - internal HashSet Tensors = new HashSet(); - public AutocastDisposeScope(AutocastDisposeManager autocastDisposeManager) - { - this.autocastDisposeManager = autocastDisposeManager; - IsEnabled = true; - }*/ - public void Dispose() - { - IsEnabled = false; - } - } -} diff --git a/src/TorchSharp/Amp/AutocastManager.cs b/src/TorchSharp/Amp/AutocastManager.cs deleted file mode 100644 index d1808d316..000000000 --- a/src/TorchSharp/Amp/AutocastManager.cs +++ /dev/null @@ -1,11 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace TorchSharp.Amp -{ - public class AutocastManager - { - - } -} diff --git a/src/TorchSharp/Amp/AutocastMode.cs b/src/TorchSharp/Amp/AutocastMode.cs index 07c8149d2..0287e02d6 100644 --- a/src/TorchSharp/Amp/AutocastMode.cs +++ b/src/TorchSharp/Amp/AutocastMode.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.Linq; 
+using System.Security.Cryptography; using System.Text; using System.Threading.Tasks; @@ -17,22 +18,33 @@ public static torch.Tensor AutoCast(this torch.Tensor input) public sealed class AutocastMode : IDisposable { //NEED "Register" all tensor in scope for uncasting outer-scope - private bool Enabled, Prev; + internal bool Enabled, Prev; //private torch.ScalarType Dtype = torch.ScalarType.Float32; - private torch.ScalarType fast_dtype = torch.ScalarType.Float32; - private torch.Device Device = new torch.Device(DeviceType.CUDA); + internal torch.ScalarType fast_dtype = torch.ScalarType.Float32; + public torch.Device Device = new torch.Device(DeviceType.CUDA); private static AutocastMode instance; + bool disposedValue; + /*public static AutocastMode GetInstance(torch.Device dev, torch.ScalarType? dtype = null, bool enabled = true, bool? cache_enabled = null) - { - if(instance ==null) - instance = new AutocastMode(dev, dtype, enabled, cache_enabled); - return instance; - }*/ +{ +if(instance ==null) +instance = new AutocastMode(dev, dtype, enabled, cache_enabled); +return instance; +}*/ public static AutocastMode GetInstance() { return instance ??= new AutocastMode(torch.CUDA, cache_enabled:true); } + public torch.ScalarType GetFastType() + { + var ft = torch.ScalarType.Float32; + if (Device.type == DeviceType.CUDA) + ft = torch.get_autocast_gpu_dtype(); + if (Device.type == DeviceType.CPU) + ft = torch.get_autocast_cpu_dtype(); + return ft; + } private AutocastMode(torch.Device dev, torch.ScalarType? dtype = null, bool enabled=true, bool? cache_enabled = null) { //var la = torch.tensor(9); @@ -78,32 +90,57 @@ internal torch.Tensor CastTensor(torch.Tensor tensor) return tensor; return tensor.to(fast_dtype, tensor.device); } - /*public IDisposable Enter() - { - return this; - }*/ - public void Dispose() + private void Dispose(bool disposing) { - this.Enabled = false; - if (Device.type == DeviceType.CUDA) { - if(torch.autocast_decrement_nesting() == 0) - torch.clear_autocast_cache(); - torch.set_autocast_gpu_dtype(this.fast_dtype); - //torch.set_autocast_enabled(this.Prev); - torch.set_autocast_enabled(false); - torch.set_autocast_cache_enabled(false); - } + if (!disposedValue) { + if (disposing) { - if (Device.type == DeviceType.CPU) { - if (torch.autocast_decrement_nesting() == 0) - torch.clear_autocast_cache(); - //torch.set_autocast_enabled(this.Prev); - torch.set_autocast_cpu_dtype(this.fast_dtype); - torch.set_autocast_enabled(false); - torch.set_autocast_cache_enabled(false); + this.Enabled = false; + if (Device.type == DeviceType.CUDA) { + if (torch.autocast_decrement_nesting() == 0) + torch.clear_autocast_cache(); + torch.set_autocast_gpu_dtype(this.fast_dtype); + //torch.set_autocast_enabled(this.Prev); + torch.set_autocast_enabled(false); + torch.set_autocast_cache_enabled(false); + } + + if (Device.type == DeviceType.CPU) { + if (torch.autocast_decrement_nesting() == 0) + torch.clear_autocast_cache(); + //torch.set_autocast_enabled(this.Prev); + torch.set_autocast_cpu_dtype(this.fast_dtype); + torch.set_autocast_enabled(false); + torch.set_autocast_cache_enabled(false); + } + //throw new NotImplementedException(); + // TODO: dispose managed state (managed objects) + } + + // TODO: free unmanaged resources (unmanaged objects) and override finalizer + // TODO: set large fields to null + disposedValue = true; } - //throw new NotImplementedException(); } + + // // TODO: override finalizer only if 'Dispose(bool disposing)' has code to free unmanaged resources + // ~AutocastMode() + // { 
+ // // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method + // Dispose(disposing: false); + // } + + public void Dispose() + { + // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method + Dispose(disposing: true); + GC.SuppressFinalize(this); + } + /*public IDisposable Enter() +{ + + return this; +}*/ } } diff --git a/src/TorchSharp/Amp/GradScaler.cs b/src/TorchSharp/Amp/GradScaler.cs index 060ad64ee..899c295cb 100644 --- a/src/TorchSharp/Amp/GradScaler.cs +++ b/src/TorchSharp/Amp/GradScaler.cs @@ -13,7 +13,6 @@ public class GradScaler private bool Enabled; private torch.Tensor _scale, _growth_tracker; private float InitScale, GrowthFactor, BackoffFactor, GrowthInterval, InitGrowthTracker; - private Dictionary> _per_optimizer_states = new Dictionary>(); //https://github.com/pytorch/pytorch/blob/main/torch/amp/grad_scaler.py public GradScaler(torch.Device dev, float init_scale = 2.0e16f, float growth_factor = 2.0f, @@ -54,9 +53,9 @@ public torch.Tensor scale(torch.Tensor output) } private class MultiDeviceReplicator { - private torch.Tensor master; + private readonly torch.Tensor master; - internal Dictionary per_device_tensors = new Dictionary(); + internal readonly Dictionary per_device_tensors = new Dictionary(); public MultiDeviceReplicator(torch.Tensor master_tensor) { master = master_tensor; @@ -155,8 +154,6 @@ public void unscale(torch.optim.Optimizer optimizer) return; check_scale_growth_tracker(nameof(unscale)); - - } } } \ No newline at end of file diff --git a/src/TorchSharp/NN/Convolution/Conv1D.cs b/src/TorchSharp/NN/Convolution/Conv1D.cs index 9e9706e07..cf381af20 100644 --- a/src/TorchSharp/NN/Convolution/Conv1D.cs +++ b/src/TorchSharp/NN/Convolution/Conv1D.cs @@ -27,6 +27,10 @@ namespace Modules { public abstract class Convolution : torch.nn.Module { + internal long _dimension, _in_channel, _out_channel, _kernel,_stride, _padding,_dilation,_groups; + internal PaddingModes _paddingModes; + internal (long, long)? _kernels, _strides, _paddings, _dilations; + internal bool _bias; protected Convolution(IntPtr handle, IntPtr boxedHandle, long input_channels) : base(handle, boxedHandle) { this.input_channels = input_channels; @@ -113,7 +117,17 @@ public static Conv1d Conv1d(long in_channels, long out_channels, long kernelSize { var res = THSNN_Conv1d_ctor(in_channels, out_channels, kernelSize, stride, padding, dilation, (long)padding_mode, groups, bias, out var boxedHandle); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - return new Conv1d(res, boxedHandle, in_channels).MoveModule(device, dtype); + return new Conv1d(res, boxedHandle, in_channels) { + _in_channel = in_channels, + _out_channel = out_channels, + _kernel = kernelSize, + _stride = stride, + _padding = padding, + _dilation = dilation, + _paddingModes = padding_mode, + _groups = groups, + _bias = bias + }.MoveModule(device, dtype); } /// @@ -135,7 +149,17 @@ public static Conv1d Conv1d(long in_channels, long out_channels, long kernelSize { var res = THSNN_Conv1d_ctor(in_channels, out_channels, kernelSize, stride, padding == Padding.Valid ? 
0 : -1, dilation, (long)padding_mode, groups, bias, out var boxedHandle); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - return new Conv1d(res, boxedHandle, in_channels).MoveModule(device, dtype); + return new Conv1d(res, boxedHandle, in_channels) { + _in_channel = in_channels, + _out_channel = out_channels, + _kernel = kernelSize, + _stride = stride, + _padding = (long)padding, + _dilation = dilation, + _paddingModes = padding_mode, + _groups = groups, + _bias = bias + }.MoveModule(device, dtype); } public static partial class functional diff --git a/src/TorchSharp/NN/Convolution/Conv2D.cs b/src/TorchSharp/NN/Convolution/Conv2D.cs index 28b37eef2..1143db639 100644 --- a/src/TorchSharp/NN/Convolution/Conv2D.cs +++ b/src/TorchSharp/NN/Convolution/Conv2D.cs @@ -12,8 +12,37 @@ namespace Modules { public sealed class Conv2d : Convolution { + internal Conv2d(IntPtr handle, IntPtr boxedHandle, long input_channels) : base(handle, boxedHandle, input_channels) { } + internal Conv2d(IntPtr handle, IntPtr boxedHandle, long input_channels, long in_channels, long out_channels, long kernelSize, long padding, long stride = 1, long dilation = 1, PaddingModes padding_mode = PaddingModes.Zeros, long groups = 1, bool bias = true) + : base(handle, boxedHandle, input_channels) + { + _dimension = 2; //because is conv 2D; 2 dimension + _in_channel = in_channels; + _out_channel = out_channels; + _kernel = kernelSize; + _stride = stride; + _padding = padding; + _dilation = dilation; + _paddingModes = padding_mode; + _groups = groups; + _bias = bias; + } + internal Conv2d(IntPtr handle, IntPtr boxedHandle, long input_channels, long in_channels, long out_channels, (long, long) kernelSize, Padding padding, (long, long)? stride = null, (long, long)? dilation = null, PaddingModes padding_mode = PaddingModes.Zeros, long groups = 1, bool bias = true) + : base(handle, boxedHandle, input_channels) + { + _dimension = 2; //because is conv 2D; 2 dimension + _in_channel = in_channels; + _out_channel = out_channels; + _kernels = kernelSize; + _strides = stride; + _padding = (long)padding; + _dilations = dilation; + _paddingModes = padding_mode; + _groups = groups; + _bias = bias; + } public override Tensor forward(Tensor input) { if (ValidateShape(input, 2)) { @@ -78,7 +107,19 @@ public static Conv2d Conv2d(long in_channels, long out_channels, long kernelSize { var res = THSNN_Conv2d_ctor(in_channels, out_channels, kernelSize, stride, padding, dilation, (long)padding_mode, groups, bias, out var boxedHandle); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - return new Conv2d(res, boxedHandle, in_channels).MoveModule(device, dtype); + + return new Conv2d(res, boxedHandle, in_channels) { + _in_channel = in_channels, + _out_channel = out_channels, + _kernel = kernelSize, + _stride = stride, + _padding = padding, + _dilation = dilation, + _paddingModes = padding_mode, + _groups = groups, + _bias = bias + }.MoveModule(device, dtype); + //return conv2d.MoveModule(device, dtype); } /// @@ -104,7 +145,17 @@ public static Conv2d Conv2d(long in_channels, long out_channels, (long, long) ke var res = THSNN_Conv2d_ctor_1(in_channels, out_channels, kernelSize.Item1, kernelSize.Item2, stride.Value.Item1, stride.Value.Item2, padding.Value.Item1, padding.Value.Item2, dilation.Value.Item1, dilation.Value.Item2, (long)padding_mode, groups, bias, out var boxedHandle); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - return new Conv2d(res, boxedHandle, in_channels).MoveModule(device, dtype); + return new Conv2d(res, boxedHandle, 
in_channels) {
+                _in_channel = in_channels,
+                _out_channel = out_channels,
+                _kernels = kernelSize,
+                _strides = stride,
+                _paddings = padding,
+                _dilations = dilation,
+                _paddingModes = padding_mode,
+                _groups = groups,
+                _bias = bias
+            }.MoveModule<Conv2d>(device, dtype);
         }

         /// <summary>
@@ -126,7 +177,7 @@ public static Conv2d Conv2d(long in_channels, long out_channels, long kernelSize
         {
             var res = THSNN_Conv2d_ctor(in_channels, out_channels, kernelSize, stride, padding == Padding.Valid ? 0 : -1, dilation, (long)padding_mode, groups, bias, out var boxedHandle);
             if (res == IntPtr.Zero) { torch.CheckForErrors(); }
-            return new Conv2d(res, boxedHandle, in_channels).MoveModule<Conv2d>(device, dtype);
+            return new Conv2d(res, boxedHandle, in_channels, in_channels, out_channels, kernelSize, (long)padding, stride, dilation, padding_mode, groups, bias).MoveModule<Conv2d>(device, dtype);
         }

         /// <summary>
@@ -151,7 +202,8 @@ public static Conv2d Conv2d(long in_channels, long out_channels, (long, long) kernelSize
             var res = THSNN_Conv2d_ctor_1(in_channels, out_channels, kernelSize.Item1, kernelSize.Item2, stride.Value.Item1, stride.Value.Item2, padding == Padding.Valid ? 0 : -1, 0, dilation.Value.Item1, dilation.Value.Item2, (long)padding_mode, groups, bias, out var boxedHandle);
             if (res == IntPtr.Zero) { torch.CheckForErrors(); }
-            return new Conv2d(res, boxedHandle, in_channels).MoveModule<Conv2d>(device, dtype);
+
+            return new Conv2d(res, boxedHandle, in_channels, in_channels, out_channels, kernelSize, padding, stride, dilation, padding_mode, groups, bias).MoveModule<Conv2d>(device, dtype);
         }

         public static partial class functional
diff --git a/src/TorchSharp/NN/Module.cs b/src/TorchSharp/NN/Module.cs
index 1398ab4e3..19b64d8a9 100644
--- a/src/TorchSharp/NN/Module.cs
+++ b/src/TorchSharp/NN/Module.cs
@@ -778,6 +778,16 @@ public virtual void register_module(string name, Module submodule)
             }
         }

+        public virtual void unregister_module(string name)
+        {
+            if (_internal_submodules.ContainsKey(name))
+                _internal_submodules.Remove(name);
+        }
+        public virtual void unregister_module(Module module)
+        {
+            unregister_module(module.GetName());
+        }
+
         protected void ConditionallyRegisterParameter(string name, Tensor value)
         {
             if (value is null) {
diff --git a/src/TorchSharp/NN/Parameter.cs b/src/TorchSharp/NN/Parameter.cs
index 81e9051d8..cd3b66b44 100644
--- a/src/TorchSharp/NN/Parameter.cs
+++ b/src/TorchSharp/NN/Parameter.cs
@@ -36,6 +36,19 @@ internal Parameter(System.IntPtr handle) : base(handle)
         {
         }

+        /// <summary>
+        /// Exposes the parameter's underlying tensor as 'data', so callers can read or replace it
+        /// without casting the Parameter to torch.Tensor (mirrors PyTorch's Parameter.data).
+        /// https://github.com/ultralytics/ultralytics/blob/dcde8bd23d12bbb4867ebf45f936dd37c2445974/ultralytics/nn/modules/conv.py#L78
+        /// </summary>
+        /// <returns></returns>
+        public torch.Tensor data {
+            get {
+                return new Tensor(base.handle);
+            }
+            set {
+                handle = value.handle;
+            }
+        }
     };
 }
diff --git a/src/TorchSharp/Tensor/Tensor.cs b/src/TorchSharp/Tensor/Tensor.cs
index 167fcb738..601544619 100644
--- a/src/TorchSharp/Tensor/Tensor.cs
+++ b/src/TorchSharp/Tensor/Tensor.cs
@@ -34,11 +34,13 @@ public partial class Tensor : IDisposable
         static long _peakCount = 0;

         internal DisposeScope? OwningDisposeScope { get; set; }
+        //internal AutocastDisposeScope? AutocastDisposeScope;

         internal Tensor(IntPtr handle)
         {
             this.handle = handle;
-
+            if (AMPManager.GetInstance().IsEnabled)
+                AMPManager.GetInstance().Add(handle); // Tensor(IntPtr) is the common entry point for every tensor we create, so registering the handle here should cover them all.
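+            // NOTE: sketch of the intended AMP flow (model/x below are hypothetical, not in this patch):
+            // registering every new handle lets the ambient autocast scope revert tensor dtypes when
+            // it closes, e.g.:
+            //
+            //   using (AutocastMode.GetInstance()) {   // autocast scope begins
+            //       var y = model.forward(x);          // tensors created here pass through this ctor
+            //   }                                      // Dispose() reverts dtypes and clears the cache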
/*if (_totalCount > 0) { //have used AutocastDisposeScope = AutocastDisposeManager.ThreadAutocastSingleton.RegisterTensorAutocastScope(this); @@ -922,6 +924,15 @@ public Tensor to(ScalarType type, torch.Device device, bool copy = false, bool d return new Tensor(res); } + /*internal static void to(this IntPtr ptr, ScalarType type) + { + var res = NativeMethods.THSTensor_to_type(ptr, (sbyte)type); + if (res == IntPtr.Zero) + CheckForErrors(); + if (disposeAfter) + this.Dispose(); + return new Tensor(res); + }*/ public Tensor to(torch.Device device, ScalarType type, bool non_blocking) { torch.InitializeDevice(device); diff --git a/src/TorchSharp/Utils/ModuleInfo.cs b/src/TorchSharp/Utils/ModuleInfo.cs new file mode 100644 index 000000000..800dc977d --- /dev/null +++ b/src/TorchSharp/Utils/ModuleInfo.cs @@ -0,0 +1,46 @@ +using System; +using System.Collections.Generic; +using System.Text; +using TorchSharp.Modules; + +namespace TorchSharp.Utils +{ + public static class ModuleInfo + { + + public class ConvInfo + { + public long Dimension,InChannel,OutChannel, PaddingMode; + public object Kernel, Dilation, Stride; + public ConvInfo(Convolution conv) + { + InChannel = conv._in_channel; + OutChannel = conv._out_channel; + if (conv._kernels.HasValue) { + Kernel = conv._kernels.Value; + } + else { + Kernel = conv._kernel; + } + + //TODO: Make all props; + throw new NotImplementedException("Need finish"); + } + + public (long, long)? CastTuple(object obj) + { + if (obj.GetType() == typeof((long,long))) + return obj as (long, long)?; + if (obj is long l) + return (l, l); + return null; + } + + public long CastValue(object obj) + { + var v = CastTuple(obj); + return v?.Item1 ?? 0; + } + } + } +} diff --git a/src/TorchSharp/Utils/UnorderedMap.cs b/src/TorchSharp/Utils/UnorderedMap.cs new file mode 100644 index 000000000..7db88a94c --- /dev/null +++ b/src/TorchSharp/Utils/UnorderedMap.cs @@ -0,0 +1,55 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace TorchSharp.Utils +{ + public class UnorderedMap : Dictionary, IDisposable + { + bool disposedValue; + + public UnorderedMap() { } + public new TValue this[TKey tk] { + get { + if (this.ContainsKey(tk)) + return base[tk]; + return default(TValue); + } + set { + if (!this.ContainsKey(tk)) { + this.Add(tk, value); + return; + } + base[tk] = value; + } + } + + protected virtual void Dispose(bool disposing) + { + if (!disposedValue) { + if (disposing) { + base.Clear(); + // TODO: dispose managed state (managed objects) + } + + // TODO: free unmanaged resources (unmanaged objects) and override finalizer + // TODO: set large fields to null + disposedValue = true; + } + } + + // // TODO: override finalizer only if 'Dispose(bool disposing)' has code to free unmanaged resources + // ~UnorderedMap() + // { + // // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method + // Dispose(disposing: false); + // } + + public void Dispose() + { + // Do not change this code. 
Put cleanup code in 'Dispose(bool disposing)' method
+            Dispose(disposing: true);
+            GC.SuppressFinalize(this);
+        }
+    }
+}

From 7cd7f9cfecfdb2e3958e1638f89899638d99836e Mon Sep 17 00:00:00 2001
From: Dimitri
Date: Sat, 20 Jul 2024 00:13:24 -0300
Subject: [PATCH 17/25] Amp

---
 src/TorchSharp/Amp/AMPManager.cs | 4 ++--
 src/TorchSharp/Tensor/Tensor.cs  | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/TorchSharp/Amp/AMPManager.cs b/src/TorchSharp/Amp/AMPManager.cs
index 1ac24476a..29c5da90c 100644
--- a/src/TorchSharp/Amp/AMPManager.cs
+++ b/src/TorchSharp/Amp/AMPManager.cs
@@ -11,7 +11,7 @@ namespace TorchSharp.Amp
     public class AMPManager : IDisposable
     {
         //TODO: Make Singleton THREADSAFE
-        public UnorderedMap TensorPtrs;
+        public UnorderedMap TensorPtrs = new UnorderedMap();
         private readonly AutocastMode autocastMode = AutocastMode.GetInstance();

         private AMPManager() { }
@@ -36,7 +36,6 @@ private void Revert()
             using (var enumer = TensorPtrs.GetEnumerator())
                 while (enumer.MoveNext())
                     To(enumer.Current.Key, enumer.Current.Value);
-            TensorPtrs.Clear(); // Or should this use a Stack and pop entries? That might give better performance and memory usage.
         }

         public void Add(IntPtr ptr)
@@ -60,6 +59,7 @@ protected virtual void Dispose(bool disposing)
         {
             Revert();
             autocastMode.Dispose();
+            TensorPtrs.Dispose();
             /*if (!disposedValue) {
                 if (disposing) {
diff --git a/src/TorchSharp/Tensor/Tensor.cs b/src/TorchSharp/Tensor/Tensor.cs
index 601544619..0e5b76537 100644
--- a/src/TorchSharp/Tensor/Tensor.cs
+++ b/src/TorchSharp/Tensor/Tensor.cs
@@ -39,8 +39,9 @@ internal Tensor(IntPtr handle)
         {
             this.handle = handle;
-            if (AMPManager.GetInstance().IsEnabled)
-                AMPManager.GetInstance().Add(handle); // Tensor(IntPtr) is the common entry point for every tensor we create, so registering the handle here should cover them all.
+            /*if (AMPManager.GetInstance().IsEnabled)
+                AMPManager.GetInstance().Add(handle); // Tensor(IntPtr) is the common entry point for every tensor we create, so registering the handle here should cover them all.*/
+
             /*if (_totalCount > 0)
             {
                 //have used
                 AutocastDisposeScope = AutocastDisposeManager.ThreadAutocastSingleton.RegisterTensorAutocastScope(this);

From 0c2769a28ab805dc14fc5344e9e47c8edc4e239e Mon Sep 17 00:00:00 2001
From: Dimitri
Date: Sun, 21 Jul 2024 14:50:54 -0300
Subject: [PATCH 18/25] fix azure devops?
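
This commit bundles several housekeeping changes: it rewrites .gitignore (replacing the long
list of generated native-build entries with a single .vscode entry), checks in the NuGet restore
artifacts under pkg/FileRestitcher, adds a Windows native build script (src/Native/build.cmd),
records the in/out feature counts on Linear via a new LinearInfo class, and makes ResNet.fc
public. A minimal sketch of reading the new Linear metadata back (illustrative only):

    var lin = torch.nn.Linear(128, 64);
    Console.WriteLine($"{lin.linearInfo.InFeatures} -> {lin.linearInfo.OutFeatures}"); // prints "128 -> 64"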
--- .gitignore | 24 +- .../FileRestitcher.csproj.nuget.dgspec.json | 96 ++++++ .../FileRestitcher.csproj.nuget.g.props | 16 + .../FileRestitcher.csproj.nuget.g.targets | 6 + .../project.assets.json | 276 ++++++++++++++++++ .../project.nuget.cache | 11 + src/Native/build.cmd | 151 ++++++++++ src/TorchSharp/NN/Linear.cs | 19 +- src/TorchVision/models/ResNet.cs | 4 +- 9 files changed, 576 insertions(+), 27 deletions(-) create mode 100644 pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.dgspec.json create mode 100644 pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.g.props create mode 100644 pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.g.targets create mode 100644 pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/project.assets.json create mode 100644 pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/project.nuget.cache create mode 100644 src/Native/build.cmd diff --git a/.gitignore b/.gitignore index 875954e1a..ed21b9d11 100644 --- a/.gitignore +++ b/.gitignore @@ -272,26 +272,4 @@ packages/ *.code-workspace /.idea /test/TorchSharpTest/exportsd.py -/src/Native/CMakeFiles -/src/Native/LibTorchSharp/CMakeFiles -/src/Native/ALL_BUILD.vcxproj -/src/Native/ALL_BUILD.vcxproj.filters -/src/Native/build.cmd -/src/Native/CMakeCache.txt -/src/Native/cmake_install.cmake -/src/Native/INSTALL.vcxproj -/src/Native/INSTALL.vcxproj.filters -/src/Native/install_manifest.txt -/src/Native/LibTorchSharp/ALL_BUILD.vcxproj -/src/Native/LibTorchSharp/ALL_BUILD.vcxproj.filters -/src/Native/LibTorchSharp/cmake_install.cmake -/src/Native/LibTorchSharp/INSTALL.vcxproj -/src/Native/LibTorchSharp/INSTALL.vcxproj.filters -/src/Native/LibTorchSharp/LibTorchSharp.sln -/src/Native/LibTorchSharp/LibTorchSharp.vcxproj -/src/Native/LibTorchSharp/LibTorchSharp.vcxproj.filters -/src/Native/Project.sln -/src/Native/ZERO_CHECK.vcxproj -/src/Native/ZERO_CHECK.vcxproj.filters -/src/FSharp.Examples/FSharp.Examples.fsproj -/pkg/FileRestitcher +.vscode/settings.json \ No newline at end of file diff --git a/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.dgspec.json b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.dgspec.json new file mode 100644 index 000000000..fc625189a --- /dev/null +++ b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.dgspec.json @@ -0,0 +1,96 @@ +{ + "format": 1, + "restore": { + "K:\\Proyects_Repos\\TorchSharp\\pkg\\FileRestitcher\\FileRestitcher\\FileRestitcher.csproj": {} + }, + "projects": { + "K:\\Proyects_Repos\\TorchSharp\\pkg\\FileRestitcher\\FileRestitcher\\FileRestitcher.csproj": { + "version": "1.0.0", + "restore": { + "projectUniqueName": "K:\\Proyects_Repos\\TorchSharp\\pkg\\FileRestitcher\\FileRestitcher\\FileRestitcher.csproj", + "projectName": "FileRestitcher", + "projectPath": "K:\\Proyects_Repos\\TorchSharp\\pkg\\FileRestitcher\\FileRestitcher\\FileRestitcher.csproj", + "packagesPath": "C:\\Users\\Dimitri\\.nuget\\packages\\", + "outputPath": "K:\\Proyects_Repos\\TorchSharp\\pkg\\FileRestitcher\\FileRestitcher\\FileRestitcher.NupkgProj\\", + "projectStyle": "PackageReference", + "crossTargeting": true, + "fallbackFolders": [ + "C:\\Program Files (x86)\\Progress\\ToolboxNuGetPackages" + ], + "configFilePaths": [ + "C:\\Users\\Dimitri\\AppData\\Roaming\\NuGet\\NuGet.Config", + "C:\\Program Files 
(x86)\\NuGet\\Config\\Microsoft.VisualStudio.Offline.config", + "C:\\Program Files (x86)\\NuGet\\Config\\Telerik UI for WinForms.config" + ], + "originalTargetFrameworks": [ + "net6.0", + "netstandard2.0" + ], + "sources": { + "C:\\Program Files (x86)\\Microsoft SDKs\\NuGetPackages\\": {}, + "https://api.nuget.org/v3/index.json": {} + }, + "frameworks": { + "net6.0": { + "targetAlias": "net6.0", + "projectReferences": {} + }, + "netstandard2.0": { + "targetAlias": "netstandard2.0", + "projectReferences": {} + } + }, + "warningProperties": { + "warnAsError": [ + "NU1605" + ] + } + }, + "frameworks": { + "net6.0": { + "targetAlias": "net6.0", + "imports": [ + "net461", + "net462", + "net47", + "net471", + "net472", + "net48", + "net481" + ], + "assetTargetFallback": true, + "warn": true, + "frameworkReferences": { + "Microsoft.NETCore.App": { + "privateAssets": "all" + } + }, + "runtimeIdentifierGraphPath": "C:\\Program Files\\dotnet\\sdk\\8.0.101\\RuntimeIdentifierGraph.json" + }, + "netstandard2.0": { + "targetAlias": "netstandard2.0", + "dependencies": { + "NETStandard.Library": { + "suppressParent": "All", + "target": "Package", + "version": "[2.0.3, )", + "autoReferenced": true + } + }, + "imports": [ + "net461", + "net462", + "net47", + "net471", + "net472", + "net48", + "net481" + ], + "assetTargetFallback": true, + "warn": true, + "runtimeIdentifierGraphPath": "C:\\Program Files\\dotnet\\sdk\\8.0.101\\RuntimeIdentifierGraph.json" + } + } + } + } +} \ No newline at end of file diff --git a/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.g.props b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.g.props new file mode 100644 index 000000000..1e9807451 --- /dev/null +++ b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.g.props @@ -0,0 +1,16 @@ + + + + True + NuGet + $(MSBuildThisFileDirectory)project.assets.json + $(UserProfile)\.nuget\packages\ + C:\Users\Dimitri\.nuget\packages\;C:\Program Files (x86)\Progress\ToolboxNuGetPackages + PackageReference + 6.8.0 + + + + + + \ No newline at end of file diff --git a/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.g.targets b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.g.targets new file mode 100644 index 000000000..2192724bc --- /dev/null +++ b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/FileRestitcher.csproj.nuget.g.targets @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/project.assets.json b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/project.assets.json new file mode 100644 index 000000000..1f13839e4 --- /dev/null +++ b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/project.assets.json @@ -0,0 +1,276 @@ +{ + "version": 3, + "targets": { + ".NETStandard,Version=v2.0": { + "Microsoft.NETCore.Platforms/1.1.0": { + "type": "package", + "compile": { + "lib/netstandard1.0/_._": {} + }, + "runtime": { + "lib/netstandard1.0/_._": {} + } + }, + "NETStandard.Library/2.0.3": { + "type": "package", + "dependencies": { + "Microsoft.NETCore.Platforms": "1.1.0" + }, + "compile": { + "lib/netstandard1.0/_._": {} + }, + "runtime": { + "lib/netstandard1.0/_._": {} + }, + "build": { + "build/netstandard2.0/NETStandard.Library.targets": {} + } + } + }, + "net6.0": {} + }, + "libraries": { + "Microsoft.NETCore.Platforms/1.1.0": { + "sha512": 
"kz0PEW2lhqygehI/d6XsPCQzD7ff7gUJaVGPVETX611eadGsA3A877GdSlU0LRVMCTH/+P3o2iDTak+S08V2+A==", + "type": "package", + "path": "microsoft.netcore.platforms/1.1.0", + "files": [ + ".nupkg.metadata", + ".signature.p7s", + "ThirdPartyNotices.txt", + "dotnet_library_license.txt", + "lib/netstandard1.0/_._", + "microsoft.netcore.platforms.1.1.0.nupkg.sha512", + "microsoft.netcore.platforms.nuspec", + "runtime.json" + ] + }, + "NETStandard.Library/2.0.3": { + "sha512": "st47PosZSHrjECdjeIzZQbzivYBJFv6P2nv4cj2ypdI204DO+vZ7l5raGMiX4eXMJ53RfOIg+/s4DHVZ54Nu2A==", + "type": "package", + "path": "netstandard.library/2.0.3", + "files": [ + ".nupkg.metadata", + ".signature.p7s", + "LICENSE.TXT", + "THIRD-PARTY-NOTICES.TXT", + "build/netstandard2.0/NETStandard.Library.targets", + "build/netstandard2.0/ref/Microsoft.Win32.Primitives.dll", + "build/netstandard2.0/ref/System.AppContext.dll", + "build/netstandard2.0/ref/System.Collections.Concurrent.dll", + "build/netstandard2.0/ref/System.Collections.NonGeneric.dll", + "build/netstandard2.0/ref/System.Collections.Specialized.dll", + "build/netstandard2.0/ref/System.Collections.dll", + "build/netstandard2.0/ref/System.ComponentModel.Composition.dll", + "build/netstandard2.0/ref/System.ComponentModel.EventBasedAsync.dll", + "build/netstandard2.0/ref/System.ComponentModel.Primitives.dll", + "build/netstandard2.0/ref/System.ComponentModel.TypeConverter.dll", + "build/netstandard2.0/ref/System.ComponentModel.dll", + "build/netstandard2.0/ref/System.Console.dll", + "build/netstandard2.0/ref/System.Core.dll", + "build/netstandard2.0/ref/System.Data.Common.dll", + "build/netstandard2.0/ref/System.Data.dll", + "build/netstandard2.0/ref/System.Diagnostics.Contracts.dll", + "build/netstandard2.0/ref/System.Diagnostics.Debug.dll", + "build/netstandard2.0/ref/System.Diagnostics.FileVersionInfo.dll", + "build/netstandard2.0/ref/System.Diagnostics.Process.dll", + "build/netstandard2.0/ref/System.Diagnostics.StackTrace.dll", + "build/netstandard2.0/ref/System.Diagnostics.TextWriterTraceListener.dll", + "build/netstandard2.0/ref/System.Diagnostics.Tools.dll", + "build/netstandard2.0/ref/System.Diagnostics.TraceSource.dll", + "build/netstandard2.0/ref/System.Diagnostics.Tracing.dll", + "build/netstandard2.0/ref/System.Drawing.Primitives.dll", + "build/netstandard2.0/ref/System.Drawing.dll", + "build/netstandard2.0/ref/System.Dynamic.Runtime.dll", + "build/netstandard2.0/ref/System.Globalization.Calendars.dll", + "build/netstandard2.0/ref/System.Globalization.Extensions.dll", + "build/netstandard2.0/ref/System.Globalization.dll", + "build/netstandard2.0/ref/System.IO.Compression.FileSystem.dll", + "build/netstandard2.0/ref/System.IO.Compression.ZipFile.dll", + "build/netstandard2.0/ref/System.IO.Compression.dll", + "build/netstandard2.0/ref/System.IO.FileSystem.DriveInfo.dll", + "build/netstandard2.0/ref/System.IO.FileSystem.Primitives.dll", + "build/netstandard2.0/ref/System.IO.FileSystem.Watcher.dll", + "build/netstandard2.0/ref/System.IO.FileSystem.dll", + "build/netstandard2.0/ref/System.IO.IsolatedStorage.dll", + "build/netstandard2.0/ref/System.IO.MemoryMappedFiles.dll", + "build/netstandard2.0/ref/System.IO.Pipes.dll", + "build/netstandard2.0/ref/System.IO.UnmanagedMemoryStream.dll", + "build/netstandard2.0/ref/System.IO.dll", + "build/netstandard2.0/ref/System.Linq.Expressions.dll", + "build/netstandard2.0/ref/System.Linq.Parallel.dll", + "build/netstandard2.0/ref/System.Linq.Queryable.dll", + "build/netstandard2.0/ref/System.Linq.dll", + 
"build/netstandard2.0/ref/System.Net.Http.dll", + "build/netstandard2.0/ref/System.Net.NameResolution.dll", + "build/netstandard2.0/ref/System.Net.NetworkInformation.dll", + "build/netstandard2.0/ref/System.Net.Ping.dll", + "build/netstandard2.0/ref/System.Net.Primitives.dll", + "build/netstandard2.0/ref/System.Net.Requests.dll", + "build/netstandard2.0/ref/System.Net.Security.dll", + "build/netstandard2.0/ref/System.Net.Sockets.dll", + "build/netstandard2.0/ref/System.Net.WebHeaderCollection.dll", + "build/netstandard2.0/ref/System.Net.WebSockets.Client.dll", + "build/netstandard2.0/ref/System.Net.WebSockets.dll", + "build/netstandard2.0/ref/System.Net.dll", + "build/netstandard2.0/ref/System.Numerics.dll", + "build/netstandard2.0/ref/System.ObjectModel.dll", + "build/netstandard2.0/ref/System.Reflection.Extensions.dll", + "build/netstandard2.0/ref/System.Reflection.Primitives.dll", + "build/netstandard2.0/ref/System.Reflection.dll", + "build/netstandard2.0/ref/System.Resources.Reader.dll", + "build/netstandard2.0/ref/System.Resources.ResourceManager.dll", + "build/netstandard2.0/ref/System.Resources.Writer.dll", + "build/netstandard2.0/ref/System.Runtime.CompilerServices.VisualC.dll", + "build/netstandard2.0/ref/System.Runtime.Extensions.dll", + "build/netstandard2.0/ref/System.Runtime.Handles.dll", + "build/netstandard2.0/ref/System.Runtime.InteropServices.RuntimeInformation.dll", + "build/netstandard2.0/ref/System.Runtime.InteropServices.dll", + "build/netstandard2.0/ref/System.Runtime.Numerics.dll", + "build/netstandard2.0/ref/System.Runtime.Serialization.Formatters.dll", + "build/netstandard2.0/ref/System.Runtime.Serialization.Json.dll", + "build/netstandard2.0/ref/System.Runtime.Serialization.Primitives.dll", + "build/netstandard2.0/ref/System.Runtime.Serialization.Xml.dll", + "build/netstandard2.0/ref/System.Runtime.Serialization.dll", + "build/netstandard2.0/ref/System.Runtime.dll", + "build/netstandard2.0/ref/System.Security.Claims.dll", + "build/netstandard2.0/ref/System.Security.Cryptography.Algorithms.dll", + "build/netstandard2.0/ref/System.Security.Cryptography.Csp.dll", + "build/netstandard2.0/ref/System.Security.Cryptography.Encoding.dll", + "build/netstandard2.0/ref/System.Security.Cryptography.Primitives.dll", + "build/netstandard2.0/ref/System.Security.Cryptography.X509Certificates.dll", + "build/netstandard2.0/ref/System.Security.Principal.dll", + "build/netstandard2.0/ref/System.Security.SecureString.dll", + "build/netstandard2.0/ref/System.ServiceModel.Web.dll", + "build/netstandard2.0/ref/System.Text.Encoding.Extensions.dll", + "build/netstandard2.0/ref/System.Text.Encoding.dll", + "build/netstandard2.0/ref/System.Text.RegularExpressions.dll", + "build/netstandard2.0/ref/System.Threading.Overlapped.dll", + "build/netstandard2.0/ref/System.Threading.Tasks.Parallel.dll", + "build/netstandard2.0/ref/System.Threading.Tasks.dll", + "build/netstandard2.0/ref/System.Threading.Thread.dll", + "build/netstandard2.0/ref/System.Threading.ThreadPool.dll", + "build/netstandard2.0/ref/System.Threading.Timer.dll", + "build/netstandard2.0/ref/System.Threading.dll", + "build/netstandard2.0/ref/System.Transactions.dll", + "build/netstandard2.0/ref/System.ValueTuple.dll", + "build/netstandard2.0/ref/System.Web.dll", + "build/netstandard2.0/ref/System.Windows.dll", + "build/netstandard2.0/ref/System.Xml.Linq.dll", + "build/netstandard2.0/ref/System.Xml.ReaderWriter.dll", + "build/netstandard2.0/ref/System.Xml.Serialization.dll", + "build/netstandard2.0/ref/System.Xml.XDocument.dll", + 
"build/netstandard2.0/ref/System.Xml.XPath.XDocument.dll", + "build/netstandard2.0/ref/System.Xml.XPath.dll", + "build/netstandard2.0/ref/System.Xml.XmlDocument.dll", + "build/netstandard2.0/ref/System.Xml.XmlSerializer.dll", + "build/netstandard2.0/ref/System.Xml.dll", + "build/netstandard2.0/ref/System.dll", + "build/netstandard2.0/ref/mscorlib.dll", + "build/netstandard2.0/ref/netstandard.dll", + "build/netstandard2.0/ref/netstandard.xml", + "lib/netstandard1.0/_._", + "netstandard.library.2.0.3.nupkg.sha512", + "netstandard.library.nuspec" + ] + } + }, + "projectFileDependencyGroups": { + ".NETStandard,Version=v2.0": [ + "NETStandard.Library >= 2.0.3" + ], + "net6.0": [] + }, + "packageFolders": { + "C:\\Users\\Dimitri\\.nuget\\packages\\": {}, + "C:\\Program Files (x86)\\Progress\\ToolboxNuGetPackages": {} + }, + "project": { + "version": "1.0.0", + "restore": { + "projectUniqueName": "K:\\Proyects_Repos\\TorchSharp\\pkg\\FileRestitcher\\FileRestitcher\\FileRestitcher.csproj", + "projectName": "FileRestitcher", + "projectPath": "K:\\Proyects_Repos\\TorchSharp\\pkg\\FileRestitcher\\FileRestitcher\\FileRestitcher.csproj", + "packagesPath": "C:\\Users\\Dimitri\\.nuget\\packages\\", + "outputPath": "K:\\Proyects_Repos\\TorchSharp\\pkg\\FileRestitcher\\FileRestitcher\\FileRestitcher.NupkgProj\\", + "projectStyle": "PackageReference", + "crossTargeting": true, + "fallbackFolders": [ + "C:\\Program Files (x86)\\Progress\\ToolboxNuGetPackages" + ], + "configFilePaths": [ + "C:\\Users\\Dimitri\\AppData\\Roaming\\NuGet\\NuGet.Config", + "C:\\Program Files (x86)\\NuGet\\Config\\Microsoft.VisualStudio.Offline.config", + "C:\\Program Files (x86)\\NuGet\\Config\\Telerik UI for WinForms.config" + ], + "originalTargetFrameworks": [ + "net6.0", + "netstandard2.0" + ], + "sources": { + "C:\\Program Files (x86)\\Microsoft SDKs\\NuGetPackages\\": {}, + "https://api.nuget.org/v3/index.json": {} + }, + "frameworks": { + "net6.0": { + "targetAlias": "net6.0", + "projectReferences": {} + }, + "netstandard2.0": { + "targetAlias": "netstandard2.0", + "projectReferences": {} + } + }, + "warningProperties": { + "warnAsError": [ + "NU1605" + ] + } + }, + "frameworks": { + "net6.0": { + "targetAlias": "net6.0", + "imports": [ + "net461", + "net462", + "net47", + "net471", + "net472", + "net48", + "net481" + ], + "assetTargetFallback": true, + "warn": true, + "frameworkReferences": { + "Microsoft.NETCore.App": { + "privateAssets": "all" + } + }, + "runtimeIdentifierGraphPath": "C:\\Program Files\\dotnet\\sdk\\8.0.101\\RuntimeIdentifierGraph.json" + }, + "netstandard2.0": { + "targetAlias": "netstandard2.0", + "dependencies": { + "NETStandard.Library": { + "suppressParent": "All", + "target": "Package", + "version": "[2.0.3, )", + "autoReferenced": true + } + }, + "imports": [ + "net461", + "net462", + "net47", + "net471", + "net472", + "net48", + "net481" + ], + "assetTargetFallback": true, + "warn": true, + "runtimeIdentifierGraphPath": "C:\\Program Files\\dotnet\\sdk\\8.0.101\\RuntimeIdentifierGraph.json" + } + } + } +} \ No newline at end of file diff --git a/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/project.nuget.cache b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/project.nuget.cache new file mode 100644 index 000000000..2e00179eb --- /dev/null +++ b/pkg/FileRestitcher/FileRestitcher/FileRestitcher.NupkgProj/project.nuget.cache @@ -0,0 +1,11 @@ +{ + "version": 2, + "dgSpecHash": "GQbFl6JNwUfeVMRAQIxv+0FH84dIn8y+ZsWz3KR/dVMkJNNXpooEgJaT2UFkLhFNLf08uGLF+sf+HuE1qkdsqQ==", + 
"success": true, + "projectFilePath": "K:\\Proyects_Repos\\TorchSharp\\pkg\\FileRestitcher\\FileRestitcher\\FileRestitcher.csproj", + "expectedPackageFiles": [ + "C:\\Users\\Dimitri\\.nuget\\packages\\microsoft.netcore.platforms\\1.1.0\\microsoft.netcore.platforms.1.1.0.nupkg.sha512", + "C:\\Users\\Dimitri\\.nuget\\packages\\netstandard.library\\2.0.3\\netstandard.library.2.0.3.nupkg.sha512" + ], + "logs": [] +} \ No newline at end of file diff --git a/src/Native/build.cmd b/src/Native/build.cmd new file mode 100644 index 000000000..96ec8cacf --- /dev/null +++ b/src/Native/build.cmd @@ -0,0 +1,151 @@ +@if not defined _echo @echo off +setlocal + +:: Store current script directory before %~dp0 gets affected by another process later. +set __currentScriptDir=%~dp0 + +:SetupArgs +:: Initialize the args that will be passed to cmake +set __binDir=%__currentScriptDir%..\..\bin +set __rootDir=%__currentScriptDir%..\.. +set __CMakeBinDir="" +set __IntermediatesDir="" +set __BuildArch=x64 +set __VCBuildArch=x86_amd64 +set CMAKE_BUILD_TYPE=Debug +set LIBTORCH_PATH="" + +:Arg_Loop +if [%1] == [] goto :ToolsVersion +if /i [%1] == [Release] ( set CMAKE_BUILD_TYPE=Release&&shift&goto Arg_Loop) +if /i [%1] == [Debug] ( set CMAKE_BUILD_TYPE=Debug&&shift&goto Arg_Loop) + +if /i [%1] == [x86] ( set __BuildArch=x86&&set __VCBuildArch=x86&&shift&goto Arg_Loop) +if /i [%1] == [x64] ( set __BuildArch=x64&&set __VCBuildArch=x86_amd64&&shift&goto Arg_Loop) +if /i [%1] == [amd64] ( set __BuildArch=x64&&set __VCBuildArch=x86_amd64&&shift&goto Arg_Loop) + +if /i [%1] == [--libtorchpath] ( set LIBTORCH_PATH=%2&&shift&goto Arg_Loop) + +shift +goto :Arg_Loop + +:ToolsVersion +if defined VisualStudioVersion goto :RunVCVars + +set _VSWHERE="%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" +if exist %_VSWHERE% ( + for /f "usebackq tokens=*" %%i in (`%_VSWHERE% -latest -prerelease -property installationPath`) do set _VSCOMNTOOLS=%%i\Common7\Tools +) +if not exist "%_VSCOMNTOOLS%" set _VSCOMNTOOLS=%VS140COMNTOOLS% +if not exist "%_VSCOMNTOOLS%" goto :MissingVersion + + +set "VSCMD_START_DIR=%__currentScriptDir%" +call "%_VSCOMNTOOLS%\VsDevCmd.bat" + +:RunVCVars +if "%VisualStudioVersion%"=="17.0" ( + goto :VS2022 +) else if "%VisualStudioVersion%"=="16.0" ( + goto :VS2019 +) else if "%VisualStudioVersion%"=="15.0" ( + goto :VS2017 +) else if "%VisualStudioVersion%"=="14.0" ( + goto :VS2015 +) + +:MissingVersion +:: Can't find VS 2015, 2017 or 2019 +echo Error: Visual Studio 2015, 2017 or 2019 required +echo Please see https://github.com/dotnet/machinelearning/tree/master/Documentation for build instructions. 
+exit /b 1 + +:VS2022 +:: Setup vars for VS2022 +set __PlatformToolset=v143 +set __VSVersion=17 2022 +if NOT "%__BuildArch%" == "arm64" ( + :: Set the environment for the native build + call "%VS160COMNTOOLS%..\..\VC\Auxiliary\Build\vcvarsall.bat" %__VCBuildArch% +) +goto :SetupDirs + +:VS2019 +:: Setup vars for VS2019 +set __PlatformToolset=v142 +set __VSVersion=16 2019 +if NOT "%__BuildArch%" == "arm64" ( + :: Set the environment for the native build + call "%VS160COMNTOOLS%..\..\VC\Auxiliary\Build\vcvarsall.bat" %__VCBuildArch% +) +goto :SetupDirs + +:VS2017 +:: Setup vars for VS2017 +set __PlatformToolset=v141 +set __VSVersion=15 2017 +if NOT "%__BuildArch%" == "arm64" ( + :: Set the environment for the native build + call "%VS150COMNTOOLS%..\..\VC\Auxiliary\Build\vcvarsall.bat" %__VCBuildArch% +) +goto :SetupDirs + +:VS2015 +:: Setup vars for VS2015build +set __PlatformToolset=v140 +set __VSVersion=14 2015 +if NOT "%__BuildArch%" == "arm64" ( + :: Set the environment for the native build + call "%VS140COMNTOOLS%..\..\VC\vcvarsall.bat" %__VCBuildArch% +) + +:SetupDirs +:: Setup to cmake the native components +echo Commencing native build of dotnet/machinelearning +echo. + +if %__CMakeBinDir% == "" ( + set "__CMakeBinDir=%__binDir%\%__BuildArch%.%CMAKE_BUILD_TYPE%\Native" +) +if %__IntermediatesDir% == "" ( + set "__IntermediatesDir=%__binDir%\obj\%__BuildArch%.%CMAKE_BUILD_TYPE%\Native" +) +set "__CMakeBinDir=%__CMakeBinDir:\=/%" +set "__IntermediatesDir=%__IntermediatesDir:\=/%" + +:: Check that the intermediate directory exists so we can place our cmake build tree there +if not exist "%__IntermediatesDir%" md "%__IntermediatesDir%" + +:: Regenerate the VS solution + +set "__gen-buildsys-win-path=%__currentScriptDir%\gen-buildsys-win.bat" +set "__source-code-path=%__currentScriptDir%" + +echo Calling "%__gen-buildsys-win-path%" "%__source-code-path%" "%__VSVersion%" %__BuildArch% +pushd "%__IntermediatesDir%" +call "%__gen-buildsys-win-path%" "%__source-code-path%" "%__VSVersion%" %__BuildArch% +popd + +:CheckForProj +:: Check that the project created by Cmake exists +if exist "%__IntermediatesDir%\INSTALL.vcxproj" goto BuildNativeProj +goto :Failure + +:BuildNativeProj +:: Build the project created by Cmake +set __msbuildArgs=/p:Platform=%__BuildArch% /p:PlatformToolset="%__PlatformToolset%" + +cd %__rootDir% + +echo msbuild "%__IntermediatesDir%\INSTALL.vcxproj" /t:build /p:Configuration=%CMAKE_BUILD_TYPE% %__msbuildArgs% +call msbuild "%__IntermediatesDir%\INSTALL.vcxproj" /t:build /p:Configuration=%CMAKE_BUILD_TYPE% %__msbuildArgs% +IF ERRORLEVEL 1 ( + goto :Failure +) +echo Done building Native components +exit /B 0 + +:Failure +:: Build failed +echo Failed to generate native component build project! 
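+:: NOTE: usage sketch derived from the Arg_Loop parsing above (arguments may appear in any order):
+::   build.cmd Release x64 --libtorchpath K:\libtorch
+:: Debug/Release set CMAKE_BUILD_TYPE; x86/x64/amd64 select the target architecture.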
+exit /b 1 \ No newline at end of file diff --git a/src/TorchSharp/NN/Linear.cs b/src/TorchSharp/NN/Linear.cs index 4595582d7..e1b7b205c 100644 --- a/src/TorchSharp/NN/Linear.cs +++ b/src/TorchSharp/NN/Linear.cs @@ -11,10 +11,25 @@ namespace TorchSharp namespace Modules { + public class LinearInfo + { + public long InFeatures { get; } + public long OutFeatures { get; } + public LinearInfo(long inFeatures, long outFeatures) + { + InFeatures = inFeatures; + OutFeatures = outFeatures; + } + } public sealed class Linear : torch.nn.Module { - internal Linear(IntPtr handle, IntPtr boxedHandle) : base(handle, boxedHandle) + public LinearInfo linearInfo; + /*internal Linear(IntPtr handle, IntPtr boxedHandle) : base(handle, boxedHandle) + { + }*/ + internal Linear(IntPtr handle, IntPtr boxedHandle, long inFeat, long outFeat) : base(handle, boxedHandle) { + linearInfo = new LinearInfo(inFeat, outFeat); } public override Tensor forward(Tensor tensor) @@ -71,7 +86,7 @@ public static Linear Linear(long inputSize, long outputSize, bool hasBias = true var res = THSNN_Linear_ctor(inputSize, outputSize, hasBias, out var boxedHandle); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - return new Linear(res, boxedHandle).MoveModule(device, dtype); + return new Linear(res, boxedHandle, inputSize, outputSize).MoveModule(device, dtype); } public static partial class functional diff --git a/src/TorchVision/models/ResNet.cs b/src/TorchVision/models/ResNet.cs index 654d587c3..5eee7e5a2 100644 --- a/src/TorchVision/models/ResNet.cs +++ b/src/TorchVision/models/ResNet.cs @@ -581,7 +581,7 @@ public class ResNet : Module private readonly Module avgpool; private readonly Module flatten; - private readonly Module fc; + public readonly Module fc; private readonly Func> norm_layer; @@ -803,7 +803,7 @@ public ResNet(string name, break; } } - + if (zero_init_residual) { foreach (var (_, m) in named_modules()) { From eafdd1eccea359a27350c8c91af81f2631d0531e Mon Sep 17 00:00:00 2001 From: Dimitri Date: Sun, 21 Jul 2024 15:42:50 -0300 Subject: [PATCH 19/25] fix test? --- src/TorchSharp/Utils/FastTensorAccessor.cs | 712 +++++++++++++++++++++ src/TorchSharp/Utils/TensorAccessor.cs | 97 +-- test/TorchSharpTest/TorchSharpTest.csproj | 7 +- 3 files changed, 739 insertions(+), 77 deletions(-) create mode 100644 src/TorchSharp/Utils/FastTensorAccessor.cs diff --git a/src/TorchSharp/Utils/FastTensorAccessor.cs b/src/TorchSharp/Utils/FastTensorAccessor.cs new file mode 100644 index 000000000..142b95d6c --- /dev/null +++ b/src/TorchSharp/Utils/FastTensorAccessor.cs @@ -0,0 +1,712 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Runtime.InteropServices; +using static TorchSharp.PInvoke.NativeMethods; + +namespace TorchSharp.Utils +{ + /// + /// TensorAccessor is used to present the contents of a tensor or tensor view to the .NET world as an ordered collection + /// of values that integrates well with things like LINQ and foreach loops in the .NET world. + /// + /// The type of the tensor elements. + public sealed class FastTensorAccessor : IDisposable, IEnumerable where T : unmanaged + { + internal FastTensorAccessor(torch.Tensor tensor) + { + if (tensor.device_type != DeviceType.CPU) { + throw new InvalidOperationException("Reading data from non-CPU memory is not supported. 
Move or copy the tensor to the cpu before reading.");
+            }
+
+            var strides = tensor.stride();
+            for (var i = 0; i < strides.Length; i++) {
+                if (strides[i] < 0)
+                    throw new NotImplementedException($"Negative tensor strides are not currently supported. tensor.strides({i}) == {strides[i]}");
+            }
+
+            // Get the data from native code.
+
+            unsafe {
+                var res = THSTensor_data(tensor.Handle);
+                if (res == IntPtr.Zero) { torch.CheckForErrors(); }
+                // NOTE: there is no safety here.
+                _tensor_data_ptr = res;
+            }
+
+            _tensor = tensor; // Keep the tensor alive now that everything is alright.
+        }
+
+        /// <summary>
+        /// Caching the element count is important for performance; it is only consulted by CopyTo/CopyFrom.
+        /// Calling tensor.numel() on every iteration goes through native code and is CPU-intensive, and the
+        /// Count property effectively acts as a method call. For a 640*640*3 tensor (1,228,800 elements),
+        /// the property would otherwise be invoked over a million times. When all we want is a bulk copy,
+        /// there is no need to re-query numel() per element.
+        /// </summary>
+        internal long TempCount = -1;
+        public long Count => _tensor?.numel() ?? 0;
+
+        public bool IsReadOnly => false;
+
+        public T[] ToArray()
+        {
+            if (_tensor.ndim < 2)
+                return (T[])ToNDArray();
+
+            var shps = _tensor.shape;
+            TempCount = 1;
+            for (int i = 0; i < shps.Length; i++)
+                TempCount *= shps[i]; // numel is simply the product of the dimensions in the shape.
+
+            if (_tensor.is_contiguous()) { // Fast path: contiguous data can be copied with a single span copy.
+                unsafe {
+                    return new Span<T>(_tensor_data_ptr.ToPointer(), Convert.ToInt32(TempCount)).ToArray();
+                }
+            }
+            var result = new T[TempCount];
+            CopyTo(result);
+            return result;
+        }
+
+        /// <summary>
+        /// Extract tensor data as a multi-dimensional .NET array, with the same number of dimensions as the tensor.
+        /// </summary>
+        /// <returns>An array object, which should be cast to the concrete array type.</returns>
+ public Array ToNDArray() + { + var shape = _tensor.shape; + var strides = _tensor.stride(); + switch (_tensor.ndim) { + default: + return ToNDArray(shape, strides); + case 0: + unsafe { + var result = new T[1]; + T* ptr = (T*)_tensor_data_ptr; + result[0] = ptr[0]; + return result; + } + case 1: + unsafe { + var result = new T[shape[0]]; + T* ptr = (T*)_tensor_data_ptr; + for (long i0 = 0, off0 = 0; i0 < shape[0]; i0++, off0 += strides[0]) { + result[i0] = ptr[off0]; + } + return result; + } + case 2: + unsafe { + var result = new T[shape[0], shape[1]]; + T* ptr = (T*)_tensor_data_ptr; + for (long i0 = 0, off0 = 0; i0 < shape[0]; i0++, off0 += strides[0]) { + for (long i1 = 0, off1 = off0; i1 < shape[1]; i1++, off1 += strides[1]) { + result[i0, i1] = ptr[off1]; + } + } + return result; + } + case 3: + unsafe { + var result = new T[shape[0], shape[1], shape[2]]; + T* ptr = (T*)_tensor_data_ptr; + for (long i0 = 0, off0 = 0; i0 < shape[0]; i0++, off0 += strides[0]) { + for (long i1 = 0, off1 = off0; i1 < shape[1]; i1++, off1 += strides[1]) { + for (long i2 = 0, off2 = off1; i2 < shape[2]; i2++, off2 += strides[2]) { + result[i0, i1, i2] = ptr[off2]; + } + } + } + return result; + } + case 4: + unsafe { + var result = new T[shape[0], shape[1], shape[2], shape[3]]; + T* ptr = (T*)_tensor_data_ptr; + for (long i0 = 0, off0 = 0; i0 < shape[0]; i0++, off0 += strides[0]) { + for (long i1 = 0, off1 = off0; i1 < shape[1]; i1++, off1 += strides[1]) { + for (long i2 = 0, off2 = off1; i2 < shape[2]; i2++, off2 += strides[2]) { + for (long i3 = 0, off3 = off2; i3 < shape[3]; i3++, off3 += strides[3]) { + result[i0, i1, i2, i3] = ptr[off3]; + } + } + } + } + return result; + } + case 5: + unsafe { + var result = new T[shape[0], shape[1], shape[2], shape[3], shape[4]]; + T* ptr = (T*)_tensor_data_ptr; + for (long i0 = 0, off0 = 0; i0 < shape[0]; i0++, off0 += strides[0]) { + for (long i1 = 0, off1 = off0; i1 < shape[1]; i1++, off1 += strides[1]) { + for (long i2 = 0, off2 = off1; i2 < shape[2]; i2++, off2 += strides[2]) { + for (long i3 = 0, off3 = off2; i3 < shape[3]; i3++, off3 += strides[3]) { + for (long i4 = 0, off4 = off3; i4 < shape[4]; i4++, off4 += strides[4]) { + result[i0, i1, i2, i3, i4] = ptr[off4]; + } + } + } + } + } + return result; + } + case 6: + unsafe { + var result = new T[shape[0], shape[1], shape[2], shape[3], shape[4], shape[5]]; + T* ptr = (T*)_tensor_data_ptr; + for (long i0 = 0, off0 = 0; i0 < shape[0]; i0++, off0 += strides[0]) { + for (long i1 = 0, off1 = off0; i1 < shape[1]; i1++, off1 += strides[1]) { + for (long i2 = 0, off2 = off1; i2 < shape[2]; i2++, off2 += strides[2]) { + for (long i3 = 0, off3 = off2; i3 < shape[3]; i3++, off3 += strides[3]) { + for (long i4 = 0, off4 = off3; i4 < shape[4]; i4++, off4 += strides[4]) { + for (long i5 = 0, off5 = off4; i5 < shape[5]; i5++, off5 += strides[5]) { + result[i0, i1, i2, i3, i4, i5] = ptr[off5]; + } + } + } + } + } + } + return result; + } + } + } + + private Array ToNDArray(long[] shape, long[] strides) + { + Array array = Array.CreateInstance(typeof(T), shape); + long[] indexes = new long[_tensor.ndim]; + long[] off = new long[_tensor.ndim]; + + while (true) { + unsafe { + T* ptr = (T*)_tensor_data_ptr; + array.SetValue(ptr[off[array.Rank - 1]], indexes); + } + + for (int i = array.Rank - 1; i >= 0; i--) { + if (indexes[i] < shape[i] - 1) { + indexes[i]++; + off[i] += strides[i]; + for (int j = i; j < array.Rank - 1; j++) + off[j + 1] = off[j]; + break; + } else { + if (i == 0) { + return array; + } + indexes[i] = 0; + } 
+                }
+            }
+        }
+
+        /// <summary>
+        /// Access elements of the underlying tensor / tensor view.
+        /// </summary>
+        /// <param name="indices">A linear index into the data.</param>
+        /// <returns></returns>
+        public T this[params long[] indices] {
+            get {
+                long index = 0;
+                if (indices.Length == 1) {
+                    index = indices[0];
+                    validate(index);
+                    unsafe {
+                        T* ptr = (T*)_tensor_data_ptr;
+                        return ptr[TranslateIndex(index, _tensor)];
+                    }
+                } else {
+                    unsafe {
+                        T* ptr = (T*)_tensor_data_ptr;
+                        return ptr[TranslateIndex(indices, _tensor)];
+                    }
+                }
+            }
+            set {
+                long index = 0;
+                if (indices.Length == 1) {
+                    // Single linear index: translate it directly, as in the getter (translating the
+                    // one-element array would mis-handle tensors with more than one dimension).
+                    index = indices[0];
+                    validate(index);
+                    unsafe {
+                        T* ptr = (T*)_tensor_data_ptr;
+                        ptr[TranslateIndex(index, _tensor)] = value;
+                    }
+                } else {
+                    unsafe {
+                        T* ptr = (T*)_tensor_data_ptr;
+                        ptr[TranslateIndex(indices, _tensor)] = value;
+                    }
+                }
+            }
+        }
+
+        private void validate(long index)
+        {
+            if (index >= Count) throw new IndexOutOfRangeException();
+        }
+
+        public void CopyTo(T[] array, int arrayIndex = 0, long tensorIndex = 0)
+        {
+            int idx = arrayIndex;
+            /*if (_tensor.is_contiguous()) {
+                if (typeof(T) == typeof(float)) {
+                    float[] ff = new float[TempCount];
+                    Marshal.Copy(_tensor_data_ptr, ff, 0, ff.Length);
+                }
+            }*/
+            // Contiguous tensors enumerate offsets as a plain range from tensorIndex up to numel,
+            // so there is no need to materialize an index sequence; copy the elements directly.
+            if (_tensor.is_contiguous()) {
+                for (long i = tensorIndex; i < TempCount; i++)
+                    unsafe { array[idx++] = ((T*)_tensor_data_ptr)[i]; }
+                return;
+            }
+            foreach (int offset in GetSubsequentIndices(tensorIndex)) {
+                if (idx >= array.Length) break;
+                unsafe { array[idx] = ((T*)_tensor_data_ptr)[offset]; }
+                idx += 1;
+            }
+        }
+
+        public void CopyTo(Span<T> array, int arrayIndex = 0, long tensorIndex = 0)
+        {
+            int idx = arrayIndex;
+            foreach (int offset in GetSubsequentIndices(tensorIndex)) {
+                if (idx >= array.Length) break;
+                unsafe { array[idx] = ((T*)_tensor_data_ptr)[offset]; }
+                idx += 1;
+            }
+        }
+
+        public void CopyFrom(T[] array, int arrayIndex = 0, long tensorIndex = 0)
+        {
+            int idx = arrayIndex;
+            foreach (int offset in GetSubsequentIndices(tensorIndex)) {
+                if (idx >= array.Length) break;
+                unsafe { ((T*)_tensor_data_ptr)[offset] = array[idx]; }
+                idx += 1;
+            }
+        }
+
+        public void CopyFrom(ReadOnlySpan<T> array, int arrayIndex = 0, long tensorIndex = 0)
+        {
+            int idx = arrayIndex;
+            foreach (int offset in GetSubsequentIndices(tensorIndex)) {
+                if (idx >= array.Length) break;
+                unsafe { ((T*)_tensor_data_ptr)[offset] = array[idx]; }
+                idx += 1;
+            }
+        }
+
+        ///
+        /// Translates a linear index within the span represented by the accessor to a linear index
+        /// used by the underlying tensor. The two should only be different if the tensor is a view
+        /// rather than an allocated tensor.
+ /// + private static long TranslateIndex(long idx, torch.Tensor tensor) + { + if (idx >= tensor.numel() || idx < 0) + throw new ArgumentOutOfRangeException($"{idx} in a collection of ${tensor.numel()} elements."); + + if (tensor.is_contiguous() || idx == 0) return idx; + + long result = 0; + var shape = tensor.shape; + var strides = tensor.stride(); + + for (var i = shape.Length - 1; i >= 0; i--) { + idx = Math.DivRem(idx, shape[i], out long s); + result += s * strides[i]; + } + + return result; + } + /// + /// WARNING: Test purpose not use in production + /// + private long TranslateIndexNonStatic(long idx, torch.Tensor tensor) + { + if (idx >= TempCount || idx < 0) + throw new ArgumentOutOfRangeException($"{idx} in a collection of ${tensor.numel()} elements."); + + if (tensor.is_contiguous() || idx == 0) return idx; + + long result = 0; + var shape = tensor.shape; + var strides = tensor.stride(); + + for (var i = shape.Length - 1; i >= 0; i--) { + idx = Math.DivRem(idx, shape[i], out long s); + result += s * strides[i]; + } + + return result; + } + private static long TranslateIndex(long[] idx, torch.Tensor tensor) + { + long result = 0; + var shape = tensor.shape; + var strides = tensor.stride(); + + for (var i = shape.Length - 1; i >= 0; i--) { + if (idx[i] >= shape[i] || idx[i] < 0) + throw new IndexOutOfRangeException($"{idx[i]} >= {shape[i]} in dimension {i}."); + result += idx[i] * strides[i]; + } + + return result; + } + + internal static T ReadItemAt(torch.Tensor tensor, long index) + { + if (tensor.device_type != DeviceType.CPU) { + throw new InvalidOperationException("Reading data from non-CPU memory is not supported. Move or copy the tensor to the cpu before reading."); + } + + tensor.ValidateType(typeof(T)); + + var strides = tensor.stride(); + for (var i = 0; i < strides.Length; i++) { + if (strides[i] < 0) + throw new NotImplementedException($"Negative tensor strides are not currently supported. tensor.strides({i}) == {strides[i]}"); + } + + unsafe { + var res = THSTensor_data(tensor.Handle); + if (res == IntPtr.Zero) { torch.CheckForErrors(); } + // NOTE: there is no safety here. + T* ptr = (T*)res; + return ptr[TranslateIndex(index, tensor)]; + } + } + + /// + /// Compare two tensors element-wise. + /// + /// A tensor + /// Another tensor + /// + public static bool operator ==(FastTensorAccessor left, FastTensorAccessor right) + { + if (left.Count != right.Count) return false; + + var lEnum = left.GetEnumerator(); + var rEnum = right.GetEnumerator(); + + while (lEnum.MoveNext() && rEnum.MoveNext()) { + if (!lEnum.Current.Equals(rEnum.Current)) + return false; + } + return true; + } + + /// + /// Compare two tensors element-wise. 
+ /// + /// A tensor + /// Another tensor + /// + public static bool operator !=(FastTensorAccessor left, FastTensorAccessor right) + { + return !(left == right); + } + + + private IEnumerable GetSubsequentIndices(long startingIndex) + { + //TempCount = Count; + + if (startingIndex < 0 || startingIndex >= TempCount) + throw new ArgumentOutOfRangeException(nameof(startingIndex)); + + if (TempCount <= 1) { + if (TempCount == 0) { + return Enumerable.Empty(); + } + + return new List() { 0 }; + //return (new long[] { 0 }).AsEnumerable(); + } + + if (_tensor.is_contiguous()) { + return ContiguousIndices(startingIndex); + } + + var stride = _tensor.stride(); + Debug.Assert(stride.Length > 0); + + if (stride.Length == 1) { + return SimpleIndices(startingIndex, stride[0]); + } + + return MultiDimensionIndices(startingIndex); + } + private IEnumerable MultiDimensionIndices(long startingIndex) + { + long[] shape = _tensor.shape; + long[] stride = _tensor.stride(); + long[] inds = new long[stride.Length]; + + long index = startingIndex; + //long offset = TranslateIndex(startingIndex, _tensor); + long offset = TranslateIndexNonStatic(startingIndex, _tensor); //WARNING: Test purpose not use in production + + while (true) { + + index += 1; + + yield return offset; + + if (index >= TempCount) break; + + for (int i = inds.Length - 1; ; i--) { + Debug.Assert(i >= 0); + offset += stride[i]; + if (++inds[i] < shape[i]) + break; + + // Overflow of current dimension so rewind accordingly. + // Can't overflow the final (left-most) dimension. + Debug.Assert(i > 0); + // Note: for perf, this multiplication could be done once up front and cached in an array. + offset -= inds[i] * stride[i]; + inds[i] = 0; + } + } + } + + private IEnumerable SimpleIndices(long startingIndex, long stride) + { + long index = startingIndex; + //long offset = TranslateIndex(startingIndex, _tensor); + long offset = TranslateIndexNonStatic(startingIndex, _tensor); //WARNING: Test purpose not use in production + + while (index < TempCount) { + yield return offset; + offset += stride; + index += 1; + } + } + + private IEnumerable ContiguousIndices(long startingIndex) + { + // If there was an overload for Enumerable.Range that + // produced long integers, we wouldn't need this implementation. + + long index = startingIndex; + while (index < TempCount) { + yield return index; + index += 1; + } + } + + + /// + /// Compare two tensors element-wise. + /// + /// Another tensor + /// + public override bool Equals(object obj) + { + var left = this; + var right = obj as FastTensorAccessor; + if (right == null) return false; + + if (left._tensor_data_ptr == right._tensor_data_ptr) return true; + if (left.Count != right.Count) return false; + for (long i = 0; i < left.Count; i++) { + if (!left[i].Equals(right[i])) return false; + } + return true; + } + + public override int GetHashCode() + { + return base.GetHashCode(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + public void Dispose() + { + Dispose(true); + GC.SuppressFinalize(this); + } + + private void Dispose(bool disposing) + { + _tensor_data_ptr = IntPtr.Zero; + // Clear the tensor that we've been keeping alive. + _tensor = null; + } + + private torch.Tensor _tensor; // Keeping it alive. 
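+        // NOTE: worked example of the TranslateIndex stride walk above (illustrative): for a 2x3
+        // tensor transposed to shape [3, 2] with strides [1, 3], linear index 4 corresponds to
+        // coordinates [2, 0]. DivRem(4, 2) gives remainder 0 (dim 1 contributes 0 * 3), then
+        // DivRem(2, 3) gives remainder 2 (dim 0 contributes 2 * 1), so the storage offset is 2.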
+ private IntPtr _tensor_data_ptr; + +#if true + public IEnumerator GetEnumerator() + { + if (TempCount <= 1) { + if (TempCount == 0) + return Enumerable.Empty().GetEnumerator(); + return new T[1] { this[0] }.AsEnumerable().GetEnumerator(); + } + /*if (Count <= 1) { + if (Count == 0) + return Enumerable.Empty().GetEnumerator(); + return new T[1] { this[0] }.AsEnumerable().GetEnumerator(); + }*/ + + if (_tensor.is_contiguous()) { + return new SimpleAtorImpl(this, 1); + } + + var stride = _tensor.stride(); + Debug.Assert(stride.Length > 0); + + if (stride.Length == 1) { + return new SimpleAtorImpl(this, stride[0]); + } + + return new GeneralAtorImpl(this, stride); + } + + private class SimpleAtorImpl : IEnumerator + { + private FastTensorAccessor _span; + private readonly long _count; + private readonly long _stride; + + // State. + private long _index; + private long _offset; + private T _current; + + public SimpleAtorImpl(FastTensorAccessor span, long stride) + { + _span = span; + _count = span.TempCount; + Debug.Assert(_count > 0); + _stride = stride; + Reset(); + } + + public T Current => _current; + object IEnumerator.Current => Current; + + public void Dispose() + { + _span = null; + Reset(); + } + + public bool MoveNext() + { + if (_index < 0) { + _index = 0; + _offset = 0; + } else if (++_index >= _count) { + Reset(); + return false; + } else { + _offset += _stride; + } + + unsafe { _current = ((T*)_span._tensor_data_ptr)[_offset]; } + return true; + } + + public void Reset() + { + _index = -1; + _offset = -1; + _current = default; + } + } + + private class GeneralAtorImpl : IEnumerator + { + private FastTensorAccessor _span; + private readonly long _count; + private readonly long[] _shape; + private readonly long[] _stride; + private readonly long[] _inds; + + // State. + private long _index; + private long _offset; + + public GeneralAtorImpl(FastTensorAccessor span, long[] stride) + { + Debug.Assert(stride.Length > 1); + _span = span; + _count = span.TempCount; + Debug.Assert(_count > 0); + _shape = span._tensor.shape; + Debug.Assert(_shape.Length == stride.Length); + _stride = stride; + _inds = new long[stride.Length]; + Reset(); + } + + public T Current { get; private set; } + + object IEnumerator.Current => Current; + + public void Dispose() + { + // Just clear the span field. + _span = null; + } + + public bool MoveNext() + { + if (_index < 0) { + _index = 0; + _offset = 0; + Array.Clear(_inds, 0, _inds.Length); + } else if (++_index >= _count) { + Reset(); + return false; + } else { + for (int i = _inds.Length - 1; ; i--) { + Debug.Assert(i >= 0); + _offset += _stride[i]; + if (++_inds[i] < _shape[i]) + break; + + // Overflow of current dimension so rewind accordingly. + // Can't overflow the final (left-most) dimension. + Debug.Assert(i > 0); + // Note: for perf, this multiplication could be done once up front and cached in an array. 
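+                        // (Sketch of that caching idea, not part of this patch: at overflow
+                        // _inds[i] == _shape[i], so a rewind[] array with rewind[i] = _shape[i] * _stride[i],
+                        // computed once in the constructor, would let the next line become
+                        // `_offset -= rewind[i];`.)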
+        private class SimpleAtorImpl : IEnumerator<T>
+        {
+            private FastTensorAccessor<T> _span;
+            private readonly long _count;
+            private readonly long _stride;
+
+            // State.
+            private long _index;
+            private long _offset;
+            private T _current;
+
+            public SimpleAtorImpl(FastTensorAccessor<T> span, long stride)
+            {
+                _span = span;
+                _count = span.TempCount;
+                Debug.Assert(_count > 0);
+                _stride = stride;
+                Reset();
+            }
+
+            public T Current => _current;
+            object IEnumerator.Current => Current;
+
+            public void Dispose()
+            {
+                _span = null;
+                Reset();
+            }
+
+            public bool MoveNext()
+            {
+                if (_index < 0) {
+                    _index = 0;
+                    _offset = 0;
+                } else if (++_index >= _count) {
+                    Reset();
+                    return false;
+                } else {
+                    _offset += _stride;
+                }
+
+                unsafe { _current = ((T*)_span._tensor_data_ptr)[_offset]; }
+                return true;
+            }
+
+            public void Reset()
+            {
+                _index = -1;
+                _offset = -1;
+                _current = default;
+            }
+        }
+
+        private class GeneralAtorImpl : IEnumerator<T>
+        {
+            private FastTensorAccessor<T> _span;
+            private readonly long _count;
+            private readonly long[] _shape;
+            private readonly long[] _stride;
+            private readonly long[] _inds;
+
+            // State.
+            private long _index;
+            private long _offset;
+
+            public GeneralAtorImpl(FastTensorAccessor<T> span, long[] stride)
+            {
+                Debug.Assert(stride.Length > 1);
+                _span = span;
+                _count = span.TempCount;
+                Debug.Assert(_count > 0);
+                _shape = span._tensor.shape;
+                Debug.Assert(_shape.Length == stride.Length);
+                _stride = stride;
+                _inds = new long[stride.Length];
+                Reset();
+            }
+
+            public T Current { get; private set; }
+
+            object IEnumerator.Current => Current;
+
+            public void Dispose()
+            {
+                // Just clear the span field.
+                _span = null;
+            }
+
+            public bool MoveNext()
+            {
+                if (_index < 0) {
+                    _index = 0;
+                    _offset = 0;
+                    Array.Clear(_inds, 0, _inds.Length);
+                } else if (++_index >= _count) {
+                    Reset();
+                    return false;
+                } else {
+                    for (int i = _inds.Length - 1; ; i--) {
+                        Debug.Assert(i >= 0);
+                        _offset += _stride[i];
+                        if (++_inds[i] < _shape[i])
+                            break;
+
+                        // Overflow of current dimension so rewind accordingly.
+                        // Can't overflow the final (left-most) dimension.
+                        Debug.Assert(i > 0);
+                        // Note: for perf, this multiplication could be done once up front and cached in an array.
+                        _offset -= _inds[i] * _stride[i];
+                        _inds[i] = 0;
+                    }
+                }
+
+                unsafe { Current = ((T*)_span._tensor_data_ptr)[_offset]; }
+                return true;
+            }
+
+            public void Reset()
+            {
+                _index = -1;
+                _offset = -1;
+                Current = default;
+            }
+        }
+#else
+        public IEnumerator<T> GetEnumerator()
+        {
+            return new TensorAccessorEnumerator(this);
+        }
+#endif
+    }
+}
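The diff that follows reverts the `TempCount` caching hack in the original `TensorAccessor`. The underlying concern was real: `Count` calls `tensor.numel()` through native interop, so reading it once per element of a 640x640x3 tensor means over a million native calls. A sketch of the conventional fix (not part of the patch; names are illustrative): hoist the bound into a local instead of caching it in a mutable field, which avoids the staleness risk that got `TempCount` removed.

```csharp
// Standalone sketch: hoisting Count out of a hot loop over a TensorAccessor<T>.
using TorchSharp.Utils;

static class CopyLoops
{
    // Anti-pattern: re-reads Count (a native numel() call) on every iteration.
    public static void Slow(TensorAccessor<float> acc, float[] dst)
    {
        for (long i = 0; i < acc.Count && i < dst.Length; i++)
            dst[i] = acc[i];
    }

    // Hoisted: Count is read once; the loop body then stays in managed code.
    public static void Fast(TensorAccessor<float> acc, float[] dst)
    {
        long n = System.Math.Min(acc.Count, dst.Length);
        for (long i = 0; i < n; i++)
            dst[i] = acc[i];
    }
}
```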
diff --git a/src/TorchSharp/Utils/TensorAccessor.cs b/src/TorchSharp/Utils/TensorAccessor.cs
index f7f825ffc..31641529b 100644
--- a/src/TorchSharp/Utils/TensorAccessor.cs
+++ b/src/TorchSharp/Utils/TensorAccessor.cs
@@ -39,15 +39,7 @@ internal TensorAccessor(torch.Tensor tensor)
             _tensor = tensor; // Keep the tensor alive now that everything is alright.
         }
 
-        /// <summary>
-        /// This is important for performance: Count is only needed by CopyTo and CopyFrom, and calling tensor.numel() on every invocation is CPU-intensive.
-        /// This temporary count avoids that cost; the property otherwise behaves like a method call.
-        /// For a 640*640*3 tensor (1,228,800 elements), the property would otherwise be invoked over a million times.
-        /// If we only want to copy, it is not necessary to call that method so many times.
-        /// </summary>
-        internal long TempCount = -1;
-        public long Count => _tensor?.numel() ?? 0;
+        public long Count => (_tensor is not null ? _tensor.numel() : 0);
 
         public bool IsReadOnly => false;
 
@@ -56,17 +48,18 @@ public T[] ToArray()
             if (_tensor.ndim < 2)
                 return (T[])ToNDArray();
 
-            var shps = _tensor.shape;
-            TempCount = 1;
-            for (int i = 0; i < shps.Length; i++)
-                TempCount *= shps[i];
-
-            if (_tensor.is_contiguous()) {
-                unsafe {
-                    return new Span<T>(_tensor_data_ptr.ToPointer(), Convert.ToInt32(TempCount)).ToArray();
-                }
-            }
-
+
+            var result = new T[Count];
+            CopyTo(result);
             return result;
         }
 
@@ -253,18 +246,6 @@ private void validate(long index)
         public void CopyTo(T[] array, int arrayIndex = 0, long tensorIndex = 0)
         {
             int idx = arrayIndex;
-            /*if (_tensor.is_contiguous()) {
-                if (typeof(T) == typeof(float)) {
-                    float[] ff = new float[TempCount];
-                    Marshal.Copy(_tensor_data_ptr, ff, 0, ff.Length);
-                }
-            }*/
-            // The contiguous case covers the range from tensorIndex to numel() in order, so there is no need to "create" an arange-style index array first; the enumerable never materializes one anyway.
-            if (_tensor.is_contiguous()) {
-                for (long i = tensorIndex; i < TempCount; i++, idx++) {
-                    if (idx >= array.Length) break;
-                    unsafe { array[idx] = ((T*)_tensor_data_ptr)[i]; }
-                }
-                return;
-            }
             foreach (long offset in GetSubsequentIndices(tensorIndex)) {
                 if (idx >= array.Length) break;
                 unsafe { array[idx] = ((T*)_tensor_data_ptr)[offset]; }
@@ -325,27 +306,7 @@ private static long TranslateIndex(long idx, torch.Tensor tensor)
             return result;
         }
 
-        /// <summary>
-        /// WARNING: for test purposes only; do not use in production.
-        /// </summary>
-        private long TranslateIndexNonStatic(long idx, torch.Tensor tensor)
-        {
-            if (idx >= TempCount || idx < 0)
-                throw new ArgumentOutOfRangeException($"{idx} in a collection of ${tensor.numel()} elements.");
-
-            if (tensor.is_contiguous() || idx == 0) return idx;
-
-            long result = 0;
-            var shape = tensor.shape;
-            var strides = tensor.stride();
-
-            for (var i = shape.Length - 1; i >= 0; i--) {
-                idx = Math.DivRem(idx, shape[i], out long s);
-                result += s * strides[i];
-            }
-            return result;
-        }
+
         private static long TranslateIndex(long[] idx, torch.Tensor tensor)
         {
             long result = 0;
@@ -418,18 +379,15 @@ internal static T ReadItemAt(torch.Tensor tensor, long index)
 
         private IEnumerable<long> GetSubsequentIndices(long startingIndex)
         {
-            //TempCount = Count;
-
-            if (startingIndex < 0 || startingIndex >= TempCount)
+            if (startingIndex < 0 || startingIndex >= Count)
                 throw new ArgumentOutOfRangeException(nameof(startingIndex));
 
-            if (TempCount <= 1) {
-                if (TempCount == 0) {
+            if (Count <= 1) {
+                if (Count == 0) {
                     return Enumerable.Empty<long>();
                 }
 
-                return new List<long>() { 0 };
-                //return (new long[] { 0 }).AsEnumerable();
+                return (new long[] { 0 }).AsEnumerable();
             }
 
             if (_tensor.is_contiguous()) {
@@ -445,6 +403,7 @@ private IEnumerable<long> GetSubsequentIndices(long startingIndex)
 
             return MultiDimensionIndices(startingIndex);
         }
+
         private IEnumerable<long> MultiDimensionIndices(long startingIndex)
         {
             long[] shape = _tensor.shape;
@@ -452,8 +411,7 @@ private IEnumerable<long> MultiDimensionIndices(long startingIndex)
             long[] inds = new long[stride.Length];
 
             long index = startingIndex;
-            //long offset = TranslateIndex(startingIndex, _tensor);
-            long offset = TranslateIndexNonStatic(startingIndex, _tensor); // WARNING: for test purposes only; do not use in production.
+            long offset = TranslateIndex(startingIndex, _tensor);
 
             while (true) {
 
@@ -461,7 +419,7 @@ private IEnumerable<long> MultiDimensionIndices(long startingIndex)
 
                 yield return offset;
 
-                if (index >= TempCount) break;
+                if (index >= Count) break;
 
                 for (int i = inds.Length - 1; ; i--) {
                     Debug.Assert(i >= 0);
@@ -482,23 +440,21 @@ private IEnumerable<long> MultiDimensionIndices(long startingIndex)
         private IEnumerable<long> SimpleIndices(long startingIndex, long stride)
         {
             long index = startingIndex;
-            //long offset = TranslateIndex(startingIndex, _tensor);
-            long offset = TranslateIndexNonStatic(startingIndex, _tensor); // WARNING: for test purposes only; do not use in production.
+            long offset = TranslateIndex(startingIndex, _tensor);
 
-            while (index < TempCount) {
+            while (index < Count) {
                 yield return offset;
                 offset += stride;
                 index += 1;
             }
         }
-
         private IEnumerable<long> ContiguousIndices(long startingIndex)
         {
             // If there was an overload for Enumerable.Range that
             // produced long integers, we wouldn't need this implementation.
- + long index = startingIndex; - while (index < TempCount) { + while (index < Count) { yield return index; index += 1; } @@ -553,16 +509,11 @@ private void Dispose(bool disposing) #if true public IEnumerator GetEnumerator() { - if (TempCount <= 1) { - if (TempCount == 0) - return Enumerable.Empty().GetEnumerator(); - return new T[1] { this[0] }.AsEnumerable().GetEnumerator(); - } - /*if (Count <= 1) { + if (Count <= 1) { if (Count == 0) return Enumerable.Empty().GetEnumerator(); return new T[1] { this[0] }.AsEnumerable().GetEnumerator(); - }*/ + } if (_tensor.is_contiguous()) { return new SimpleAtorImpl(this, 1); @@ -592,7 +543,7 @@ private class SimpleAtorImpl : IEnumerator public SimpleAtorImpl(TensorAccessor span, long stride) { _span = span; - _count = span.TempCount; + _count = span.Count; Debug.Assert(_count > 0); _stride = stride; Reset(); @@ -647,7 +598,7 @@ public GeneralAtorImpl(TensorAccessor span, long[] stride) { Debug.Assert(stride.Length > 1); _span = span; - _count = span.TempCount; + _count = span.Count; Debug.Assert(_count > 0); _shape = span._tensor.shape; Debug.Assert(_shape.Length == stride.Length); diff --git a/test/TorchSharpTest/TorchSharpTest.csproj b/test/TorchSharpTest/TorchSharpTest.csproj index 808aa1ccf..065301040 100644 --- a/test/TorchSharpTest/TorchSharpTest.csproj +++ b/test/TorchSharpTest/TorchSharpTest.csproj @@ -13,7 +13,6 @@ trx $(OutputPath) 10.0 - Debug;Release;LibTorch2.3.1 @@ -114,7 +113,7 @@ - + @@ -123,6 +122,7 @@ + true true @@ -132,5 +132,4 @@ Obsolete,ExcludeFromCodeCoverage - - + \ No newline at end of file From c0883d9fad6686c38d33b6713332397b61e47c86 Mon Sep 17 00:00:00 2001 From: Dimitri Date: Sun, 21 Jul 2024 16:31:07 -0300 Subject: [PATCH 20/25] fix mac test? --- src/TorchSharp/NN/Module.cs | 4 ++-- src/TorchSharp/Torch.cs | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/TorchSharp/NN/Module.cs b/src/TorchSharp/NN/Module.cs index 19b64d8a9..f7309ed51 100644 --- a/src/TorchSharp/NN/Module.cs +++ b/src/TorchSharp/NN/Module.cs @@ -765,7 +765,7 @@ public virtual void register_module(string name, Module submodule) } submodule.RegisterComponents(); - if (!is_autocast_cache_enabled()) { + /*if (!is_autocast_cache_enabled()) { _internal_submodules.Add(name, submodule); return; } @@ -773,7 +773,7 @@ public virtual void register_module(string name, Module submodule) submodule = submodule.to(get_autocast_dtype(CUDA)); if (is_autocast_cpu_enabled()) submodule = submodule.to(get_autocast_dtype(CPU)); - + */ _internal_submodules.Add(name, submodule); } } diff --git a/src/TorchSharp/Torch.cs b/src/TorchSharp/Torch.cs index d10254a2c..bc019d8df 100644 --- a/src/TorchSharp/Torch.cs +++ b/src/TorchSharp/Torch.cs @@ -53,7 +53,8 @@ public static partial class torch public static string __version__ => libtorchPackageVersion; - internal static bool TryLoadNativeLibraryFromFile(string path, StringBuilder trace) { + internal static bool TryLoadNativeLibraryFromFile(string path, StringBuilder trace) + { bool ok; try { trace.AppendLine($" Trying to load native component {path}"); @@ -158,7 +159,7 @@ private static void LoadNativeBackend(bool useCudaBackend, out StringBuilder? 
tr var torchsharpLoc = Path.GetDirectoryName(typeof(torch).Assembly.Location); var packagesDir = Path.GetFullPath(Path.Combine(torchsharpLoc!, "..", "..", "..", "..")); var torchsharpHome = Path.GetFullPath(Path.Combine(torchsharpLoc!, "..", "..")); - //torchsharpLoc = @"K:\Proyects_Repos\TorchSharp"; + trace.AppendLine($" torchsharpLoc = {torchsharpLoc}"); trace.AppendLine($" packagesDir = {packagesDir}"); trace.AppendLine($" torchsharpHome = {torchsharpHome}"); @@ -204,8 +205,7 @@ private static void LoadNativeBackend(bool useCudaBackend, out StringBuilder? tr throw new NotSupportedException(message); } } - } - else { + } else { trace.AppendLine(" Giving up, TorchSharp.dll does not appear to have been loaded from package directories"); } if (!ok) { @@ -214,7 +214,7 @@ private static void LoadNativeBackend(bool useCudaBackend, out StringBuilder? tr throw new NotSupportedException(message); } } - + // Record the successful load if (useCudaBackend) @@ -265,8 +265,7 @@ private static bool CopyNativeComponentsIntoSingleDirectory(string packagesDir, public static bool TryInitializeDeviceType(DeviceType deviceType) { - if (deviceType == DeviceType.MPS && !isAppleSilicon) - { + if (deviceType == DeviceType.MPS && !isAppleSilicon) { return false; } @@ -280,8 +279,7 @@ public static bool TryInitializeDeviceType(DeviceType deviceType) public static void InitializeDeviceType(DeviceType deviceType) { - if (deviceType == DeviceType.MPS && !isAppleSilicon) - { + if (deviceType == DeviceType.MPS && !isAppleSilicon) { throw new InvalidOperationException($"Torch device type 'MPS' is not available on this platform."); } From 9ac78bd7ec50600fa137a97e05402b1121e357c3 Mon Sep 17 00:00:00 2001 From: Dimitri Date: Wed, 24 Jul 2024 19:08:23 -0300 Subject: [PATCH 21/25] AMP Problem outscope --- src/Examples.Utils/Examples.Utils.csproj | 2 +- src/TorchSharp/Amp/AMPManager.cs | 133 +++++++++++++++++++---- src/TorchSharp/Amp/AutocastMode.cs | 25 ++++- src/TorchSharp/Tensor/Tensor.cs | 29 ++--- src/TorchSharp/Utils/UnorderedMap.cs | 16 ++- 5 files changed, 161 insertions(+), 44 deletions(-) diff --git a/src/Examples.Utils/Examples.Utils.csproj b/src/Examples.Utils/Examples.Utils.csproj index 11a1f2b91..60dc0a292 100644 --- a/src/Examples.Utils/Examples.Utils.csproj +++ b/src/Examples.Utils/Examples.Utils.csproj @@ -26,7 +26,7 @@ - + diff --git a/src/TorchSharp/Amp/AMPManager.cs b/src/TorchSharp/Amp/AMPManager.cs index 29c5da90c..870728dca 100644 --- a/src/TorchSharp/Amp/AMPManager.cs +++ b/src/TorchSharp/Amp/AMPManager.cs @@ -1,65 +1,154 @@ using System; using System.Collections.Generic; -using System.Runtime.InteropServices; -using System.Text; -using Google.Protobuf.WellKnownTypes; +using System.Diagnostics; using TorchSharp.PInvoke; -using TorchSharp.Utils; namespace TorchSharp.Amp { public class AMPManager : IDisposable { + //TODO: Make Singleton THREADSAFE - public UnorderedMap TensorPtrs= new UnorderedMap(); + public class TensorConverter + { + //public torch.Tensor Tensor; + public IntPtr PrevHandle; + public IntPtr Handle; + public torch.ScalarType Dtype; + public torch.ScalarType FastDtype; + public TensorCalledIn Called, Status; + public enum TensorCalledIn + { + OutSide, + InsideEnter + } + + public TensorConverter(IntPtr handle) + { + this.PrevHandle = handle; + this.Handle = handle; + this.Dtype = (torch.ScalarType)NativeMethods.THSTensor_type(handle); + this.FastDtype = AutocastMode.GetInstance().GetFastType(); + + Status = TensorConverter.TensorCalledIn.InsideEnter; + } + /*public 
TensorConverter(torch.Tensor tensor) : this(tensor.handle) + { + this.Tensor = tensor; + }*/ + } + + public IList TensorsCasts = new List(); + public bool IsEnter = false; + public bool IsDisposed = false; + /*public UnorderedMap TensorPtrs= new UnorderedMap(); + public UnorderedMap TensorMap= new UnorderedMap();*/ private readonly AutocastMode autocastMode = AutocastMode.GetInstance(); private AMPManager() { } public bool IsEnabled => autocastMode.Enabled; private static AMPManager Instance; - //bool disposedValue; - public static AMPManager GetInstance() { return Instance ??= new AMPManager(); } - private void To(IntPtr ptr, torch.ScalarType type) + private torch.ScalarType GetType(IntPtr handle) + { + return (torch.ScalarType)NativeMethods.THSTensor_type(handle); + } + private IntPtr To(IntPtr ptr, torch.ScalarType type) { + Debug.WriteLine($"{nameof(AMPManager)} Tensor converting from: {(torch.ScalarType)NativeMethods.THSTensor_type(ptr)} to: {type}"); var res = NativeMethods.THSTensor_to_type(ptr, (sbyte)type); if (res == IntPtr.Zero) torch.CheckForErrors(); + return res; } private void Revert() { - using (var enumer = TensorPtrs.GetEnumerator()) - while (enumer.MoveNext()) - To(enumer.Current.Key, enumer.Current.Value); + for (int i = 0; i < TensorsCasts.Count; i++) { + var tc = TensorsCasts[i]; + //var tt = new torch.Tensor(tc.Handle); + //var t = new torch.Tensor(tc.Handle) { handle = To(tc.Handle, tc.Dtype) }; + //var t = new torch.Tensor(tc.Handle).to(tc.Dtype); + tc.Handle= To(tc.Handle, tc.Dtype); + if (tc.Handle != tc.PrevHandle) + tc.PrevHandle = To(tc.PrevHandle, tc.Dtype); + } + //Cast Work very well but UNCASTING (if outscope, not working i dont know why...) + //TensorsCasts.Clear(); } + - public void Add(IntPtr ptr) + private int ExistsHandle(IntPtr handle) { - if (!autocastMode.Enabled) { - - if (TensorPtrs.ContainsKey(ptr)) - To(ptr, TensorPtrs[ptr]); - return; + for (int i = 0; i < TensorsCasts.Count; i++) + if (TensorsCasts[i].PrevHandle == handle || TensorsCasts[i].Handle == handle) + return i; + return -1; + } + + public IntPtr Work(IntPtr handle, IntPtr prev) + { + + /*if (IsDisposed && !IsEnter) { + Revert(); //Is for cleaned all + return IntPtr.Zero; + }*/ + var idx = ExistsHandle(handle); + Console.WriteLine($"PTR: {handle}, PREV: {prev}, IDX: {idx}"); + if (idx == -1) { + var tc = new TensorConverter(handle) { Called = IsEnter + ? 
TensorConverter.TensorCalledIn.InsideEnter + : TensorConverter.TensorCalledIn.OutSide + }; + if (IsEnter) + tc.Handle = To(tc.Handle, tc.FastDtype); + TensorsCasts.Add(tc); + return tc.Handle; } + var tcidx = TensorsCasts[idx]; + if (!IsEnter && IsDisposed) { + if (tcidx.Called == TensorConverter.TensorCalledIn.OutSide) { //Is created outside so this can revert + //Is From Outside and is disposed, the tensor is created Outside so i will revert this + tcidx.PrevHandle = tcidx.Handle; + tcidx.Handle = To(tcidx.Handle, tcidx.Dtype); + } + return tcidx.Handle; + } + if (GetType(tcidx.Handle) == tcidx.FastDtype) + return tcidx.Handle; - TensorPtrs[ptr] = (torch.ScalarType)NativeMethods.THSTensor_type(ptr); - To(ptr, autocastMode.GetFastType()); //TODO: Set scalar autocast + if (IsEnter) { + tcidx.PrevHandle = tcidx.Handle; + tcidx.Handle = To(tcidx.Handle, tcidx.FastDtype); + } + return tcidx.Handle; } - + public IDisposable Enter() { - return null; + IsEnter = true; + IsDisposed = false; + Debug.WriteLine($"{nameof(AMPManager)} Enter call"); + return this; } protected virtual void Dispose(bool disposing) { + + Debug.WriteLine($"{nameof(AMPManager)} Disposed call"); Revert(); + + IsDisposed = true; + IsEnter = false; + + //Work(IntPtr.Zero, IntPtr.Zero); autocastMode.Dispose(); - TensorPtrs.Dispose(); + //Revert(); + /*TensorPtrs.Dispose(); + TensorMap.Dispose();*/ /*if (!disposedValue) { if (disposing) { diff --git a/src/TorchSharp/Amp/AutocastMode.cs b/src/TorchSharp/Amp/AutocastMode.cs index 0287e02d6..720fb3e67 100644 --- a/src/TorchSharp/Amp/AutocastMode.cs +++ b/src/TorchSharp/Amp/AutocastMode.cs @@ -23,7 +23,7 @@ public sealed class AutocastMode : IDisposable internal torch.ScalarType fast_dtype = torch.ScalarType.Float32; public torch.Device Device = new torch.Device(DeviceType.CUDA); private static AutocastMode instance; - bool disposedValue; + //bool disposedValue; /*public static AutocastMode GetInstance(torch.Device dev, torch.ScalarType? dtype = null, bool enabled = true, bool? cache_enabled = null) { @@ -93,7 +93,26 @@ internal torch.Tensor CastTensor(torch.Tensor tensor) private void Dispose(bool disposing) { - if (!disposedValue) { + this.Enabled = false; + if (Device.type == DeviceType.CUDA) { + if (torch.autocast_decrement_nesting() == 0) + torch.clear_autocast_cache(); + torch.set_autocast_gpu_dtype(this.fast_dtype); + //torch.set_autocast_enabled(this.Prev); + torch.set_autocast_enabled(false); + torch.set_autocast_cache_enabled(false); + } + + if (Device.type == DeviceType.CPU) { + if (torch.autocast_decrement_nesting() == 0) + torch.clear_autocast_cache(); + //torch.set_autocast_enabled(this.Prev); + torch.set_autocast_cpu_dtype(this.fast_dtype); + torch.set_autocast_enabled(false); + torch.set_autocast_cache_enabled(false); + } + //disposedValue = true; + /*if (!disposedValue) { if (disposing) { this.Enabled = false; @@ -121,7 +140,7 @@ private void Dispose(bool disposing) // TODO: free unmanaged resources (unmanaged objects) and override finalizer // TODO: set large fields to null disposedValue = true; - } + }*/ } // // TODO: override finalizer only if 'Dispose(bool disposing)' has code to free unmanaged resources diff --git a/src/TorchSharp/Tensor/Tensor.cs b/src/TorchSharp/Tensor/Tensor.cs index 0e5b76537..2ec774b2e 100644 --- a/src/TorchSharp/Tensor/Tensor.cs +++ b/src/TorchSharp/Tensor/Tensor.cs @@ -38,24 +38,18 @@ public partial class Tensor : IDisposable //internal AutocastDisposeScope? 
AutocastDisposeScope; internal Tensor(IntPtr handle) { - this.handle = handle; - /*if (AMPManager.GetInstance().IsEnabled) - AMPManager.GetInstance().Add(handle); //MMM.... This is the more abstract of any method Tensor right????*/ - - /*if (_totalCount > 0) { - //have used - AutocastDisposeScope = AutocastDisposeManager.ThreadAutocastSingleton.RegisterTensorAutocastScope(this); - this = AutocastDisposeScope.autocastMode.CastTensor(this); //should cast when using INSIDE NOT WHERE CREATED - }*/ - System.Threading.Interlocked.Increment(ref _totalCount); - _peakCount = Math.Max(_totalCount, _peakCount); - OwningDisposeScope = DisposeScopeManager.ThreadSingleton.RegisterOnCurrentDisposeScope(this); //TODO: Add Autocast/AMP ScopeManager, need improve this.. 1) is not threadsafe and may have big problem while casting and uncasting. //DANGER: DONT USE THIS ON PRODUCTION - /*AutocastDisposeScope = AutocastDisposeManager.ThreadAutocastSingleton.RegisterTensorAutocastScope(this); - this = AutocastDisposeScope.autocastMode.CastTensor(this); //should cast when using INSIDE NOT WHERE CREATED*/ - //Should cast inner scope when get tensors for every each method? example prod, sum, div, reshape, etc??? + if (AMPManager.GetInstance().IsEnabled) { + this.handle = AMPManager.GetInstance().Work(handle, this.handle); //MMM.... This is the more abstract of any method Tensor right???? + } else { + this.handle = handle; + } + + System.Threading.Interlocked.Increment(ref _totalCount); + _peakCount = Math.Max(_totalCount, _peakCount); + OwningDisposeScope = DisposeScopeManager.ThreadSingleton.RegisterOnCurrentDisposeScope(this); } /// @@ -226,8 +220,9 @@ public IntPtr Handle { if (handle == IntPtr.Zero) throw new InvalidOperationException("Tensor invalid -- empty handle."); - //AutocastDisposeScope.autocastMode.CastTensor(this); //This is wrong right??? - + /*if (AMPManager.GetInstance().IsEnabled) { + this.handle = AMPManager.GetInstance().Work(handle, this.handle); //MMM.... This is the more abstract of any method Tensor right???? + }*/ return handle; } } diff --git a/src/TorchSharp/Utils/UnorderedMap.cs b/src/TorchSharp/Utils/UnorderedMap.cs index 7db88a94c..f890d7a56 100644 --- a/src/TorchSharp/Utils/UnorderedMap.cs +++ b/src/TorchSharp/Utils/UnorderedMap.cs @@ -1,5 +1,7 @@ using System; +using System.Collections; using System.Collections.Generic; +using System.Linq; using System.Text; namespace TorchSharp.Utils @@ -9,11 +11,23 @@ public class UnorderedMap : Dictionary, IDisposable bool disposedValue; public UnorderedMap() { } + private static bool IsCollectionType(Type type) + { + if (!type.GetGenericArguments().Any()) + return false; + Type genericTypeDefinition = type.GetGenericTypeDefinition(); + var collectionTypes = new[] { typeof(IEnumerable<>), typeof(ICollection<>), typeof(IList<>), typeof(List<>), typeof(IList) }; + return collectionTypes.Any(x => x.IsAssignableFrom(genericTypeDefinition)); + } public new TValue this[TKey tk] { get { if (this.ContainsKey(tk)) return base[tk]; - return default(TValue); + var t = typeof(TValue); + if (!IsCollectionType(t)) + return default; + base[tk] = (TValue)(IList)Activator.CreateInstance(typeof(List<>).MakeGenericType(t.GetGenericArguments())); + return base[tk]; } set { if (!this.ContainsKey(tk)) { From 21ce055d6e9083fb0c92b6dbd91e3ffc917cf0e6 Mon Sep 17 00:00:00 2001 From: Dimitri Date: Tue, 3 Sep 2024 17:25:54 -0300 Subject: [PATCH 22/25] some gradscaler. 
Need grad_scale and found_inf attr in optimizer --- src/Native/LibTorchSharp/CMakeLists.txt | 5 + src/Native/LibTorchSharp/THSAmp.cpp | 23 ++- src/Native/LibTorchSharp/THSAmp.h | 12 +- src/Native/LibTorchSharp/THSCuda.cpp | 15 +- src/Native/LibTorchSharp/THSCuda.h | 4 +- src/TorchSharp/Amp/GradScaler.cs | 145 ++++++++++++++++-- .../PInvoke/LibTorchSharp.THSAmp.cs | 9 ++ src/TorchSharp/Tensor/torch.Amp.cs | 29 ++++ src/TorchSharp/Utils/UnorderedMap.cs | 10 +- 9 files changed, 229 insertions(+), 23 deletions(-) diff --git a/src/Native/LibTorchSharp/CMakeLists.txt b/src/Native/LibTorchSharp/CMakeLists.txt index 1565eae2d..f94d70302 100644 --- a/src/Native/LibTorchSharp/CMakeLists.txt +++ b/src/Native/LibTorchSharp/CMakeLists.txt @@ -1,8 +1,11 @@ project(LibTorchSharp) find_package(CUDA) +IF(CUDA_FOUND) include_directories(${CUDA_INCLUDE_DIRS}) link_directories(${CUDA_LIBRARY_DIRS}) +add_compile_definitions(TORCHSHARP_CUDA_TOOLKIT_FOUND) +ENDIF() if(APPLE AND NOT LIBTORCH_ARCH STREQUAL "arm64") include_directories("/usr/local/include" "/usr/local/opt/llvm/include") @@ -79,7 +82,9 @@ include_directories(${TORCH_INCLUDE_DIRS}) add_library(LibTorchSharp SHARED ${SOURCES} ${RESOURCES}) +IF(CUDA_FOUND) target_link_libraries(LibTorchSharp ${CUDA_LIBRARIES}) +ENDIF() target_link_libraries(LibTorchSharp ${TORCH_LIBRARIES}) diff --git a/src/Native/LibTorchSharp/THSAmp.cpp b/src/Native/LibTorchSharp/THSAmp.cpp index 2f6a603e5..0b4f29cb8 100644 --- a/src/Native/LibTorchSharp/THSAmp.cpp +++ b/src/Native/LibTorchSharp/THSAmp.cpp @@ -3,6 +3,8 @@ #include #include +#include "torch/torch.h" +#include "torch/cuda.h" /*void THSAmp_amp_foreach_non_finite_check_and_unscale_(const at::TensorList self, at::Tensor& found_inf, const at::Tensor& inv_scale) { @@ -12,14 +14,25 @@ void THSAmp_amp_foreach_non_finite_check_and_unscale_(Tensor* self, const int64_t tLength, at::Tensor& found_inf, const at::Tensor& inv_scale) { torch::_amp_foreach_non_finite_check_and_unscale_(toTensors((torch::Tensor**)self, tLength),found_inf,inv_scale); - } -/*void THSAmp_amp_update_scale_(Tensor* self, const int64_t tLength, __resharper_unknown_type& found_inf, const __resharper_unknown_type& inv_scale) -{ - torch::_amp_update_scale() -}*/ +Tensor THSAmp_amp_update_scale_(at::Tensor& self, at::Tensor& growth_tracker, const at::Tensor& found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) { + CATCH_TENSOR(torch::_amp_update_scale_(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval);) +} +Tensor THSAmp_amp_update_scale_out(at::Tensor& out, const at::Tensor& self, at::Tensor& growth_tracker, const at::Tensor& found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval){ + CATCH_TENSOR(torch::_amp_update_scale_out(out, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval);) +} +Tensor THSAmp_amp_update_scale_outf(const at::Tensor& self, at::Tensor& growth_tracker, const at::Tensor& found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, at::Tensor& out){ + CATCH_TENSOR(torch::_amp_update_scale_outf(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out);) +} +Tensor THSAMP_amp_update_scale(const at::Tensor& self, const at::Tensor& growth_tracker, const at::Tensor& found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, Tensor* sec) +{ + std::tuple res; + CATCH(res = 
torch::_amp_update_scale(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval);) + *sec = ResultTensor(std::get<1>(res)); + return ResultTensor(std::get<0>(res)); +} bool THSAmp_is_torch_function_mode_enabled() { diff --git a/src/Native/LibTorchSharp/THSAmp.h b/src/Native/LibTorchSharp/THSAmp.h index 27183ef14..3a0718db4 100644 --- a/src/Native/LibTorchSharp/THSAmp.h +++ b/src/Native/LibTorchSharp/THSAmp.h @@ -2,16 +2,20 @@ #pragma once #include "../Stdafx.h" - -#include "torch/torch.h" - #include "Utils.h" //https://github.com/pytorch/pytorch/blob/main/torch/_meta_registrations.py#L5957 //EXPORT_API(void) THSAmp_amp_foreach_non_finite_check_and_unscale_(const at::TensorList self, at::Tensor& found_inf, const at::Tensor& inv_scale); EXPORT_API(void) THSAmp_amp_foreach_non_finite_check_and_unscale_(Tensor* self, const int64_t tLength, at::Tensor& found_inf, const at::Tensor& inv_scale); -//EXPORT_API(void) THSAmp_amp_update_scale_(at::Tensor& found_inf, const at::Tensor& inv_scale); + +//EXPORT_API(void) THSAmp_amp_update_scale_(const at::Tensor& self, const at::Tensor& inv_scale); + +EXPORT_API(Tensor) THSAmp_amp_update_scale_(at::Tensor& self, at::Tensor& growth_tracker, const at::Tensor& found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval); +EXPORT_API(Tensor) THSAmp_amp_update_scale_out(at::Tensor& out, const at::Tensor& self, at::Tensor& growth_tracker, const at::Tensor& found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval); +EXPORT_API(Tensor) THSAmp_amp_update_scale_outf(const at::Tensor& self, at::Tensor& growth_tracker, const at::Tensor& found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, at::Tensor& out); +EXPORT_API(Tensor) THSAMP_amp_update_scale(const at::Tensor& self, const at::Tensor& growth_tracker, const at::Tensor& found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, Tensor* sec); + EXPORT_API(bool) THSAmp_is_torch_function_mode_enabled(); //Maybe the best work is call THSTorch_is_autocast_enabled(enum of devices c# as int8_t); diff --git a/src/Native/LibTorchSharp/THSCuda.cpp b/src/Native/LibTorchSharp/THSCuda.cpp index 475187beb..01d583229 100644 --- a/src/Native/LibTorchSharp/THSCuda.cpp +++ b/src/Native/LibTorchSharp/THSCuda.cpp @@ -4,22 +4,31 @@ #include #include - +#ifdef TORCHSHARP_CUDA_TOOLKIT_FOUND cudaDeviceProp THSCuda_get_device_prop() { int device = 0; cudaDeviceProp cdp; - //cudaGetDeviceProperties_v2(&cdp, device); - cudaGetDeviceProperties(&cdp, device); + //cudaGetDeviceProperties(&cdp, device); + cudaGetDeviceProperties_v2(&cdp, device); return cdp; } +#endif int THSCuda_get_major_compute_capability() { +#ifdef TORCHSHARP_CUDA_TOOLKIT_FOUND return THSCuda_get_device_prop().major; +#else + return -1; +#endif } int THSCuda_get_minor_compute_capability() { +#ifdef TORCHSHARP_CUDA_TOOLKIT_FOUND return THSCuda_get_device_prop().minor; +#else + return -1; +#endif } diff --git a/src/Native/LibTorchSharp/THSCuda.h b/src/Native/LibTorchSharp/THSCuda.h index 2c6e6c17f..c951dd7a2 100644 --- a/src/Native/LibTorchSharp/THSCuda.h +++ b/src/Native/LibTorchSharp/THSCuda.h @@ -6,11 +6,13 @@ #include "torch/torch.h" #include "Utils.h" - +#ifdef TORCHSHARP_CUDA_TOOLKIT_FOUND #include "cuda.h" #include "cuda_runtime_api.h" cudaDeviceProp THSCuda_get_device_prop(); +#endif + EXPORT_API(int) THSCuda_get_major_compute_capability(); EXPORT_API(int) 
THSCuda_get_minor_compute_capability(); \ No newline at end of file diff --git a/src/TorchSharp/Amp/GradScaler.cs b/src/TorchSharp/Amp/GradScaler.cs index be4833f4f..b2cbd3988 100644 --- a/src/TorchSharp/Amp/GradScaler.cs +++ b/src/TorchSharp/Amp/GradScaler.cs @@ -4,18 +4,23 @@ using System.Linq; using System.Text; using System.Threading.Tasks; +using Tensorboard; using TorchSharp.Modules; using TorchSharp.Utils; namespace TorchSharp.Amp { - public class GradScaler + public class GradScaler : IDisposable { private bool Enabled; public torch.Device device; private torch.Tensor _scale, _growth_tracker; - private float InitScale, GrowthFactor, BackoffFactor, GrowthInterval, InitGrowthTracker; + private float InitScale, InitGrowthTracker; + public float _growth_factor { set; get; } + public float _backoff_factor { set; get; } + private int _growth_interval { set; get; } private UnorderedMap> _per_optimizer_states = new UnorderedMap>(); + bool disposedValue; public enum OptState { @@ -38,9 +43,9 @@ public GradScaler(torch.Device dev, float init_scale = 2.0e16f, float growth_fac device = dev; Enabled = enabled; InitScale = init_scale; - GrowthFactor = growth_factor; - BackoffFactor = backoff_factor; - GrowthInterval = growth_interval; + this._growth_factor = growth_factor; + _backoff_factor = backoff_factor; + _growth_interval = growth_interval; InitGrowthTracker = 0.0f; throw new NotImplementedException("This need to finish"); @@ -218,17 +223,44 @@ public void unscale(torch.optim.Optimizer optimizer) //https://github.com/pytorch/pytorch/blob/a00fad017719346bac6e08da0819358146e647e3/torch/amp/grad_scaler.py#L398 var f = optimizer.GetType().GetField("_step_support_amp_scaling"); if (f != null && f.GetValue(optimizer) is bool b && !b) { + bool has_grad_scaler = false;//I dont know how deal this... + if (has_grad_scaler) { + } else { + if (optimizer_state["stage"] is OptState optstate && optstate == OptState.Ready) + check_inf_per_device(optimizer); + var scaler = _get_scale_async(); + Debug.Assert(!scaler.is_null(), "!scaler.is_null()"); + torch.Tensor found_inf; + if (optimizer_state["found_inf_per_device"] is torch.Tensor[] ts) { + for (int i = 0; i < ts.Length; i++) + ts[i].to(scaler.device, true); + found_inf=torch.sum(torch.cat(ts)); + } + //if(optimizer is SGD ad) + //Info: All optimizer have grad_scale and found_inf //https://github.com/pytorch/pytorch/blob/main/torch/optim/adam.py, etc. 
+ //DANGER: Optimizer in TorchShapr not have grad_scaler or found_inf, we need grad_scale for https://github.com/pytorch/pytorch/blob/758d78790164bfb041555daed380de96e06f78a3/torch/amp/grad_scaler.py#L440 + + //optimizer.GetType().GetField("grad_scale").GetValue(optimizer) as torch.Tensor t + } + retval = optimizer.step().item(); + optimizer_state["stage"] = OptState.Stepped; + //https://github.com/pytorch/pytorch/blob/758d78790164bfb041555daed380de96e06f78a3/torch/amp/grad_scaler.py#L445 + return retval; } if (optimizer_state["stage"] is OptState state1 && state1 == OptState.Ready) unscale(optimizer); - Debug.Assert((optimizer_state["found_inf_per_device"] as float[]).Length > 0, "(optimizer_state['found_inf_per_device'] as float[]).Length > 0"); - + Debug.Assert((optimizer_state["found_inf_per_device"] as torch.Tensor[]).Length > 0, "(optimizer_state['found_inf_per_device'] as torch.Tensor).size(0) > 0"); retval = maybe_opt_step(optimizer, optimizer_state); optimizer_state["stage"] = OptState.Stepped; return retval; } + private torch.Tensor _get_scale_async() + { + return _scale; + } + /// /// /// @@ -252,9 +284,104 @@ public void update(object new_scale = null) _scale.copy_(t); } } else { - //var found_infs = + IList found_infs = new List(); + foreach (var state in _per_optimizer_states) + foreach (var found_inf in state.Value) + if(found_inf.Value is torch.Tensor t) + found_infs.Add(t); + Debug.Assert(found_infs.Count > 0, "No inf checks were recorded prior to update."); + torch.Tensor found_inf_combined = found_infs[0]; + if (found_infs.Count > 1) + for (int i = 1; i < found_infs.Count; i++) + found_inf_combined += found_infs[i]; + torch.amp_update_scale_(_scale, _growth_tracker, found_inf_combined, (double)_growth_factor, (double)_backoff_factor, (long)_growth_interval); + + } + //TODO: Implement defaultdict https://github.com/pytorch/pytorch/blob/758d78790164bfb041555daed380de96e06f78a3/torch/amp/grad_scaler.py#L531 + } + + public float get_scale() + { + if (this.Enabled) { + + var scale = _get_scale_async(); + if (scale.is_null()) + return InitScale; + return scale.item(); + } + return 1.0f; + } + + public bool IsEnabled() + { + return this.Enabled; + } + + public UnorderedMap state_dict() + { + if (Enabled) { + var res = new UnorderedMap(); + res["scale"] = get_scale(); + res[nameof(_growth_factor)] = _growth_factor; + res[nameof(_backoff_factor)] = _backoff_factor; + res[nameof(_growth_interval)] = _growth_interval; + res[nameof(_growth_tracker)] = _growth_tracker; + return res; } - + return null; + } + + public void load_state_dict(Dictionary state_dict) + { + if (!Enabled) + return; + if (state_dict.Count == 0) + throw new Exception("The source state dict is empty, possibly because it was saved from a disabled instance of GradScaler."); + //TODO: implement reflection to set field/properties based on state_dict + } + + torch.Tensor check_inf_per_device(torch.optim.Optimizer optimizer) + { + _scale = check_scale_growth_tracker(nameof(check_inf_per_device)).Item1; + var dummy_inv_scale = torch.full(new ReadOnlySpan(new long[] { 0 }), 1.0f, torch.ScalarType.Float32, _scale.device); + var foundd_inf = torch.full(new ReadOnlySpan(new long[] { 0 }), 0.0f, torch.ScalarType.Float32, _scale.device); + _per_optimizer_states[optimizer.GetHashCode()]["found_inf_per_device"] = unscale_grads(optimizer, dummy_inv_scale, foundd_inf, true); + return _per_optimizer_states[optimizer.GetHashCode()]["found_inf_per_device"] as torch.Tensor; + } + + private object 
_found_inf_per_device(torch.optim.Optimizer optimizer) + { + return _per_optimizer_states[optimizer.GetHashCode()]["found_inf_per_device"]; + } + + protected virtual void Dispose(bool disposing) + { + if (!disposedValue) { + if (disposing) { + _per_optimizer_states.Dispose(); + _growth_tracker.Dispose(); + _scale.Dispose(); + // TODO: dispose managed state (managed objects) + } + + // TODO: free unmanaged resources (unmanaged objects) and override finalizer + // TODO: set large fields to null + disposedValue = true; + } + } + + // // TODO: override finalizer only if 'Dispose(bool disposing)' has code to free unmanaged resources + // ~GradScaler() + // { + // // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method + // Dispose(disposing: false); + // } + + public void Dispose() + { + // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method + Dispose(disposing: true); + GC.SuppressFinalize(this); } } } \ No newline at end of file diff --git a/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs b/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs index 984637336..7829da992 100644 --- a/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs +++ b/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs @@ -11,6 +11,14 @@ internal static partial class NativeMethods [DllImport("LibTorchSharp")] internal static extern void THSAmp_amp_foreach_non_finite_check_and_unscale_(IntPtr tensors, long tLength, IntPtr found_inf, IntPtr inv_scale); [DllImport("LibTorchSharp")] + internal static extern IntPtr THSAmp_amp_update_scale_(IntPtr self, IntPtr growth_tracker, IntPtr found_inf, double scale_growth_factor, double scale_backoff_factor, long growth_interval); + [DllImport("LibTorchSharp")] + internal static extern IntPtr THSAmp_amp_update_scale_out(IntPtr outt,IntPtr self, IntPtr growth_tracker, IntPtr found_inf, double scale_growth_factor, double scale_backoff_factor, long growth_interval); + [DllImport("LibTorchSharp")] + internal static extern IntPtr THSAmp_amp_update_scale_outf(IntPtr self,IntPtr growth_tracker, IntPtr found_inf, double scale_growth_factor, double scale_backoff_factor, long growth_interval, IntPtr outt); + [DllImport("LibTorchSharp")] + internal static extern IntPtr THSAMP_amp_update_scale(IntPtr self,IntPtr growth_tracker, IntPtr found_inf, double scale_growth_factor, double scale_backoff_factor, long growth_interval, out IntPtr sec); + [DllImport("LibTorchSharp")] internal static extern bool THSAmp_is_torch_function_mode_enabled(); [DllImport("LibTorchSharp")] internal static extern bool THSAmp_is_autocast_cache_enabled(); @@ -49,5 +57,6 @@ internal static partial class NativeMethods [DllImport("LibTorchSharp")] internal static extern void THSAmp_clear_autocast_cache(); + } } \ No newline at end of file diff --git a/src/TorchSharp/Tensor/torch.Amp.cs b/src/TorchSharp/Tensor/torch.Amp.cs index dfa4245fd..319afe65c 100644 --- a/src/TorchSharp/Tensor/torch.Amp.cs +++ b/src/TorchSharp/Tensor/torch.Amp.cs @@ -13,5 +13,34 @@ public static void _amp_foreach_non_finite_check_and_unscale_(IList tens IntPtr tens = ts.CreateArray(tensors.Select(x => x.Handle).ToArray()); THSAmp_amp_foreach_non_finite_check_and_unscale_(tens, ts.Array.Length, found_inf.Handle, inv_scale.Handle); } + + public static torch.Tensor amp_update_scale_(Tensor self, Tensor growth_tracker, Tensor found_inf, double scale_growth_factor, double scale_backoff_factor, long growth_interval) + { + var res = THSAmp_amp_update_scale_(self.Handle, growth_tracker.Handle, found_inf.Handle, 
scale_growth_factor, scale_backoff_factor, growth_interval); + if(res == IntPtr.Zero) + torch.CheckForErrors(); + return new Tensor(res); + } + public static torch.Tensor amp_update_scale_out(Tensor outt, Tensor self, Tensor growth_tracker, Tensor found_inf, double scale_growth_factor, double scale_backoff_factor, long growth_interval) + { + var res = THSAmp_amp_update_scale_out(outt.Handle, self.Handle, growth_tracker.Handle, found_inf.Handle, scale_growth_factor, scale_backoff_factor, growth_interval); + if(res == IntPtr.Zero) + torch.CheckForErrors(); + return new Tensor(res); + } + public static torch.Tensor amp_update_scale_outf(Tensor self, Tensor growth_tracker, Tensor found_inf, double scale_growth_factor, double scale_backoff_factor, long growth_interval, Tensor outt) + { + var res = THSAmp_amp_update_scale_outf(self.Handle, growth_tracker.Handle, found_inf.Handle, scale_growth_factor, scale_backoff_factor, growth_interval, outt.Handle); + if(res == IntPtr.Zero) + torch.CheckForErrors(); + return new Tensor(res); + } + public static (torch.Tensor, torch.Tensor) amp_update_scale(Tensor self, Tensor growth_tracker, Tensor found_inf, double scale_growth_factor, double scale_backoff_factor, long growth_interval) + { + var res = THSAMP_amp_update_scale(self.Handle, growth_tracker.Handle, found_inf.Handle, scale_growth_factor, scale_backoff_factor, growth_interval, out var res1); + if(res == IntPtr.Zero || res1 == IntPtr.Zero) + torch.CheckForErrors(); + return (new Tensor(res), new Tensor(res1)); + } } } diff --git a/src/TorchSharp/Utils/UnorderedMap.cs b/src/TorchSharp/Utils/UnorderedMap.cs index f890d7a56..92446906a 100644 --- a/src/TorchSharp/Utils/UnorderedMap.cs +++ b/src/TorchSharp/Utils/UnorderedMap.cs @@ -9,7 +9,8 @@ namespace TorchSharp.Utils public class UnorderedMap : Dictionary, IDisposable { bool disposedValue; - + private TValue default_dict; + //TODO: Add DefautlDict behaviour public UnorderedMap() { } private static bool IsCollectionType(Type type) { @@ -21,6 +22,8 @@ private static bool IsCollectionType(Type type) } public new TValue this[TKey tk] { get { + /*if (!this.ContainsKey(tk) && default_dict == null) + return default_dict;*/ if (this.ContainsKey(tk)) return base[tk]; var t = typeof(TValue); @@ -38,6 +41,11 @@ private static bool IsCollectionType(Type type) } } + public void SetDefaultDict(TValue def) + { + this.default_dict = def; + } + protected virtual void Dispose(bool disposing) { if (!disposedValue) { From c70b5237b80d68a735ca5effbe79f998b29d9f52 Mon Sep 17 00:00:00 2001 From: Dimitri Date: Tue, 3 Sep 2024 19:54:49 -0300 Subject: [PATCH 23/25] update v2.4.0 --- src/Native/LibTorchSharp/THSAmp.cpp | 76 +++---------------- src/Native/LibTorchSharp/THSAmp.h | 22 +----- src/TorchSharp/Amp/AutocastMode.cs | 40 ++++------ .../PInvoke/LibTorchSharp.THSAmp.cs | 24 +----- src/TorchSharp/Tensor/torch.Autocast.cs | 59 +++----------- 5 files changed, 42 insertions(+), 179 deletions(-) diff --git a/src/Native/LibTorchSharp/THSAmp.cpp b/src/Native/LibTorchSharp/THSAmp.cpp index 0b4f29cb8..c1fa3cd9e 100644 --- a/src/Native/LibTorchSharp/THSAmp.cpp +++ b/src/Native/LibTorchSharp/THSAmp.cpp @@ -44,60 +44,25 @@ bool THSAmp_is_autocast_cache_enabled() return at::autocast::is_autocast_cache_enabled(); } -bool THSAmp_is_autocast_cpu_enabled() +bool THSAmp_is_autocast_enabled(int8_t device) { - return at::autocast::is_cpu_enabled(); //https://github.com/pytorch/pytorch/blob/2c91e13afc6edcfe0a0e6189a88aae4ecbbf3516/torch/csrc/autograd/init.cpp#L523 + return 
at::autocast::is_autocast_enabled((at::DeviceType)device); } -bool THSAmp_is_autocast_gpu_enabled() +int8_t THSAmp_get_autocast_dtype(int8_t device) { - return at::autocast::is_enabled(); //https://github.com/pytorch/pytorch/blob/2c91e13afc6edcfe0a0e6189a88aae4ecbbf3516/torch/amp/autocast_mode.py#L363 + return (int8_t)at::autocast::get_autocast_dtype((at::DeviceType)device); } -bool THSAmp_is_autocast_xpu_enabled() -{ - return at::autocast::is_xpu_enabled(); -} -bool THSAmp_is_autocast_hpu_enabled() -{ - return at::autocast::is_hpu_enabled(); -} - -#if (TORCH_VERSION_MAJOR ==2 && TORCH_VERSION_MINOR > 0) -bool THSAmp_is_autocast_ipu_enabled() -{ - return at::autocast::is_ipu_enabled(); -} - -bool THSAmp_is_autocast_xla_enabled() -{ - return at::autocast::is_xla_enabled(); -} - -#endif -int8_t THSAmp_get_autocast_cpu_dtype() +void THSAmp_set_autocast_dtype(int8_t device, int8_t dtype) { - return (int8_t)at::autocast::get_autocast_cpu_dtype(); + at::autocast::set_autocast_dtype((at::DeviceType)device, (at::ScalarType)dtype); } -int8_t THSAmp_get_autocast_gpu_dtype() +void THSAmp_set_autocast_enabled(int8_t device, bool enabled) { - //TODO: Implement AUTOCAST AMP AND GRADSCALER - - //INFO: Enter/Exit function of autocast_mode not need to do in C/C++ only in C# with Disposable can handle all of that function (if exists) - //https://github.com/pytorch/pytorch/blob/main/torch/amp/autocast_mode.py - - //https://github.com/pytorch/pytorch/blob/2c91e13afc6edcfe0a0e6189a88aae4ecbbf3516/torch/csrc/autograd/init.cpp#L629 - //https://github.com/pytorch/pytorch/blob/2c91e13afc6edcfe0a0e6189a88aae4ecbbf3516/aten/src/ATen/autocast_mode.h#L20 - return (int8_t)at::autocast::get_autocast_gpu_dtype(); + at::autocast::set_autocast_enabled((at::DeviceType)device, enabled); } - -int8_t THSAmp_get_autocast_xpu_dtype() -{ - return (int8_t)at::autocast::get_autocast_xpu_dtype(); -} - - int THSAmp_autocast_increment_nesting() { return at::autocast::increment_nesting(); @@ -108,32 +73,11 @@ int THSAmp_autocast_decrement_nesting() return at::autocast::decrement_nesting(); } -void THSAmp_set_autocast_enabled(bool enabled) +void THSAmp_clear_autocast_cache() { - at::autocast::set_enabled(enabled); + at::autocast::clear_cache(); } - void THSAmp_set_autocast_cache_enabled(bool enabled) { at::autocast::set_autocast_cache_enabled(enabled); -} - -void THSAmp_set_autocast_cpu_dtype(int8_t dtype) -{ - at::autocast::set_autocast_cpu_dtype((c10::ScalarType)dtype); -} - -void THSAmp_set_autocast_gpu_dtype(int8_t dtype) -{ - at::autocast::set_autocast_gpu_dtype((c10::ScalarType)dtype); -} - -void THSAmp_set_autocast_xpu_dtype(int8_t dtype) -{ - at::autocast::set_autocast_xpu_dtype((c10::ScalarType)dtype); -} - -void THSAmp_clear_autocast_cache() -{ - at::autocast::clear_cache(); } \ No newline at end of file diff --git a/src/Native/LibTorchSharp/THSAmp.h b/src/Native/LibTorchSharp/THSAmp.h index 3a0718db4..23d56fb2c 100644 --- a/src/Native/LibTorchSharp/THSAmp.h +++ b/src/Native/LibTorchSharp/THSAmp.h @@ -18,31 +18,17 @@ EXPORT_API(Tensor) THSAMP_amp_update_scale(const at::Tensor& self, const at::Ten EXPORT_API(bool) THSAmp_is_torch_function_mode_enabled(); -//Maybe the best work is call THSTorch_is_autocast_enabled(enum of devices c# as int8_t); EXPORT_API(bool) THSAmp_is_autocast_cache_enabled(); -EXPORT_API(bool) THSAmp_is_autocast_cpu_enabled(); -EXPORT_API(bool) THSAmp_is_autocast_gpu_enabled(); -EXPORT_API(bool) THSAmp_is_autocast_xpu_enabled(); -EXPORT_API(bool) THSAmp_is_autocast_hpu_enabled(); -#if (TORCH_VERSION_MAJOR 
==2 && TORCH_VERSION_MINOR > 0) -EXPORT_API(bool) THSAmp_is_autocast_ipu_enabled(); -EXPORT_API(bool) THSAmp_is_autocast_xla_enabled(); -#endif - -EXPORT_API(int8_t) THSAmp_get_autocast_cpu_dtype(); -EXPORT_API(int8_t) THSAmp_get_autocast_gpu_dtype(); -EXPORT_API(int8_t) THSAmp_get_autocast_xpu_dtype(); +EXPORT_API(bool) THSAmp_is_autocast_enabled(int8_t device); +EXPORT_API(int8_t) THSAmp_get_autocast_dtype(int8_t device); +EXPORT_API(void) THSAmp_set_autocast_enabled(int8_t device, bool enabled); +EXPORT_API(void) THSAmp_set_autocast_dtype(int8_t device, int8_t dtype); EXPORT_API(int) THSAmp_autocast_increment_nesting(); EXPORT_API(int) THSAmp_autocast_decrement_nesting(); -EXPORT_API(void) THSAmp_set_autocast_enabled(bool enabled); EXPORT_API(void) THSAmp_set_autocast_cache_enabled(bool enabled); -EXPORT_API(void) THSAmp_set_autocast_cpu_dtype(int8_t dtype); -EXPORT_API(void) THSAmp_set_autocast_gpu_dtype(int8_t dtype); -EXPORT_API(void) THSAmp_set_autocast_xpu_dtype(int8_t dtype); - EXPORT_API(void) THSAmp_clear_autocast_cache(); //EXPORT_API(bool) THSTorch_jit_is_scripting(); \ No newline at end of file diff --git a/src/TorchSharp/Amp/AutocastMode.cs b/src/TorchSharp/Amp/AutocastMode.cs index 63821e64f..fa7512bb5 100644 --- a/src/TorchSharp/Amp/AutocastMode.cs +++ b/src/TorchSharp/Amp/AutocastMode.cs @@ -39,21 +39,23 @@ public static AutocastMode GetInstance() public torch.ScalarType GetFastType() { - var ft = torch.ScalarType.Float32; + return torch.get_autocast_dtype(Device.type); + /*var ft = torch.ScalarType.Float32; if (Device.type == DeviceType.CUDA) ft = torch.get_autocast_gpu_dtype(); if (Device.type == DeviceType.CPU) ft = torch.get_autocast_cpu_dtype(); - return ft; + return ft;*/ } private AutocastMode(torch.Device dev, torch.ScalarType? dtype = null, bool enabled=true, bool? cache_enabled = null) { //var la = torch.tensor(9); fast_dtype = dtype ?? torch.ScalarType.Float32; - if (dev.type == DeviceType.CUDA) - fast_dtype = torch.get_autocast_gpu_dtype(); + fast_dtype = torch.get_autocast_dtype(dev.type); + /*if (dev.type == DeviceType.CUDA) + fast_dtype = torch.get_autocast_dtype(dev); if (dev.type == DeviceType.CPU) - fast_dtype = torch.get_autocast_cpu_dtype(); + fast_dtype = torch.get_autocast_cpu_dtype();*/ //IntPtr ptr = IntPtr.Zero; bool _cache_enabled = torch.is_autocast_cache_enabled(); @@ -74,11 +76,10 @@ private AutocastMode(torch.Device dev, torch.ScalarType? 
dtype = null, bool enab this.Enabled = enabled; - this.Prev = torch.is_autocast_cpu_enabled(); + this.Prev = torch.is_autocast_enabled(DeviceType.CPU); if (dev.type == DeviceType.CUDA) { - this.Prev = torch.is_autocast_gpu_enabled(); + this.Prev = torch.is_autocast_enabled(dev.type); } - torch.set_autocast_cache_enabled(_cache_enabled); torch.set_autocast_enabled(this.Enabled); //throw new NotImplementedException(); @@ -99,23 +100,12 @@ internal torch.Tensor CastTensor(torch.Tensor tensor) private void Dispose(bool disposing) { this.Enabled = false; - if (Device.type == DeviceType.CUDA) { - if (torch.autocast_decrement_nesting() == 0) - torch.clear_autocast_cache(); - torch.set_autocast_gpu_dtype(this.fast_dtype); - //torch.set_autocast_enabled(this.Prev); - torch.set_autocast_enabled(false); - torch.set_autocast_cache_enabled(false); - } - - if (Device.type == DeviceType.CPU) { - if (torch.autocast_decrement_nesting() == 0) - torch.clear_autocast_cache(); - //torch.set_autocast_enabled(this.Prev); - torch.set_autocast_cpu_dtype(this.fast_dtype); - torch.set_autocast_enabled(false); - torch.set_autocast_cache_enabled(false); - } + if (torch.autocast_decrement_nesting() == 0) + torch.clear_autocast_cache(); + //torch.set_autocast_enabled(this.Prev); + torch.set_autocast_cache_enabled(Device.type, this.fast_dtype); + torch.set_autocast_enabled(false); + torch.set_autocast_cache_enabled(false); } public void Dispose() diff --git a/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs b/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs index 7829da992..a91d4816a 100644 --- a/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs +++ b/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs @@ -23,23 +23,9 @@ internal static partial class NativeMethods [DllImport("LibTorchSharp")] internal static extern bool THSAmp_is_autocast_cache_enabled(); [DllImport("LibTorchSharp")] - internal static extern bool THSAmp_is_autocast_cpu_enabled(); + internal static extern bool THSAmp_is_autocast_enabled(int device_type); [DllImport("LibTorchSharp")] - internal static extern bool THSAmp_is_autocast_gpu_enabled(); - [DllImport("LibTorchSharp")] - internal static extern bool THSAmp_is_autocast_xpu_enabled(); - [DllImport("LibTorchSharp")] - internal static extern bool THSAmp_is_autocast_hpu_enabled(); - [DllImport("LibTorchSharp")] - internal static extern bool THSAmp_is_autocast_ipu_enabled(); - [DllImport("LibTorchSharp")] - internal static extern bool THSAmp_is_autocast_xla_enabled(); - [DllImport("LibTorchSharp")] - internal static extern sbyte THSAmp_get_autocast_cpu_dtype(); - [DllImport("LibTorchSharp")] - internal static extern sbyte THSAmp_get_autocast_gpu_dtype(); - [DllImport("LibTorchSharp")] - internal static extern sbyte THSAmp_get_autocast_xpu_dtype(); + internal static extern sbyte THSAmp_get_autocast_dtype(int device_type); [DllImport("LibTorchSharp")] internal static extern int THSAmp_autocast_increment_nesting(); [DllImport("LibTorchSharp")] @@ -49,11 +35,7 @@ internal static partial class NativeMethods [DllImport("LibTorchSharp")] internal static extern void THSAmp_set_autocast_cache_enabled(bool enabled); [DllImport("LibTorchSharp")] - internal static extern void THSAmp_set_autocast_cpu_dtype(sbyte dtype); - [DllImport("LibTorchSharp")] - internal static extern void THSAmp_set_autocast_gpu_dtype(sbyte dtype); - [DllImport("LibTorchSharp")] - internal static extern void THSAmp_set_autocast_xpu_dtype(sbyte dtype); + internal static extern void THSAmp_set_autocast_dtype(int device_type, sbyte dtype); 
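A usage sketch of the consolidated surface (not part of the patch): after this commit, one device-parameterized call replaces the per-device cpu/gpu/xpu/hpu variants. The managed wrappers shown in `torch.Autocast.cs` below forward the `DeviceType` straight to these P/Invokes.

```csharp
// Standalone sketch: querying autocast state through the device-generic wrappers.
using TorchSharp;
using static TorchSharp.torch;

class AutocastQuery
{
    static void Main()
    {
        var dev = cuda_is_available() ? DeviceType.CUDA : DeviceType.CPU;

        // Both wrappers map directly onto THSAmp_is_autocast_enabled / THSAmp_get_autocast_dtype.
        System.Console.WriteLine($"autocast enabled on {dev}: {is_autocast_enabled(dev)}");
        System.Console.WriteLine($"autocast fast dtype on {dev}: {get_autocast_dtype(dev)}");
    }
}
```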
[DllImport("LibTorchSharp")] internal static extern void THSAmp_clear_autocast_cache(); diff --git a/src/TorchSharp/Tensor/torch.Autocast.cs b/src/TorchSharp/Tensor/torch.Autocast.cs index e295c8e62..d817e4ab9 100644 --- a/src/TorchSharp/Tensor/torch.Autocast.cs +++ b/src/TorchSharp/Tensor/torch.Autocast.cs @@ -10,52 +10,22 @@ public static bool is_autocast_cache_enabled() return THSAmp_is_autocast_cache_enabled(); } - public static bool is_autocast_enabled(Device device) + public static bool is_autocast_enabled(DeviceType device) { - if(device.type == DeviceType.CPU) - return THSAmp_is_autocast_cpu_enabled(); - if(device.type == DeviceType.CUDA) - return THSAmp_is_autocast_gpu_enabled(); - return THSAmp_is_autocast_cache_enabled(); - } - public static bool is_autocast_cpu_enabled() - { - return THSAmp_is_autocast_cpu_enabled(); + return THSAmp_is_autocast_enabled((int)device); + //return THSAmp_is_autocast_cache_enabled(); } - public static bool is_autocast_gpu_enabled() + public static ScalarType get_autocast_dtype(DeviceType device) { - return THSAmp_is_autocast_gpu_enabled(); - } - public static bool is_autocast_xpu_enabled() - { - return THSAmp_is_autocast_xpu_enabled(); - } - public static bool is_autocast_hpu_enabled() - { - return THSAmp_is_autocast_hpu_enabled(); - } - - public static ScalarType get_autocast_dtype(Device device) - { - if (device.type == DeviceType.CPU) + return (ScalarType)THSAmp_get_autocast_dtype((int)device); + /*if (device.type == DeviceType.CPU) return get_autocast_cpu_dtype(); if (device.type == DeviceType.CUDA) return get_autocast_gpu_dtype(); - return ScalarType.Float32; - } - public static ScalarType get_autocast_cpu_dtype() - { - return (ScalarType)THSAmp_get_autocast_cpu_dtype(); - } - public static ScalarType get_autocast_gpu_dtype() - { - return (ScalarType)THSAmp_get_autocast_gpu_dtype(); - } - public static ScalarType get_autocast_xpu_dtype() - { - return (ScalarType)THSAmp_get_autocast_xpu_dtype(); + return ScalarType.Float32;*/ } + public static int autocast_increment_nesting() { return THSAmp_autocast_increment_nesting(); @@ -74,18 +44,9 @@ public static void set_autocast_cache_enabled(bool enabled) { THSAmp_set_autocast_cache_enabled(enabled); } - - public static void set_autocast_cpu_dtype(ScalarType dtype) - { - THSAmp_set_autocast_cpu_dtype((sbyte)dtype); - } - public static void set_autocast_gpu_dtype(ScalarType dtype) - { - THSAmp_set_autocast_gpu_dtype((sbyte)dtype); - } - public static void set_autocast_xpu_dtype(ScalarType dtype) + public static void set_autocast_cache_enabled(DeviceType device, ScalarType dtype) { - THSAmp_set_autocast_xpu_dtype((sbyte)dtype); + THSAmp_set_autocast_dtype((int)device, (sbyte)dtype); } public static void clear_autocast_cache() From 36b79b9f30a03db72e620edf65ea1756a8e6266d Mon Sep 17 00:00:00 2001 From: Dimitri Date: Wed, 4 Sep 2024 21:07:30 -0300 Subject: [PATCH 24/25] some advance --- src/TorchSharp/Amp/AMPManager.cs | 33 ++++++++++++++++++++-------- src/TorchSharp/Amp/AutocastMode.cs | 35 +++++++++++++++--------------- src/TorchSharp/Amp/GradScaler.cs | 8 ++++++- 3 files changed, 48 insertions(+), 28 deletions(-) diff --git a/src/TorchSharp/Amp/AMPManager.cs b/src/TorchSharp/Amp/AMPManager.cs index 0262f8934..9d79d59e7 100644 --- a/src/TorchSharp/Amp/AMPManager.cs +++ b/src/TorchSharp/Amp/AMPManager.cs @@ -16,7 +16,7 @@ public class TensorConverter public IntPtr PrevHandle; public IntPtr Handle; public torch.ScalarType Dtype; - public torch.ScalarType FastDtype; + public torch.ScalarType FastDtype = 
torch.ScalarType.Float32; public TensorCalledIn Called, Status; public enum TensorCalledIn { @@ -44,15 +44,26 @@ public TensorConverter(IntPtr handle) public bool IsDisposed = false; /*public UnorderedMap TensorPtrs= new UnorderedMap(); public UnorderedMap TensorMap= new UnorderedMap();*/ - private readonly AutocastMode autocastMode = AutocastMode.GetInstance(); + private AutocastMode autocastMode=null; + public bool IsEnabled { + get { + if (autocastMode == null) + return false; + return autocastMode.Enabled; + } + } - private AMPManager() { } + private AMPManager(bool enabled) + { + if (!torch.cuda_is_available()) + return; + autocastMode = AutocastMode.GetInstance(enabled); + } - public bool IsEnabled => autocastMode.Enabled; private static AMPManager Instance; - public static AMPManager GetInstance() + public static AMPManager GetInstance(bool enabled = false) { - return Instance ??= new AMPManager(); + return Instance ??= new AMPManager(enabled); } private torch.ScalarType GetType(IntPtr handle) @@ -67,7 +78,8 @@ public IntPtr AutoCast(IntPtr handle) public torch.Tensor AutoCast(torch.Tensor tensor) { - return tensor.to(AutocastMode.GetInstance().GetFastType()); + return new torch.Tensor(AutoCast(tensor.Handle)); + //return tensor.to(AutocastMode.GetInstance().GetFastType()); } public static IntPtr To(IntPtr ptr, torch.ScalarType type) { @@ -154,8 +166,11 @@ public IntPtr Work(IntPtr handle, IntPtr prev) public IDisposable Enter() { + if (!torch.cuda_is_available()) + return this; IsEnter = true; IsDisposed = false; + autocastMode.SetEnabled(true, torch.CUDA); Debug.WriteLine($"{nameof(AMPManager)} Enter call"); return this; } @@ -184,10 +199,10 @@ protected virtual void Dispose(bool disposing) } // // TODO: override finalizer only if 'Dispose(bool disposing)' has code to free unmanaged resources - ~AMPManager() + /*~AMPManager() { Dispose(false); - } + }*/ public void Dispose() { diff --git a/src/TorchSharp/Amp/AutocastMode.cs b/src/TorchSharp/Amp/AutocastMode.cs index fa7512bb5..808df715b 100644 --- a/src/TorchSharp/Amp/AutocastMode.cs +++ b/src/TorchSharp/Amp/AutocastMode.cs @@ -32,43 +32,39 @@ public sealed class AutocastMode : IDisposable instance = new AutocastMode(dev, dtype, enabled, cache_enabled); return instance; }*/ - public static AutocastMode GetInstance() + public static AutocastMode GetInstance(bool enabled=false) { - return instance ??= new AutocastMode(torch.CUDA, cache_enabled:true); + return instance ??= new AutocastMode(torch.cuda_is_available() ? torch.CUDA : torch.CPU, enabled:enabled,cache_enabled:true); } public torch.ScalarType GetFastType() { return torch.get_autocast_dtype(Device.type); - /*var ft = torch.ScalarType.Float32; - if (Device.type == DeviceType.CUDA) - ft = torch.get_autocast_gpu_dtype(); - if (Device.type == DeviceType.CPU) - ft = torch.get_autocast_cpu_dtype(); - return ft;*/ } private AutocastMode(torch.Device dev, torch.ScalarType? dtype = null, bool enabled=true, bool? cache_enabled = null) + { + if (!torch.cuda_is_available()) + return; + Process(dev, dtype, enabled, cache_enabled); + } + + private void Process(torch.Device dev, torch.ScalarType? dtype=null, bool enabled=true, bool? cache_enabled=null) { //var la = torch.tensor(9); fast_dtype = dtype ?? 
diff --git a/src/TorchSharp/Amp/AutocastMode.cs b/src/TorchSharp/Amp/AutocastMode.cs
index fa7512bb5..808df715b 100644
--- a/src/TorchSharp/Amp/AutocastMode.cs
+++ b/src/TorchSharp/Amp/AutocastMode.cs
@@ -32,43 +32,39 @@ public sealed class AutocastMode : IDisposable
         instance = new AutocastMode(dev, dtype, enabled, cache_enabled);
         return instance;
         }*/
-        public static AutocastMode GetInstance()
+        public static AutocastMode GetInstance(bool enabled = false)
         {
-            return instance ??= new AutocastMode(torch.CUDA, cache_enabled: true);
+            return instance ??= new AutocastMode(torch.cuda_is_available() ? torch.CUDA : torch.CPU, enabled: enabled, cache_enabled: true);
         }

         public torch.ScalarType GetFastType()
         {
             return torch.get_autocast_dtype(Device.type);
-            /*var ft = torch.ScalarType.Float32;
-            if (Device.type == DeviceType.CUDA)
-                ft = torch.get_autocast_gpu_dtype();
-            if (Device.type == DeviceType.CPU)
-                ft = torch.get_autocast_cpu_dtype();
-            return ft;*/
         }
         private AutocastMode(torch.Device dev, torch.ScalarType? dtype = null, bool enabled = true, bool? cache_enabled = null)
+        {
+            if (!torch.cuda_is_available())
+                return;
+            Process(dev, dtype, enabled, cache_enabled);
+        }
+
+        private void Process(torch.Device dev, torch.ScalarType? dtype = null, bool enabled = true, bool? cache_enabled = null)
         {
             //var la = torch.tensor(9);
             fast_dtype = dtype ?? torch.ScalarType.Float32;
             fast_dtype = torch.get_autocast_dtype(dev.type);
-            /*if (dev.type == DeviceType.CUDA)
-                fast_dtype = torch.get_autocast_dtype(dev);
-            if (dev.type == DeviceType.CPU)
-                fast_dtype = torch.get_autocast_cpu_dtype();*/
             //IntPtr ptr = IntPtr.Zero;
-
+
             bool _cache_enabled = torch.is_autocast_cache_enabled();
             if (!torch.cuda.is_available() && dev.type == DeviceType.CUDA) //Autocast is not available when CUDA is unavailable
                 Enabled = false;
             if (dtype.HasValue)
                 fast_dtype = dtype.Value;
-            if(cache_enabled.HasValue)
-                _cache_enabled=cache_enabled.Value;
+            if (cache_enabled.HasValue)
+                _cache_enabled = cache_enabled.Value;
             if (dev.type == DeviceType.CPU) {
-
-            }
-            else if (dev.type == DeviceType.CUDA) {
+            } else if (dev.type == DeviceType.CUDA) {
                 if (enabled && fast_dtype == torch.ScalarType.BFloat16 && !torch.cuda.is_bf16_supported())
                     throw new Exception("Current CUDA Device does not support bfloat16. Please switch dtype to float16.");
@@ -82,7 +78,6 @@ private AutocastMode(torch.Device dev, torch.ScalarType? dtype = null, bool enab
             }
             torch.set_autocast_cache_enabled(_cache_enabled);
             torch.set_autocast_enabled(this.Enabled);
-            //throw new NotImplementedException();
         }

         /*internal void Cast(torch.Tensor tensor)
@@ -97,6 +92,10 @@ internal torch.Tensor CastTensor(torch.Tensor tensor)
             return tensor.to(fast_dtype, tensor.device);
         }

+        internal void SetEnabled(bool enabled, torch.Device dev)
+        {
+            Process(dev, null, enabled, true);
+        }
         private void Dispose(bool disposing)
         {
             this.Enabled = false;
diff --git a/src/TorchSharp/Amp/GradScaler.cs b/src/TorchSharp/Amp/GradScaler.cs
index b2cbd3988..f9070f3c2 100644
--- a/src/TorchSharp/Amp/GradScaler.cs
+++ b/src/TorchSharp/Amp/GradScaler.cs
@@ -201,7 +201,13 @@ public void unscale(torch.optim.Optimizer optimizer)
         private float? maybe_opt_step(torch.optim.Optimizer optimizer, UnorderedMap<string, object> optimizer_state)
         {
             //https://github.com/pytorch/pytorch/blob/a00fad017719346bac6e08da0819358146e647e3/torch/amp/grad_scaler.py#L351
-            throw new NotImplementedException();
+            float? retval = 0;
+            foreach (var d in optimizer_state)
+                if (d.Value is torch.Tensor t)
+                    retval += t.item<float>();
+            if (retval == 0)
+                retval = optimizer.step().item<float>();
+            return retval;
         }

         public float? 
step(torch.optim.Optimizer optimizer, params object[] obj) From 376f4fbb4af0a028d1d541b0533b966f5120ec7c Mon Sep 17 00:00:00 2001 From: Dimitri Date: Sun, 8 Sep 2024 09:13:19 -0300 Subject: [PATCH 25/25] Improve autocastmode --- src/Native/LibTorchSharp/THSAmp.cpp | 6 + src/Native/LibTorchSharp/THSAmp.h | 2 + src/TorchSharp/Amp/AMPManager.cs | 2 +- src/TorchSharp/Amp/AutocastMode.cs | 148 ++++++++++++------ src/TorchSharp/LinearAlgebra.cs | 5 +- src/TorchSharp/NN/Convolution/Conv1D.cs | 3 +- src/TorchSharp/NN/Convolution/Conv2D.cs | 3 +- src/TorchSharp/NN/Convolution/Conv3D.cs | 3 +- .../NN/Convolution/ConvTranspose1D.cs | 3 +- .../NN/Convolution/ConvTranspose2D.cs | 3 +- .../NN/Convolution/ConvTranspose3D.cs | 3 +- src/TorchSharp/NN/Linear.cs | 3 +- src/TorchSharp/NN/Recurrent/GRUCell.cs | 3 +- src/TorchSharp/NN/Recurrent/LSTMCell.cs | 3 +- src/TorchSharp/NN/Recurrent/RNNCell.cs | 3 +- .../PInvoke/LibTorchSharp.THSAmp.cs | 4 +- src/TorchSharp/Tensor/Tensor.LinearAlgebra.cs | 7 +- src/TorchSharp/Tensor/Tensor.Math.cs | 6 +- src/TorchSharp/Tensor/Tensor.Trig.cs | 3 + src/TorchSharp/Tensor/Tensor.cs | 14 +- src/TorchSharp/Tensor/torch.Autocast.cs | 19 ++- src/TorchSharp/TorchSharp.csproj | 4 + src/TorchSharp/Utils/UnorderedMap.cs | 59 +++++++ 23 files changed, 222 insertions(+), 87 deletions(-) diff --git a/src/Native/LibTorchSharp/THSAmp.cpp b/src/Native/LibTorchSharp/THSAmp.cpp index c1fa3cd9e..79c6da9f2 100644 --- a/src/Native/LibTorchSharp/THSAmp.cpp +++ b/src/Native/LibTorchSharp/THSAmp.cpp @@ -44,6 +44,12 @@ bool THSAmp_is_autocast_cache_enabled() return at::autocast::is_autocast_cache_enabled(); } +bool THSAmp_is_autocast_available(int8_t device) +{ + return at::autocast::is_autocast_available((c10::DeviceType)device); +} + + bool THSAmp_is_autocast_enabled(int8_t device) { return at::autocast::is_autocast_enabled((at::DeviceType)device); diff --git a/src/Native/LibTorchSharp/THSAmp.h b/src/Native/LibTorchSharp/THSAmp.h index 23d56fb2c..4ae115dda 100644 --- a/src/Native/LibTorchSharp/THSAmp.h +++ b/src/Native/LibTorchSharp/THSAmp.h @@ -20,6 +20,8 @@ EXPORT_API(bool) THSAmp_is_torch_function_mode_enabled(); EXPORT_API(bool) THSAmp_is_autocast_cache_enabled(); +EXPORT_API(bool) THSAmp_is_autocast_available(int8_t device); + EXPORT_API(bool) THSAmp_is_autocast_enabled(int8_t device); EXPORT_API(int8_t) THSAmp_get_autocast_dtype(int8_t device); EXPORT_API(void) THSAmp_set_autocast_enabled(int8_t device, bool enabled); diff --git a/src/TorchSharp/Amp/AMPManager.cs b/src/TorchSharp/Amp/AMPManager.cs index 9d79d59e7..c5a120b03 100644 --- a/src/TorchSharp/Amp/AMPManager.cs +++ b/src/TorchSharp/Amp/AMPManager.cs @@ -49,7 +49,7 @@ public bool IsEnabled { get { if (autocastMode == null) return false; - return autocastMode.Enabled; + return autocastMode.IsEnabled; } } diff --git a/src/TorchSharp/Amp/AutocastMode.cs b/src/TorchSharp/Amp/AutocastMode.cs index 808df715b..dacfc9721 100644 --- a/src/TorchSharp/Amp/AutocastMode.cs +++ b/src/TorchSharp/Amp/AutocastMode.cs @@ -1,9 +1,13 @@ using System; using System.Collections.Generic; +using System.Diagnostics; using System.Linq; +using System.Runtime.CompilerServices; using System.Security.Cryptography; using System.Text; using System.Threading.Tasks; +using TorchSharp.PInvoke; +using TorchSharp.Utils; namespace TorchSharp.Amp { @@ -17,21 +21,17 @@ public static torch.Tensor AutoCast(this torch.Tensor input) //TODO: Should make Singleton and IDisposable on ENTER public sealed class AutocastMode : IDisposable { - //NEED "Register" all tensor in scope 
for uncasting outer-scope
-        public bool Enabled = false;
-        internal bool Prev;
-        //private torch.ScalarType Dtype = torch.ScalarType.Float32;
+        public bool _enabled = false;
+        public bool IsEnter = false;
+        public bool IsDisposed = false;
+        private bool prev_cache_enabled, prev;
+        private torch.ScalarType prev_fastdtype;
+        //internal bool Prev;
+        private bool _cache_enabled = false;
         internal torch.ScalarType fast_dtype = torch.ScalarType.Float32;
-        public torch.Device Device = new torch.Device(DeviceType.CUDA);
+        internal torch.ScalarType? dtype = torch.ScalarType.Float32;
+        public DeviceType device = DeviceType.CUDA;
         private static AutocastMode instance;
-        //bool disposedValue;
-
-        /*public static AutocastMode GetInstance(torch.Device dev, torch.ScalarType? dtype = null, bool enabled = true, bool? cache_enabled = null)
-        {
-            if (instance == null)
-                instance = new AutocastMode(dev, dtype, enabled, cache_enabled);
-            return instance;
-        }*/
         public static AutocastMode GetInstance(bool enabled = false)
         {
             return instance ??= new AutocastMode(torch.cuda_is_available() ? torch.CUDA : torch.CPU, enabled: enabled, cache_enabled: true);
         }
@@ -39,72 +39,118 @@ public static AutocastMode GetInstance(bool enabled = false)

         public torch.ScalarType GetFastType()
         {
-            return torch.get_autocast_dtype(Device.type);
+            return torch.get_autocast_dtype(device);
         }
         private AutocastMode(torch.Device dev, torch.ScalarType? dtype = null, bool enabled = true, bool? cache_enabled = null)
         {
-            if (!torch.cuda_is_available())
-                return;
-            Process(dev, dtype, enabled, cache_enabled);
-        }
-
-        private void Process(torch.Device dev, torch.ScalarType? dtype = null, bool enabled = true, bool? cache_enabled = null)
-        {
-            //var la = torch.tensor(9);
-            fast_dtype = dtype ?? torch.ScalarType.Float32;
-            fast_dtype = torch.get_autocast_dtype(dev.type);
+            //https://pytorch.org/docs/stable/amp.html#cuda-ops-that-can-autocast-to-float16
+            if (dtype == null)
+                dtype = torch.get_autocast_dtype(dev.type);
+            this.device = dev.type;
+            if (!torch.is_autocast_available(device))
+                throw new Exception($"User specified an unsupported autocast device_type {device}");
+            fast_dtype = torch.get_autocast_dtype(device);
             //IntPtr ptr = IntPtr.Zero;

-            bool _cache_enabled = torch.is_autocast_cache_enabled();
-            if (!torch.cuda.is_available() && dev.type == DeviceType.CUDA) //Autocast is not available when CUDA is unavailable
-                Enabled = false;
-            if (dtype.HasValue)
+            _cache_enabled = torch.is_autocast_cache_enabled();
+            if (enabled && !torch.cuda_is_available() && dev.type == DeviceType.CUDA) //Autocast is not available when CUDA is unavailable
+                enabled = false;
+            if (this.dtype.HasValue)
                 fast_dtype = dtype.Value;
             if (cache_enabled.HasValue)
                 _cache_enabled = cache_enabled.Value;
-            if (dev.type == DeviceType.CPU) {
+            if (dev.type == DeviceType.CPU) {
+                if (fast_dtype != torch.ScalarType.Float16 && fast_dtype != torch.ScalarType.BFloat16) {
+                    Debug.WriteLine($"In CPU autocast, but the target dtype is not supported. Disabling autocast. CPU autocast only supports dtype of {torch.ScalarType.Float16} or {torch.ScalarType.BFloat16}.");
+                    enabled = false;
+                }
             } else if (dev.type == DeviceType.CUDA) {
                 if (enabled && fast_dtype == torch.ScalarType.BFloat16 && !torch.cuda.is_bf16_supported())
                     throw new Exception("Current CUDA Device does not support bfloat16. Please switch dtype to float16.");
             }
+            this._enabled = enabled;
+        }
+        private torch.ScalarType GetType(IntPtr handle)
+        {
+            return (torch.ScalarType)NativeMethods.THSTensor_type(handle);
+        }

-            this.Enabled = enabled;
-
-            this.Prev = torch.is_autocast_enabled(DeviceType.CPU);
-            if (dev.type == DeviceType.CUDA) {
-                this.Prev = torch.is_autocast_enabled(dev.type);
-            }
-            torch.set_autocast_cache_enabled(_cache_enabled);
-            torch.set_autocast_enabled(this.Enabled);
+        public static IntPtr AutoCast(IntPtr handle)
+        {
+            return ToIf(handle, GetInstance().GetFastType());
+        }
+        public static IntPtr AutoCast(IntPtr handle, torch.ScalarType dtype)
+        {
+            return ToIf(handle, dtype);
         }
-        /*internal void Cast(torch.Tensor tensor)
+
+        public static torch.Tensor AutoCast(torch.Tensor tensor)
         {
-            tensor.to(fast_dtype, tensor.device);
-        }*/
+            return new torch.Tensor(AutoCast(tensor.Handle));
+            //return tensor.to(AutocastMode.GetInstance().GetFastType());
+        }
+        public static IntPtr To(IntPtr ptr, torch.ScalarType type)
+        {
+            Debug.WriteLine($"{nameof(AutocastMode)} Tensor converting from: {(torch.ScalarType)NativeMethods.THSTensor_type(ptr)} to: {type}");
+            var res = NativeMethods.THSTensor_to_type(ptr, (sbyte)type);
+            if (res == IntPtr.Zero)
+                torch.CheckForErrors();
+            return res;
+        }
+        public static IntPtr ToIf(IntPtr ptr, torch.ScalarType type)
+        {
+            if (!GetInstance()._enabled)
+                return ptr;
+            var res = NativeMethods.THSTensor_to_type(ptr, (sbyte)type);
+            if (res == IntPtr.Zero)
+                torch.CheckForErrors();
+            return res;
+        }
+        public static IntPtr ToIf(IntPtr ptr, torch.ScalarType type, DeviceType device_type)
+        {
+            bool is_eligible = (torch.ScalarType)NativeMethods.THSTensor_type(ptr) != torch.ScalarType.Float64 && (DeviceType)NativeMethods.THSTensor_device_type(ptr) == device_type;
+
+            if (!is_eligible || !NativeMethods.THSAmp_is_autocast_enabled(NativeMethods.THSTensor_device_type(ptr)))
+                return ptr;
+            var res = NativeMethods.THSTensor_to_type(ptr, (sbyte)type);
+            if (res == IntPtr.Zero)
+                torch.CheckForErrors();
+            return res;
+        }

-        internal torch.Tensor CastTensor(torch.Tensor tensor)
+        public static bool IsAutocastEnabled(DeviceType device = DeviceType.CUDA)
         {
-            if (!Enabled)
-                return tensor;
-            return tensor.to(fast_dtype, tensor.device);
+            return torch.is_autocast_enabled(!torch.cuda_is_available() ? 
DeviceType.CPU : device); } - internal void SetEnabled(bool enabled, torch.Device dev) + public IDisposable Enter() { - Process(dev, null, enabled, true); + prev_cache_enabled = torch.is_autocast_cache_enabled(); + prev = torch.is_autocast_enabled(device); + prev_fastdtype = torch.get_autocast_dtype(device); + torch.set_autocast_enabled(device, _enabled); + torch.set_autocast_dtype(device, fast_dtype); + torch.autocast_increment_nesting(); + torch.set_autocast_cache_enabled(_cache_enabled); + return this; } + private void Dispose(bool disposing) { - this.Enabled = false; + this._enabled = false; if (torch.autocast_decrement_nesting() == 0) torch.clear_autocast_cache(); - //torch.set_autocast_enabled(this.Prev); - torch.set_autocast_cache_enabled(Device.type, this.fast_dtype); - torch.set_autocast_enabled(false); - torch.set_autocast_cache_enabled(false); + torch.set_autocast_enabled(device, prev); + torch.set_autocast_dtype(device, prev_fastdtype); + torch.set_autocast_cache_enabled(prev_cache_enabled); } public void Dispose() diff --git a/src/TorchSharp/LinearAlgebra.cs b/src/TorchSharp/LinearAlgebra.cs index c9964d536..43d9ed82d 100644 --- a/src/TorchSharp/LinearAlgebra.cs +++ b/src/TorchSharp/LinearAlgebra.cs @@ -2,6 +2,7 @@ using System; using System.Linq; using System.Collections.Generic; +using TorchSharp.Amp; using static TorchSharp.PInvoke.NativeMethods; #nullable enable @@ -440,7 +441,7 @@ public static Tensor multi_dot(IList tensors) throw new ArgumentException(nameof(tensors)); } if (tensors.Count == 1) { - tensors[0] = Amp.AMPManager.GetInstance().AutoCast(tensors[0]); + tensors[0] = AutocastMode.AutoCast(tensors[0]); return tensors[0]; } @@ -449,7 +450,7 @@ public static Tensor multi_dot(IList tensors) var res = THSLinalg_multi_dot(tensorsRef, parray.Array.Length); if (res == IntPtr.Zero) torch.CheckForErrors(); - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } } diff --git a/src/TorchSharp/NN/Convolution/Conv1D.cs b/src/TorchSharp/NN/Convolution/Conv1D.cs index 0064020fd..dd7b4c263 100644 --- a/src/TorchSharp/NN/Convolution/Conv1D.cs +++ b/src/TorchSharp/NN/Convolution/Conv1D.cs @@ -1,5 +1,6 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; +using TorchSharp.Amp; using static TorchSharp.torch; using static TorchSharp.PInvoke.NativeMethods; @@ -194,7 +195,7 @@ public static Tensor conv1d(Tensor input, Tensor weight, Tensor? bias = null, (IntPtr)pdilation, dilationArray.Length, groups); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } } diff --git a/src/TorchSharp/NN/Convolution/Conv2D.cs b/src/TorchSharp/NN/Convolution/Conv2D.cs index 277b695eb..4008b51fa 100644 --- a/src/TorchSharp/NN/Convolution/Conv2D.cs +++ b/src/TorchSharp/NN/Convolution/Conv2D.cs @@ -1,5 +1,6 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; +using TorchSharp.Amp; using static TorchSharp.torch; using static TorchSharp.PInvoke.NativeMethods; @@ -238,7 +239,7 @@ public static Tensor conv2d(Tensor input, Tensor weight, Tensor? 
bias = null, (IntPtr)pdilation, dilation.Length, groups); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } } diff --git a/src/TorchSharp/NN/Convolution/Conv3D.cs b/src/TorchSharp/NN/Convolution/Conv3D.cs index e8a670b7d..ef37aaa6a 100644 --- a/src/TorchSharp/NN/Convolution/Conv3D.cs +++ b/src/TorchSharp/NN/Convolution/Conv3D.cs @@ -1,5 +1,6 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; +using TorchSharp.Amp; using static TorchSharp.torch; using static TorchSharp.PInvoke.NativeMethods; @@ -181,7 +182,7 @@ public static Tensor conv3d(Tensor input, Tensor weight, Tensor? bias = null, (IntPtr)pdilation, dilation.Length, groups); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } } diff --git a/src/TorchSharp/NN/Convolution/ConvTranspose1D.cs b/src/TorchSharp/NN/Convolution/ConvTranspose1D.cs index 954e4ab1b..9700a58b7 100644 --- a/src/TorchSharp/NN/Convolution/ConvTranspose1D.cs +++ b/src/TorchSharp/NN/Convolution/ConvTranspose1D.cs @@ -1,5 +1,6 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; +using TorchSharp.Amp; using static TorchSharp.torch; using static TorchSharp.PInvoke.NativeMethods; @@ -117,7 +118,7 @@ public static Tensor conv_transpose1d(Tensor input, Tensor weight, Tensor? bias (IntPtr)pdilation, dilations.Length, groups); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } } diff --git a/src/TorchSharp/NN/Convolution/ConvTranspose2D.cs b/src/TorchSharp/NN/Convolution/ConvTranspose2D.cs index 8a074dce1..63fc0d6e5 100644 --- a/src/TorchSharp/NN/Convolution/ConvTranspose2D.cs +++ b/src/TorchSharp/NN/Convolution/ConvTranspose2D.cs @@ -1,5 +1,6 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; +using TorchSharp.Amp; using static TorchSharp.torch; using static TorchSharp.PInvoke.NativeMethods; @@ -148,7 +149,7 @@ public static Tensor conv_transpose2d(Tensor input, Tensor weight, Tensor? bias (IntPtr)pdilation, dilation.Length, groups); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } } diff --git a/src/TorchSharp/NN/Convolution/ConvTranspose3D.cs b/src/TorchSharp/NN/Convolution/ConvTranspose3D.cs index 4362a8738..faeb279ad 100644 --- a/src/TorchSharp/NN/Convolution/ConvTranspose3D.cs +++ b/src/TorchSharp/NN/Convolution/ConvTranspose3D.cs @@ -1,5 +1,6 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; +using TorchSharp.Amp; using static TorchSharp.torch; using static TorchSharp.PInvoke.NativeMethods; @@ -144,7 +145,7 @@ public static Tensor conv_transpose3d(Tensor input, Tensor weight, Tensor? 
bias (IntPtr)pdilation, dilation.Length, groups); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } } diff --git a/src/TorchSharp/NN/Linear.cs b/src/TorchSharp/NN/Linear.cs index 675952cef..68b34ffd5 100644 --- a/src/TorchSharp/NN/Linear.cs +++ b/src/TorchSharp/NN/Linear.cs @@ -1,5 +1,6 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; +using TorchSharp.Amp; using static TorchSharp.torch; using static TorchSharp.torch.nn; using static TorchSharp.PInvoke.NativeMethods; @@ -104,7 +105,7 @@ public static Tensor linear(Tensor input, Tensor weights, Tensor? bias = null) IntPtr bPtr = bias?.Handle ?? IntPtr.Zero; var res = THSNN_functional_linear(input.Handle, weights.Handle, bPtr); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } } diff --git a/src/TorchSharp/NN/Recurrent/GRUCell.cs b/src/TorchSharp/NN/Recurrent/GRUCell.cs index 50be405e1..610762542 100644 --- a/src/TorchSharp/NN/Recurrent/GRUCell.cs +++ b/src/TorchSharp/NN/Recurrent/GRUCell.cs @@ -1,5 +1,6 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; +using TorchSharp.Amp; using static TorchSharp.torch; using static TorchSharp.torch.nn; using static TorchSharp.PInvoke.NativeMethods; @@ -106,7 +107,7 @@ public static GRUCell GRUCell(long inputSize, long hiddenSize, bool bias = true, { var res = THSNN_GRUCell_ctor(inputSize, hiddenSize, bias, out var boxedHandle); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); //TODO: Research if this work... + res = AutocastMode.AutoCast(res); return new GRUCell(res, boxedHandle).MoveModule(device, dtype); } } diff --git a/src/TorchSharp/NN/Recurrent/LSTMCell.cs b/src/TorchSharp/NN/Recurrent/LSTMCell.cs index 2449348fb..44f6e5bbc 100644 --- a/src/TorchSharp/NN/Recurrent/LSTMCell.cs +++ b/src/TorchSharp/NN/Recurrent/LSTMCell.cs @@ -1,5 +1,6 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; +using TorchSharp.Amp; using static TorchSharp.torch; using static TorchSharp.torch.nn; using static TorchSharp.PInvoke.NativeMethods; @@ -108,7 +109,7 @@ public static LSTMCell LSTMCell(long inputSize, long hiddenSize, bool bias = tru { var res = THSNN_LSTMCell_ctor(inputSize, hiddenSize, bias, out var boxedHandle); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new LSTMCell(res, boxedHandle).MoveModule(device, dtype); } } diff --git a/src/TorchSharp/NN/Recurrent/RNNCell.cs b/src/TorchSharp/NN/Recurrent/RNNCell.cs index 0557dfe2e..05bf7088b 100644 --- a/src/TorchSharp/NN/Recurrent/RNNCell.cs +++ b/src/TorchSharp/NN/Recurrent/RNNCell.cs @@ -1,5 +1,6 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. 
using System; +using TorchSharp.Amp; using static TorchSharp.torch; using static TorchSharp.torch.nn; using static TorchSharp.PInvoke.NativeMethods; @@ -112,7 +113,7 @@ public static RNNCell RNNCell(long inputSize, long hiddenSize, NonLinearities no { var res = THSNN_RNNCell_ctor(inputSize, hiddenSize, (long)nonLinearity, bias, out var boxedHandle); if (res == IntPtr.Zero) { torch.CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new RNNCell(res, boxedHandle).MoveModule(device, dtype); } } diff --git a/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs b/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs index a91d4816a..cfc9cda91 100644 --- a/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs +++ b/src/TorchSharp/PInvoke/LibTorchSharp.THSAmp.cs @@ -23,6 +23,8 @@ internal static partial class NativeMethods [DllImport("LibTorchSharp")] internal static extern bool THSAmp_is_autocast_cache_enabled(); [DllImport("LibTorchSharp")] + internal static extern bool THSAmp_is_autocast_available(int device_type); + [DllImport("LibTorchSharp")] internal static extern bool THSAmp_is_autocast_enabled(int device_type); [DllImport("LibTorchSharp")] internal static extern sbyte THSAmp_get_autocast_dtype(int device_type); @@ -31,7 +33,7 @@ internal static partial class NativeMethods [DllImport("LibTorchSharp")] internal static extern int THSAmp_autocast_decrement_nesting(); [DllImport("LibTorchSharp")] - internal static extern void THSAmp_set_autocast_enabled(bool enabled); + internal static extern void THSAmp_set_autocast_enabled(int device_type, bool enabled); [DllImport("LibTorchSharp")] internal static extern void THSAmp_set_autocast_cache_enabled(bool enabled); [DllImport("LibTorchSharp")] diff --git a/src/TorchSharp/Tensor/Tensor.LinearAlgebra.cs b/src/TorchSharp/Tensor/Tensor.LinearAlgebra.cs index 9f62cda4a..6289990a4 100644 --- a/src/TorchSharp/Tensor/Tensor.LinearAlgebra.cs +++ b/src/TorchSharp/Tensor/Tensor.LinearAlgebra.cs @@ -1,6 +1,7 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; using System.Linq; +using TorchSharp.Amp; using static TorchSharp.PInvoke.NativeMethods; namespace TorchSharp @@ -171,7 +172,7 @@ public Tensor matmul(Tensor target) { var res = THSTensor_matmul(Handle, target.Handle); if (res == IntPtr.Zero) { CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } @@ -184,7 +185,7 @@ public Tensor mm(Tensor target) { var res = THSTensor_mm(Handle, target.Handle); if (res == IntPtr.Zero) { CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } @@ -197,7 +198,7 @@ public Tensor mv(Tensor target) { var res = THSTensor_mv(Handle, target.Handle); if (res == IntPtr.Zero) { CheckForErrors(); } - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } diff --git a/src/TorchSharp/Tensor/Tensor.Math.cs b/src/TorchSharp/Tensor/Tensor.Math.cs index 4970a9658..32db3a478 100644 --- a/src/TorchSharp/Tensor/Tensor.Math.cs +++ b/src/TorchSharp/Tensor/Tensor.Math.cs @@ -1,6 +1,7 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. 
#nullable enable using System; +using TorchSharp.Amp; using static TorchSharp.PInvoke.NativeMethods; namespace TorchSharp @@ -270,7 +271,7 @@ public Tensor addmm(Tensor mat1, Tensor mat2, float beta = 1, float alpha = 1) var res = THSTensor_addmm(Handle, mat1.Handle, mat2.Handle, beta, alpha); if (res == IntPtr.Zero) CheckForErrors(); - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } @@ -302,7 +303,7 @@ public Tensor addmv(Tensor mat, Tensor vec, float beta = 1.0f, float alpha = 1.0 var res = THSTensor_addmv(Handle, mat.Handle, vec.Handle, beta, alpha); if (res == IntPtr.Zero) CheckForErrors(); - res = Amp.AMPManager.GetInstance().AutoCast(res); + res = AutocastMode.AutoCast(res); return new Tensor(res); } @@ -1387,6 +1388,7 @@ public Tensor pow(Tensor exponent) { var res = THSTensor_pow(Handle, exponent.Handle); if (res == IntPtr.Zero) { CheckForErrors(); } + res = AutocastMode.AutoCast(res, ScalarType.Float32); //https://pytorch.org/docs/stable/amp.html#cuda-ops-that-can-autocast-to-float32 return new Tensor(res); } diff --git a/src/TorchSharp/Tensor/Tensor.Trig.cs b/src/TorchSharp/Tensor/Tensor.Trig.cs index d377e967c..39e8f048b 100644 --- a/src/TorchSharp/Tensor/Tensor.Trig.cs +++ b/src/TorchSharp/Tensor/Tensor.Trig.cs @@ -1,6 +1,7 @@ // Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information. using System; using System.Diagnostics.Contracts; +using TorchSharp.Amp; using static TorchSharp.PInvoke.NativeMethods; namespace TorchSharp @@ -39,6 +40,7 @@ public Tensor asin() var res = THSTensor_asin(Handle); if (res == IntPtr.Zero) CheckForErrors(); + res = AutocastMode.AutoCast(res, ScalarType.Float32); return new Tensor(res); } @@ -70,6 +72,7 @@ public Tensor acos() var res = THSTensor_acos(Handle); if (res == IntPtr.Zero) CheckForErrors(); + res = AutocastMode.AutoCast(res, ScalarType.Float32); return new Tensor(res); } diff --git a/src/TorchSharp/Tensor/Tensor.cs b/src/TorchSharp/Tensor/Tensor.cs index 696e07d13..0fe6eb971 100644 --- a/src/TorchSharp/Tensor/Tensor.cs +++ b/src/TorchSharp/Tensor/Tensor.cs @@ -45,13 +45,7 @@ public partial class Tensor : IDisposable }*/ internal Tensor(IntPtr handle) { - //TODO: Add Autocast/AMP ScopeManager, need improve this.. 1) is not threadsafe and may have big problem while casting and uncasting. - //DANGER: DONT USE THIS ON PRODUCTION - /*if (AMPManager.GetInstance().IsEnabled) { - this.handle = AMPManager.GetInstance().Work(handle, this.handle); //MMM.... This is the more abstract of any method Tensor right???? 
-            } else {*/
-            this.handle = handle;
-            //}
+            this.handle = handle;
             System.Threading.Interlocked.Increment(ref _totalCount);
             _peakCount = Math.Max(_totalCount, _peakCount);
             OwningDisposeScope = DisposeScopeManager.ThreadSingleton.RegisterOnCurrentDisposeScope(this);
@@ -3119,7 +3113,7 @@ public Tensor baddbmm(Tensor batch1, Tensor batch2, float beta = 1, float alpha
         {
             var res = NativeMethods.THSTensor_baddbmm(Handle, batch1.Handle, batch2.Handle, beta, alpha);
             if (res == IntPtr.Zero) { CheckForErrors(); }
-            res = Amp.AMPManager.GetInstance().AutoCast(res);
+            res = AutocastMode.AutoCast(res);
             return new Tensor(res);
         }
@@ -3132,7 +3126,7 @@ public Tensor bmm(Tensor batch2)
         {
             var res = NativeMethods.THSTensor_bmm(Handle, batch2.Handle);
             if (res == IntPtr.Zero) { CheckForErrors(); }
-            res = Amp.AMPManager.GetInstance().AutoCast(res);
+            res = AutocastMode.AutoCast(res);
             return new Tensor(res);
         }
@@ -4488,7 +4482,7 @@ public Tensor prelu(Tensor target)
         {
             var res = NativeMethods.THSTensor_prelu(Handle, target.Handle);
             if (res == IntPtr.Zero) { CheckForErrors(); }
-            res = Amp.AMPManager.GetInstance().AutoCast(res);
+            res = AutocastMode.AutoCast(res);
             return new Tensor(res);
         }
diff --git a/src/TorchSharp/Tensor/torch.Autocast.cs b/src/TorchSharp/Tensor/torch.Autocast.cs
index d817e4ab9..12e86d46d 100644
--- a/src/TorchSharp/Tensor/torch.Autocast.cs
+++ b/src/TorchSharp/Tensor/torch.Autocast.cs
@@ -10,6 +10,11 @@ public static bool is_autocast_cache_enabled()
             return THSAmp_is_autocast_cache_enabled();
         }

+        public static bool is_autocast_available(DeviceType device)
+        {
+            //https://github.com/pytorch/pytorch/blob/main/torch/csrc/autograd/init.cpp
+            return THSAmp_is_autocast_available((int)device);
+        }
         public static bool is_autocast_enabled(DeviceType device)
         {
             return THSAmp_is_autocast_enabled((int)device);
@@ -18,11 +23,6 @@ public static bool is_autocast_enabled(DeviceType device)
         public static ScalarType get_autocast_dtype(DeviceType device)
         {
             return (ScalarType)THSAmp_get_autocast_dtype((int)device);
-            /*if (device.type == DeviceType.CPU)
-                return get_autocast_cpu_dtype();
-            if (device.type == DeviceType.CUDA)
-                return get_autocast_gpu_dtype();
-            return ScalarType.Float32;*/
         }
@@ -36,9 +36,14 @@ public static int autocast_decrement_nesting()
         {
             return THSAmp_autocast_decrement_nesting();
         }

-        public static void set_autocast_enabled(bool enabled)
+        public static void set_autocast_enabled(DeviceType device, bool enabled)
         {
-            THSAmp_set_autocast_enabled(enabled);
+            THSAmp_set_autocast_enabled((int)device, enabled);
         }

         public static void set_autocast_dtype(DeviceType device, ScalarType dtype)
         {
             THSAmp_set_autocast_dtype((int)device, (sbyte)dtype);
         }
         public static void set_autocast_cache_enabled(bool enabled)
         {
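The device-scoped toggles above mirror what AutocastMode.Enter()/Dispose() do internally; driving them by hand looks roughly like this (a sketch, assuming a CUDA device and `using TorchSharp;`):

    // Save, override, and restore autocast state for one device type.
    var prevEnabled = torch.is_autocast_enabled(DeviceType.CUDA);
    var prevDtype = torch.get_autocast_dtype(DeviceType.CUDA);
    torch.set_autocast_enabled(DeviceType.CUDA, true);
    torch.set_autocast_dtype(DeviceType.CUDA, torch.ScalarType.Float16);
    try {
        // ... autocast-eligible ops (matmul, conv, linear) run here ...
    } finally {
        torch.set_autocast_enabled(DeviceType.CUDA, prevEnabled);
        torch.set_autocast_dtype(DeviceType.CUDA, prevDtype);
    }
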
diff --git a/src/TorchSharp/TorchSharp.csproj b/src/TorchSharp/TorchSharp.csproj
index 054f5c18a..d5cb1135d 100644
--- a/src/TorchSharp/TorchSharp.csproj
+++ b/src/TorchSharp/TorchSharp.csproj
@@ -19,6 +19,10 @@
+
+
+
+
diff --git a/src/TorchSharp/Utils/UnorderedMap.cs b/src/TorchSharp/Utils/UnorderedMap.cs
index 92446906a..6eb073b1d 100644
--- a/src/TorchSharp/Utils/UnorderedMap.cs
+++ b/src/TorchSharp/Utils/UnorderedMap.cs
@@ -6,6 +6,65 @@
 namespace TorchSharp.Utils
 {
+    public class Dictionary<TKey1, TKey2, TValue> : Dictionary<Tuple<TKey1, TKey2>, TValue>, IDictionary<Tuple<TKey1, TKey2>, TValue>
+    {
+        public TValue this[TKey1 key1, TKey2 key2] {
+            get { return base[Tuple.Create(key1, key2)]; }
+            set { base[Tuple.Create(key1, key2)] = value; }
+        }
+
+        public void Add(TKey1 key1, TKey2 key2, TValue value)
+        {
+            base.Add(Tuple.Create(key1, key2), value);
+        }
+
+        public bool ContainsKey(TKey1 key1, TKey2 key2)
+        {
+            return base.ContainsKey(Tuple.Create(key1, key2));
+        }
+    }
+
+    public class UnorderedMap<TKey1, TKey2, TValue> : Dictionary<TKey1, TKey2, TValue>, IDisposable
+    {
+        bool disposedValue;
+        public new TValue this[TKey1 tk1, TKey2 tk2] {
+            get {
+                if (this.ContainsKey(tk1, tk2))
+                    return base[tk1, tk2];
+                return default;
+            }
+            set {
+                if (!this.ContainsKey(tk1, tk2)) {
+                    this.Add(tk1, tk2, value);
+                    return;
+                }
+                base[tk1, tk2] = value;
+            }
+        }
+
+        protected virtual void Dispose(bool disposing)
+        {
+            if (!disposedValue) {
+                if (disposing) {
+                    base.Clear();
+                }
+                disposedValue = true;
+            }
+        }
+        public void Dispose()
+        {
+            Dispose(disposing: true);
+            GC.SuppressFinalize(this);
+        }
+    }
     public class UnorderedMap<TKey, TValue> : Dictionary<TKey, TValue>, IDisposable
     {
         bool disposedValue;
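For reference, the two-key UnorderedMap added above is a Dictionary keyed by a Tuple pair whose indexer never throws: reading a missing pair yields default(TValue), and writing adds the pair. A short behavioral sketch (type arguments chosen only for illustration):

    using TorchSharp.Utils;

    var state = new UnorderedMap<string, int, float>();
    state["scale", 0] = 2.0f;          // write: adds the (key1, key2) pair if absent
    var hit = state["scale", 0];       // 2.0f
    var miss = state["growth", 1];     // default(float), i.e. 0.0f - no KeyNotFoundException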