From a5707ae843222cf0a51a617c67dafb88cdfd2e4b Mon Sep 17 00:00:00 2001 From: cDc Date: Fri, 18 Aug 2023 10:54:32 +0300 Subject: [PATCH] interface: add Nerfstudio and RTMV importer (#1047) Co-authored-by: Mihai Cernea --- apps/InterfaceMVSNet/InterfaceMVSNet.cpp | 254 ++++++++++++++++++++- apps/InterfacePolycam/InterfacePolycam.cpp | 1 + libs/MVS/PointCloud.cpp | 12 +- libs/MVS/Scene.cpp | 4 +- vcpkg.json | 9 +- 5 files changed, 266 insertions(+), 14 deletions(-) diff --git a/apps/InterfaceMVSNet/InterfaceMVSNet.cpp b/apps/InterfaceMVSNet/InterfaceMVSNet.cpp index d258559d3..9cc9b9217 100644 --- a/apps/InterfaceMVSNet/InterfaceMVSNet.cpp +++ b/apps/InterfaceMVSNet/InterfaceMVSNet.cpp @@ -31,6 +31,8 @@ #include "../../libs/MVS/Common.h" #include "../../libs/MVS/Scene.h" +#define JSON_NOEXCEPTION +#include "../../libs/IO/json.hpp" #include using namespace MVS; @@ -44,6 +46,8 @@ using namespace MVS; #define MVSNET_CAMERAS_FOLDER _T("cams") #define MVSNET_IMAGES_EXT _T(".jpg") #define MVSNET_CAMERAS_NAME _T("_cam.txt") +#define RTMV_CAMERAS_EXT _T(".json") +#define NERFSTUDIO_TRANSFORMS _T("transforms.json") // S T R U C T S /////////////////////////////////////////////////// @@ -217,17 +221,14 @@ void ImageListParse(const LPSTR* argv, Matrix3x4& P) // |--xxx.jpg // |--xxx.jpg // .... -// |--xxx.jpg // |--cams // |--xxx_cam.txt // |--xxx_cam.txt // .... -// |--xxx_cam.txt // |--render_cams // |--xxx_cam.txt // |--xxx_cam.txt // .... 
-// |--xxx_cam.txt // // where the camera parameter of one image stored in a cam.txt file contains the camera // extrinsic E = [R|t], intrinsic K and the depth range: @@ -243,12 +244,11 @@ void ImageListParse(const LPSTR* argv, Matrix3x4& P) // K20 K21 K22 // // DEPTH_MIN DEPTH_INTERVAL (DEPTH_NUM DEPTH_MAX) -bool ParseSceneMVSNet(Scene& scene) +bool ParseSceneMVSNet(Scene& scene, const std::filesystem::path& path) { #if defined(_SUPPORT_CPP17) && (!defined(__GNUC__) || (__GNUC__ > 7)) - String strPath(MAKE_PATH_FULL(WORKING_FOLDER_FULL, OPT::strInputFileName)); + String strPath(path.string()); Util::ensureValidFolderPath(strPath); - const std::filesystem::path path(static_cast(strPath)); IIndex prevPlatformID = NO_ID; for (const std::filesystem::directory_entry& entry : std::filesystem::directory_iterator(path / MVSNET_IMAGES_FOLDER)) { if (entry.path().extension() != MVSNET_IMAGES_EXT) continue; @@ -335,6 +335,246 @@ bool ParseSceneMVSNet(Scene& scene) #endif // _SUPPORT_CPP17 } +// Nerfstudio scene format: https://docs.nerf.studio/quickstart/data_conventions.html +// |--sceneX +// |--images +// |--xxx.jpg +// |--xxx.jpg +// .... +// |--outputs (optional) +// |--depthxxxx.exr +// |--normalxxxx.exr +// ....
+// |--transforms.json +bool ParseSceneNerfstudio(Scene& scene, const std::filesystem::path& path) +{ + const nlohmann::json data = nlohmann::json::parse(std::ifstream((path / NERFSTUDIO_TRANSFORMS).string())); + if (data.empty()) + return false; + // parse camera + const cv::Size resolution(data["w"].get<int>(), data["h"].get<int>()); + const IIndex platformID = scene.platforms.size(); + Platform& platform = scene.platforms.emplace_back(); + Platform::Camera& camera = platform.cameras.emplace_back(); + camera.K = KMatrix::IDENTITY; + camera.R = RMatrix::IDENTITY; + camera.C = CMatrix::ZERO; + camera.K(0,0) = data["fl_x"].get<REAL>(); + camera.K(1,1) = data["fl_y"].get<REAL>(); + camera.K(0,2) = data["cx"].get<REAL>(); + camera.K(1,2) = data["cy"].get<REAL>(); + const String cameraModel = data["camera_model"].get<std::string>(); + if (cameraModel == "SIMPLE_PINHOLE") { + } else + // check ZERO radial distortion for all "PERSPECTIVE" type cameras + if (cameraModel == "PINHOLE" || cameraModel == "SIMPLE_RADIAL" || cameraModel == "RADIAL" || cameraModel == "OPENCV") { + const REAL k1 = data["k1"].get<REAL>(); + const REAL k2 = data["k2"].get<REAL>(); + const REAL p1 = data["p1"].get<REAL>(); + const REAL p2 = data["p2"].get<REAL>(); + if (k1 != 0 || k2 != 0 || p1 != 0 || p2 != 0) { + VERBOSE("error: radial distortion not supported"); + return false; + } + } else { + VERBOSE("error: camera model not supported"); + return false; + } + // parse images + const nlohmann::json& frames = data["frames"]; + for (const nlohmann::json& frame: frames) { + // set image + // frames expected to be ordered in JSON + const IIndex imageID = scene.images.size(); + const String strFileName((path / frame["file_path"].get<std::string>()).string()); + Image& imageData = scene.images.AddEmpty(); + imageData.platformID = platformID; + imageData.cameraID = 0; // only one camera per platform supported by this format + imageData.poseID = NO_ID; + imageData.ID = imageID; + imageData.name = strFileName; + ASSERT(Util::isFullPath(imageData.name)); + // set image resolution + 
imageData.width = resolution.width; + imageData.height = resolution.height; + imageData.scale = 1; + // load camera pose + imageData.poseID = platform.poses.size(); + Platform::Pose& pose = platform.poses.AddEmpty(); + const auto Ps = frame["transform_matrix"].get<std::vector<std::vector<double>>>(); + Eigen::Matrix4d P{ + {Ps[0][0], Ps[0][1], Ps[0][2], Ps[0][3]}, + {Ps[1][0], Ps[1][1], Ps[1][2], Ps[1][3]}, + {Ps[2][0], Ps[2][1], Ps[2][2], Ps[2][3]}, + {Ps[3][0], Ps[3][1], Ps[3][2], Ps[3][3]} + }; + // revert nerfstudio conversion: + // convert from COLMAP's camera coordinate system (OpenCV) to ours (OpenGL) + // c2w[0:3, 1:3] *= -1 + // c2w = c2w[np.array([1, 0, 2, 3]), :] + // c2w[2, :] *= -1 + P.row(2) *= -1; + P.row(0).swap(P.row(1)); + P.col(2) *= -1; + P.col(1) *= -1; + // set camera pose + pose.R = P.topLeftCorner<3, 3>().transpose().eval(); + pose.R.EnforceOrthogonality(); + pose.C = P.topRightCorner<3, 1>().eval(); + imageData.camera = platform.GetCamera(imageData.cameraID, imageData.poseID); + // try reading depth-map and normal-map + DepthMap depthMap; { + const String depthPath((path.parent_path() / String::FormatString("outputs/depth%04u.exr", imageID).c_str()).string()); + const cv::Mat imgDepthMap = cv::imread(depthPath, cv::IMREAD_UNCHANGED); + if (imgDepthMap.empty()) { + VERBOSE("Unable to load depthmap %s.", depthPath.c_str()); + continue; + } + imgDepthMap.convertTo(depthMap, CV_32FC1); + } + NormalMap normalMap; { + const String normalPath((path.parent_path() / String::FormatString("outputs/normal%04u.exr", imageID).c_str()).string()); + const cv::Mat imgNormalMap = cv::imread(normalPath, cv::IMREAD_UNCHANGED); + if (imgNormalMap.empty()) { + VERBOSE("Unable to load normalMap %s.", normalPath.c_str()); + continue; + } + imgNormalMap.convertTo(normalMap, CV_32FC3); + } + const ConfidenceMap confMap; + const ViewsMap viewsMap; + const IIndexArr IDs = {imageID}; + double dMin, dMax; + cv::minMaxIdx(depthMap, &dMin, &dMax, NULL, NULL, depthMap > 0); + const String
dmapPath((path.parent_path() / String::FormatString("depth%04u.dmap", imageID).c_str()).string()); + if (!ExportDepthDataRaw(dmapPath, + imageData.name, IDs, resolution, + camera.K, pose.R, pose.C, + (float)dMin, (float)dMax, + depthMap, normalMap, confMap, viewsMap)) + { + VERBOSE("Unable to save dmap: %s", dmapPath.c_str()); + continue; + } + } + if (scene.images.size() < 2) + return false; + scene.nCalibratedImages = (unsigned)scene.images.size(); + return true; +} + +// RTMV scene format: http://www.cs.umd.edu/~mmeshry/projects/rtmv +// |--sceneX +// |--xxx.jpg +// |--xxx.json +// .... +bool ParseSceneRTMV(Scene& scene, const std::filesystem::path& path) +{ + #if defined(_SUPPORT_CPP17) && (!defined(__GNUC__) || (__GNUC__ > 7)) + IIndex prevPlatformID = NO_ID; + for (const std::filesystem::directory_entry& entry : std::filesystem::directory_iterator(path)) { + if (entry.path().extension() != RTMV_CAMERAS_EXT) + continue; + // parse camera + const std::string strFileName((path / entry.path().stem()).string()); + const nlohmann::json dataCamera = nlohmann::json::parse(std::ifstream(strFileName+RTMV_CAMERAS_EXT)); + if (dataCamera.empty()) + continue; + const nlohmann::json& data = dataCamera["camera_data"]; + const cv::Size resolution(data["width"].get<int>(), data["height"].get<int>()); + // set platform + Matrix3x3 K = Matrix3x3::IDENTITY; + K(0,0) = data["intrinsics"]["fx"].get<REAL>(); + K(1,1) = data["intrinsics"]["fy"].get<REAL>(); + K(0,2) = data["intrinsics"]["cx"].get<REAL>(); + K(1,2) = data["intrinsics"]["cy"].get<REAL>(); + IIndex platformID; + if (prevPlatformID == NO_ID || !K.IsEqual(scene.platforms[prevPlatformID].cameras[0].K, 1e-3)) { + prevPlatformID = platformID = scene.platforms.size(); + Platform& platform = scene.platforms.emplace_back(); + Platform::Camera& camera = platform.cameras.emplace_back(); + camera.K = K; + camera.R = RMatrix::IDENTITY; + camera.C = CMatrix::ZERO; + } else { + platformID = prevPlatformID; + } + Platform& platform = scene.platforms[platformID]; + // set
image + const IIndex imageID = scene.images.size(); + Image& imageData = scene.images.AddEmpty(); + imageData.platformID = platformID; + imageData.cameraID = 0; // only one camera per platform supported by this format + imageData.poseID = NO_ID; + imageData.ID = imageID; + imageData.name = strFileName+".jpg"; + ASSERT(Util::isFullPath(imageData.name)); + // set image resolution + imageData.width = resolution.width; + imageData.height = resolution.height; + imageData.scale = 1; + // load camera pose + imageData.poseID = platform.poses.size(); + Platform::Pose& pose = platform.poses.AddEmpty(); + const auto Ps = data["cam2world"].get<std::vector<std::vector<double>>>(); + Eigen::Matrix4d P{ + {Ps[0][0], Ps[1][0], Ps[2][0], Ps[3][0]}, + {Ps[0][1], Ps[1][1], Ps[2][1], Ps[3][1]}, + {Ps[0][2], Ps[1][2], Ps[2][2], Ps[3][2]}, + {Ps[0][3], Ps[1][3], Ps[2][3], Ps[3][3]} + }; + // apply the same transforms as nerfstudio converter + P.row(2) *= -1; + P.row(0).swap(P.row(1)); + P.col(2) *= -1; + P.col(1) *= -1; + // set camera pose + pose.R = P.topLeftCorner<3, 3>().transpose().eval(); + pose.R.EnforceOrthogonality(); + pose.C = P.topRightCorner<3, 1>().eval(); + imageData.camera = platform.GetCamera(imageData.cameraID, imageData.poseID); + } + if (scene.images.size() < 2) + return false; + scene.nCalibratedImages = (unsigned)scene.images.size(); + return true; + #else + VERBOSE("error: C++17 is required to parse RTMV format"); + return false; + #endif // _SUPPORT_CPP17 +} + +bool ParseScene(Scene& scene) +{ + #if defined(_SUPPORT_CPP17) && (!defined(__GNUC__) || (__GNUC__ > 7)) + String strPath(MAKE_PATH_FULL(WORKING_FOLDER_FULL, OPT::strInputFileName)); + Util::ensureValidFolderPath(strPath); + const std::filesystem::path path(static_cast<const std::string&>(strPath)); + enum Type { + MVSNet = 0, + NERFSTUDIO, + RTMV, + } sceneType = MVSNet; + for (const std::filesystem::directory_entry& entry : std::filesystem::directory_iterator(path)) { + if (entry.path().extension() == RTMV_CAMERAS_EXT) { + if (entry.path().filename() == 
NERFSTUDIO_TRANSFORMS) { + sceneType = NERFSTUDIO; + break; + } + sceneType = RTMV; + } + } + switch (sceneType) { + case NERFSTUDIO: return ParseSceneNerfstudio(scene, path); + case RTMV: return ParseSceneRTMV(scene, path); + default: return ParseSceneMVSNet(scene, path); + } + #else + VERBOSE("error: C++17 is required to parse MVSNet format"); + return false; + #endif // _SUPPORT_CPP17 +} + } // unnamed namespace int main(int argc, LPCTSTR* argv) @@ -352,7 +592,7 @@ int main(int argc, LPCTSTR* argv) Scene scene(OPT::nMaxThreads); // convert data from MVSNet format to OpenMVS - if (!ParseSceneMVSNet(scene)) + if (!ParseScene(scene)) return EXIT_FAILURE; // write OpenMVS input data diff --git a/apps/InterfacePolycam/InterfacePolycam.cpp b/apps/InterfacePolycam/InterfacePolycam.cpp index c0b42fb7a..31c532e9a 100644 --- a/apps/InterfacePolycam/InterfacePolycam.cpp +++ b/apps/InterfacePolycam/InterfacePolycam.cpp @@ -31,6 +31,7 @@ #include "../../libs/MVS/Common.h" #include "../../libs/MVS/Scene.h" +#define JSON_NOEXCEPTION #include "../../libs/IO/json.hpp" #include diff --git a/libs/MVS/PointCloud.cpp b/libs/MVS/PointCloud.cpp index 15689a058..c62920961 100644 --- a/libs/MVS/PointCloud.cpp +++ b/libs/MVS/PointCloud.cpp @@ -335,10 +335,14 @@ bool PointCloud::Load(const String& fileName) colors[v] = vertex.c; if (!normals.empty()) normals[v] = vertex.n; - if (!pointViews.empty()) - pointViews[v].CopyOfRemove(ViewArr(vertex.views.num, vertex.views.pIndices)); - if (!pointWeights.empty()) - pointWeights[v].CopyOfRemove(WeightArr(vertex.views.num, vertex.views.pWeights)); + if (!pointViews.empty()) { + ViewArr pv(vertex.views.num, vertex.views.pIndices); + pointViews[v].CopyOfRemove(pv); + } + if (!pointWeights.empty()){ + WeightArr pw(vertex.views.num, vertex.views.pWeights); + pointWeights[v].CopyOfRemove(pw); + } } } else { ply.get_other_element(); diff --git a/libs/MVS/Scene.cpp b/libs/MVS/Scene.cpp index 74e84675d..4943cf4af 100644 --- a/libs/MVS/Scene.cpp +++ 
b/libs/MVS/Scene.cpp @@ -1799,7 +1799,7 @@ bool Scene::ComputeTowerCylinder(Point2f& centerPoint, float& fRadius, float& fR // calculate tower radius as median distance from tower center to cameras FloatArr cameraDistancesToMiddle(cameras2D.size()); FOREACH (camIdx, cameras2D) - cameraDistancesToMiddle[camIdx] = norm(cameras2D[camIdx] - centerPoint); + cameraDistancesToMiddle[camIdx] = (float)norm(cameras2D[camIdx] - centerPoint); const float fMedianDistance = cameraDistancesToMiddle.GetMedian(); fRadius = MAXF(0.2f, (fMedianDistance - 1.f) / 3.f); // get the average of top 85 to 95% of the highest distances to center @@ -1882,7 +1882,7 @@ PointCloud Scene::BuildTowerMesh(const PointCloud& origPointCloud, const Point2f } else { cList sliceDistances(nTargetCircles); for (const Point3f& P : origPointCloud.points) { - const float d(norm(Point2f(P.x, P.y) - centerPoint)); + const float d((float)norm(Point2f(P.x, P.y) - centerPoint)); if (d <= fROIRadius) { const float fIdx((zMax - P.z) * nTargetDensity); int bIdx(FLOOR2INT(fIdx)); diff --git a/vcpkg.json b/vcpkg.json index a74f701ea..0c1542b1a 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -1,6 +1,6 @@ { "name": "openmvs", - "version": "2.1.0", + "version": "2.2.0", "description": "OpenMVS: open Multi-View Stereo reconstruction library", "homepage": "https://cdcseacave.github.io/openMVS", "dependencies": [ @@ -17,6 +17,13 @@ "glew", "glfw3", "libpng", + { + "name": "opencv", + "features": [ + "eigen", + "openexr" + ] + }, "opencv", "opengl", "tiff",