From 4f02d1dbeea1203830189c109f89c0e35d08947b Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Fri, 5 Jul 2024 12:40:22 +0530 Subject: [PATCH 01/19] Update .gitignore --- .gitignore | 307 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 306 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2ed7bdc..98aa388 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,308 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,pycharm +# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,pycharm + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### PyCharm Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python build/ +develop-eggs/ dist/ -*.egg-info/ \ No newline at end of file +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ 
+*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,pycharm From b439b8ff9ec450307dfeed2492de0737c8d570fd Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Fri, 7 Jun 2024 19:32:49 +0530 Subject: [PATCH 02/19] feat: Export directory prefix (Darshan) --- drishti/handlers/handle_darshan.py | 15 ++- drishti/includes/module.py | 147 +++++++++++++++++------------ drishti/includes/parser.py | 11 +++ 3 files changed, 102 insertions(+), 71 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index d47fbea..1270d7c 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -752,15 +752,12 @@ def handler(): display_thresholds(console) display_footer(console, insights_start_time, insights_end_time) - filename = '{}.html'.format(args.log_path) - export_html(console, filename) + input_filename = os.path.basename(args.log_path).replace('.darshan', '') + out_dir = args.export_dir if args.export_dir != "" else os.getcwd() - filename = '{}.svg'.format(args.log_path) - export_svg(console, filename) + print(f"DEBUG: outfile_name: {input_filename}") - filename = '{}-summary.csv'.format( - args.log_path.replace('.darshan', '') - ) - - export_csv(filename, job['job']['jobid']) + export_html(console, out_dir, input_filename) + export_svg(console, out_dir, input_filename) + export_csv(out_dir, input_filename, job['job']['jobid']) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index dedaa09..0e69430 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -1823,76 +1823,99 @@ def display_footer(console, insights_start_time, insights_end_time): ) ) -def export_html(console, filename): +def export_html(console, export_dir, filename): ''' ''' - if args.export_html: - console.save_html( - filename, - theme=set_export_theme(), - clear=False - ) + if not args.export_html: + print("DEBUG: export_html() - return") + return + os.makedirs(export_dir, exist_ok=True) + filepath = os.path.join(export_dir, f"{filename}.html") -def export_svg(console, filename): - if args.export_svg: - console.save_svg( - filename, - title='Drishti', - theme=set_export_theme(), - clear=False - ) + console.save_html( + filepath, + theme=set_export_theme(), + clear=False + ) + print("DEBUG: END export_html()") -def export_csv(filename, jobid=None): - if args.export_csv: - issues = [ - 'JOB', - INSIGHTS_STDIO_HIGH_USAGE, - INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, - INSIGHTS_POSIX_READ_COUNT_INTENSIVE, - INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, - INSIGHTS_POSIX_READ_SIZE_INTENSIVE, - INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, - INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, - INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, - INSIGHTS_POSIX_REDUNDANT_READ_USAGE, - INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, - INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, - 
INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, - INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, - INSIGHTS_POSIX_HIGH_METADATA_TIME, - INSIGHTS_POSIX_SIZE_IMBALANCE, - INSIGHTS_POSIX_TIME_IMBALANCE, - INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, - INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, - INSIGHTS_MPI_IO_NO_USAGE, - INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, - INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, - INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, - INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, - INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, - INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, - INSIGHTS_MPI_IO_AGGREGATORS_INTRA, - INSIGHTS_MPI_IO_AGGREGATORS_INTER, - INSIGHTS_MPI_IO_AGGREGATORS_OK - ] - if codes: - issues.extend(codes) - detected_issues = dict.fromkeys(issues, False) - detected_issues['JOB'] = jobid +def export_svg(console, export_dir, filename): + if not args.export_svg: + return + + os.makedirs(export_dir, exist_ok=True) + filepath = os.path.join(export_dir, f"{filename}.svg") + + console.save_svg( + filepath, + title='Drishti', + theme=set_export_theme(), + clear=False + ) - for report in csv_report: - detected_issues[report] = True - with open(filename, 'w') as f: - w = csv.writer(f) - w.writerow(detected_issues.keys()) - w.writerow(detected_issues.values()) +def export_csv(export_dir, filename, jobid=None): + if not args.export_csv: + return + + issues = [ + 'JOB', + INSIGHTS_STDIO_HIGH_USAGE, + INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE, + INSIGHTS_POSIX_READ_COUNT_INTENSIVE, + INSIGHTS_POSIX_WRITE_SIZE_INTENSIVE, + INSIGHTS_POSIX_READ_SIZE_INTENSIVE, + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_USAGE, + INSIGHTS_POSIX_HIGH_MISALIGNED_MEMORY_USAGE, + INSIGHTS_POSIX_HIGH_MISALIGNED_FILE_USAGE, + INSIGHTS_POSIX_REDUNDANT_READ_USAGE, + INSIGHTS_POSIX_REDUNDANT_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_RANDOM_READ_USAGE, + INSIGHTS_POSIX_HIGH_SEQUENTIAL_READ_USAGE, + INSIGHTS_POSIX_HIGH_RANDOM_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_SEQUENTIAL_WRITE_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_READ_REQUESTS_SHARED_FILE_USAGE, + INSIGHTS_POSIX_HIGH_SMALL_WRITE_REQUESTS_SHARED_FILE_USAGE, + INSIGHTS_POSIX_HIGH_METADATA_TIME, + INSIGHTS_POSIX_SIZE_IMBALANCE, + INSIGHTS_POSIX_TIME_IMBALANCE, + INSIGHTS_POSIX_INDIVIDUAL_WRITE_SIZE_IMBALANCE, + INSIGHTS_POSIX_INDIVIDUAL_READ_SIZE_IMBALANCE, + INSIGHTS_MPI_IO_NO_USAGE, + INSIGHTS_MPI_IO_NO_COLLECTIVE_READ_USAGE, + INSIGHTS_MPI_IO_NO_COLLECTIVE_WRITE_USAGE, + INSIGHTS_MPI_IO_COLLECTIVE_READ_USAGE, + INSIGHTS_MPI_IO_COLLECTIVE_WRITE_USAGE, + INSIGHTS_MPI_IO_BLOCKING_READ_USAGE, + INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, + INSIGHTS_MPI_IO_AGGREGATORS_INTRA, + INSIGHTS_MPI_IO_AGGREGATORS_INTER, + INSIGHTS_MPI_IO_AGGREGATORS_OK + ] + if codes: + issues.extend(codes) + + detected_issues = dict.fromkeys(issues, False) + detected_issues['JOB'] = jobid + + for report in csv_report: + detected_issues[report] = True + + # ensure dir exists + os.makedirs(export_dir, exist_ok=True) + filepath = os.path.join(export_dir, f"{filename}.csv") + + print(f"DEBUG: export_dir: {export_dir}") + print(f"DEBUG: filename: {filename}") + print(f"DEBUG: filepath: {filepath}") + + with open(filepath, 'w') as f: + w = csv.writer(f) + w.writerow(detected_issues.keys()) + w.writerow(detected_issues.values()) diff --git a/drishti/includes/parser.py b/drishti/includes/parser.py index 8659520..afa4247 100644 --- a/drishti/includes/parser.py +++ 
b/drishti/includes/parser.py @@ -96,6 +96,13 @@ help='Export a CSV with the code of all issues that were triggered' ) +parser.add_argument( + '--export_dir', + default="", + dest='export_dir', + help='Specify the directory prefix for the output files (if any)' +) + parser.add_argument( '--json', default=False, @@ -119,3 +126,7 @@ ) args = parser.parse_args() + +print(f"DEBUG: log_path: {args.log_path}") +print(f"DEBUG: export_path: {args.export_dir}") +print(f"DEBUG: export_csv: {args.export_csv}") \ No newline at end of file From 2a8aed8e7edef4555cbf26b4b95fa8085a29b103 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Fri, 7 Jun 2024 19:39:52 +0530 Subject: [PATCH 03/19] format: Remove debug statements --- drishti/handlers/handle_darshan.py | 4 +--- drishti/includes/module.py | 19 +++++-------------- drishti/includes/parser.py | 4 ---- 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 1270d7c..24c0519 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -752,12 +752,10 @@ def handler(): display_thresholds(console) display_footer(console, insights_start_time, insights_end_time) + # Export to HTML, SVG, and CSV input_filename = os.path.basename(args.log_path).replace('.darshan', '') out_dir = args.export_dir if args.export_dir != "" else os.getcwd() - print(f"DEBUG: outfile_name: {input_filename}") - export_html(console, out_dir, input_filename) export_svg(console, out_dir, input_filename) - export_csv(out_dir, input_filename, job['job']['jobid']) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index 0e69430..f6f9fcf 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -1823,15 +1823,12 @@ def display_footer(console, insights_start_time, insights_end_time): ) ) -def export_html(console, export_dir, filename): - ''' - ''' +def export_html(console, export_dir, filename): if not args.export_html: - print("DEBUG: export_html() - return") return - os.makedirs(export_dir, exist_ok=True) + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{filename}.html") console.save_html( @@ -1840,14 +1837,12 @@ def export_html(console, export_dir, filename): clear=False ) - print("DEBUG: END export_html()") - def export_svg(console, export_dir, filename): if not args.export_svg: return - os.makedirs(export_dir, exist_ok=True) + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{filename}.svg") console.save_svg( @@ -1906,14 +1901,10 @@ def export_csv(export_dir, filename, jobid=None): for report in csv_report: detected_issues[report] = True - # ensure dir exists - os.makedirs(export_dir, exist_ok=True) + + os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists filepath = os.path.join(export_dir, f"{filename}.csv") - print(f"DEBUG: export_dir: {export_dir}") - print(f"DEBUG: filename: {filename}") - print(f"DEBUG: filepath: {filepath}") - with open(filepath, 'w') as f: w = csv.writer(f) w.writerow(detected_issues.keys()) diff --git a/drishti/includes/parser.py b/drishti/includes/parser.py index afa4247..28dcd63 100644 --- a/drishti/includes/parser.py +++ b/drishti/includes/parser.py @@ -126,7 +126,3 @@ ) args = parser.parse_args() - -print(f"DEBUG: log_path: {args.log_path}") -print(f"DEBUG: export_path: {args.export_dir}") -print(f"DEBUG: export_csv: {args.export_csv}") \ No newline at end of file From 
087a0f95d8580b33f348c64752fa5d3c9dd62d75 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 18 Jun 2024 19:25:51 +0530 Subject: [PATCH 04/19] Add export_dir support for recorder --- drishti/handlers/handle_recorder.py | 41 ++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index 34c4790..e78bd1f 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -2,9 +2,11 @@ import os import time + import pandas as pd from recorder_utils import RecorderReader from recorder_utils.build_offset_intervals import build_offset_intervals + from drishti.includes.module import * @@ -577,23 +579,32 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): display_thresholds(console) display_footer(console, insights_start_time, insights_end_time) - if args.split_files: - filename = '{}.{}.html'.format(args.log_path, fid) - else: - filename = '{}.html'.format(args.log_path) + # if args.split_files: + # filename = '{}.{}.html'.format(args.log_path, fid) + # else: + # filename = '{}.html'.format(args.log_path) - export_html(console, filename) + # export_html(console, filename) - if args.split_files: - filename = '{}.{}.svg'.format(args.log_path, fid) - else: - filename = '{}.svg'.format(args.log_path) + # if args.split_files: + # filename = '{}.{}.svg'.format(args.log_path, fid) + # else: + # filename = '{}.svg'.format(args.log_path) - export_svg(console, filename) + # export_svg(console, filename) - if args.split_files: - filename = '{}.{}.summary.csv'.format(args.log_path, fid) - else: - filename = '{}-summary.csv'.format(args.log_path) - export_csv(filename) + # if args.split_files: + # filename = '{}.{}.summary.csv'.format(args.log_path, fid) + # else: + # filename = '{}-summary.csv'.format(args.log_path) + # export_csv(filename) + + # Export to HTML, SVG, and CSV + input_filename = os.path.basename(os.path.dirname(args.log_path)) + input_filename = f"{input_filename}.{fid}" if args.split_files else input_filename # Append fid if split_files is enabled + + out_dir = args.export_dir if args.export_dir != "" else os.getcwd() + export_html(console, out_dir, input_filename) + export_svg(console, out_dir, input_filename) + export_csv(out_dir, input_filename, job['job']['jobid']) From 62b11823495b81356715c8a2ed61c00d45d1ce48 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Tue, 18 Jun 2024 19:28:31 +0530 Subject: [PATCH 05/19] chore: Remove commented out code for exporting files in handle_recorder.py --- drishti/handlers/handle_recorder.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index e78bd1f..01f4de8 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -579,30 +579,9 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): display_thresholds(console) display_footer(console, insights_start_time, insights_end_time) - # if args.split_files: - # filename = '{}.{}.html'.format(args.log_path, fid) - # else: - # filename = '{}.html'.format(args.log_path) - - # export_html(console, filename) - - # if args.split_files: - # filename = '{}.{}.svg'.format(args.log_path, fid) - # else: - # filename = '{}.svg'.format(args.log_path) - - # export_svg(console, filename) - - # if args.split_files: - # filename = '{}.{}.summary.csv'.format(args.log_path, fid) - # else: - # filename = 
'{}-summary.csv'.format(args.log_path) - # export_csv(filename) - # Export to HTML, SVG, and CSV input_filename = os.path.basename(os.path.dirname(args.log_path)) input_filename = f"{input_filename}.{fid}" if args.split_files else input_filename # Append fid if split_files is enabled - out_dir = args.export_dir if args.export_dir != "" else os.getcwd() export_html(console, out_dir, input_filename) From 970c08b5b214d59be100496ccc49875d4701f1a9 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 19 Jun 2024 10:32:32 +0530 Subject: [PATCH 06/19] feat: Update export file name to include "-summary" --- drishti/includes/module.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index f6f9fcf..ae9c0e2 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -1,11 +1,15 @@ #!/usr/bin/env python3 -import datetime import csv +import datetime import time + import pandas as pd from rich import box from rich.syntax import Syntax + +from rich.syntax import Syntax + from drishti.includes.config import * ''' @@ -1903,7 +1907,7 @@ def export_csv(export_dir, filename, jobid=None): os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists - filepath = os.path.join(export_dir, f"{filename}.csv") + filepath = os.path.join(export_dir, f"{filename}-summary.csv") with open(filepath, 'w') as f: w = csv.writer(f) From 7c6778ba96e39735405db158f6af4efcd41e8a85 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 19 Jun 2024 10:32:53 +0530 Subject: [PATCH 07/19] fix: Update export_csv function call in handle_recorder.py --- drishti/handlers/handle_recorder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index 01f4de8..5a55267 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -586,4 +586,4 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): export_html(console, out_dir, input_filename) export_svg(console, out_dir, input_filename) - export_csv(out_dir, input_filename, job['job']['jobid']) + export_csv(out_dir, input_filename) From 81c1a8590a9a5b57463cf5c9cfeed10be8a7b740 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 19 Jun 2024 10:33:06 +0530 Subject: [PATCH 08/19] fix: Update darshan dependency to version 3.4.4.0 or higher --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3e75113..a93a8ce 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup with open("README.md", "r") as f: long_description = f.read() @@ -19,7 +19,7 @@ install_requires=[ 'argparse', 'pandas', - 'darshan==3.4.4.0', + 'darshan>=3.4.4.0', 'rich==12.5.1', 'recorder-utils', ], From 09ad2c3940342aad8bb01b3d40f254d59e402ed2 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 19 Jun 2024 10:33:06 +0530 Subject: [PATCH 09/19] fix: Update darshan dependency to version 3.4.4.0 or higher --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 65461cb..5020329 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ argparse -darshan==3.4.4.0 +darshan>=3.4.4.0 pandas rich==12.5.1 recorder-utils From 6d238a49eb2a8fbbad0feff6e9f13391dfe807ab Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 19 Jun 2024 12:31:05 +0530 Subject: [PATCH 10/19] fmt: Clean up 
split_files case --- drishti/handlers/handle_recorder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index 5a55267..0084c84 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -581,7 +581,8 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): # Export to HTML, SVG, and CSV input_filename = os.path.basename(os.path.dirname(args.log_path)) - input_filename = f"{input_filename}.{fid}" if args.split_files else input_filename # Append fid if split_files is enabled + if args.split_files: + input_filename = f"{input_filename}.{fid}" out_dir = args.export_dir if args.export_dir != "" else os.getcwd() export_html(console, out_dir, input_filename) From 49714e674f133eb349040335904d818b134c63fc Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 19 Jun 2024 12:33:19 +0530 Subject: [PATCH 11/19] fmt: import statements --- drishti/includes/module.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index ae9c0e2..f6f9fcf 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -1,15 +1,11 @@ #!/usr/bin/env python3 -import csv import datetime +import csv import time - import pandas as pd from rich import box from rich.syntax import Syntax - -from rich.syntax import Syntax - from drishti.includes.config import * ''' @@ -1907,7 +1903,7 @@ def export_csv(export_dir, filename, jobid=None): os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists - filepath = os.path.join(export_dir, f"{filename}-summary.csv") + filepath = os.path.join(export_dir, f"{filename}.csv") with open(filepath, 'w') as f: w = csv.writer(f) From a8feb4c6471aba1fc282ad832466118c7f7ab228 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 19 Jun 2024 15:01:59 +0530 Subject: [PATCH 12/19] fmt: Rename `input_filename` to `trace_name` --- drishti/handlers/handle_darshan.py | 8 ++++---- drishti/handlers/handle_recorder.py | 10 +++++----- drishti/includes/module.py | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 24c0519..ea690f3 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -753,9 +753,9 @@ def handler(): display_footer(console, insights_start_time, insights_end_time) # Export to HTML, SVG, and CSV - input_filename = os.path.basename(args.log_path).replace('.darshan', '') + trace_name = os.path.basename(args.log_path).replace('.darshan', '') out_dir = args.export_dir if args.export_dir != "" else os.getcwd() - export_html(console, out_dir, input_filename) - export_svg(console, out_dir, input_filename) - export_csv(out_dir, input_filename, job['job']['jobid']) + export_html(console, out_dir, trace_name) + export_svg(console, out_dir, trace_name) + export_csv(out_dir, trace_name, job['job']['jobid']) diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index 0084c84..afccfce 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -580,11 +580,11 @@ def process_helper(file_map, df_intervals, df_posix_records, fid=None): display_footer(console, insights_start_time, insights_end_time) # Export to HTML, SVG, and CSV - input_filename = os.path.basename(os.path.dirname(args.log_path)) + trace_name = os.path.basename(os.path.dirname(args.log_path)) if 
args.split_files: - input_filename = f"{input_filename}.{fid}" + trace_name = f"{trace_name}.{fid}" out_dir = args.export_dir if args.export_dir != "" else os.getcwd() - export_html(console, out_dir, input_filename) - export_svg(console, out_dir, input_filename) - export_csv(out_dir, input_filename) + export_html(console, out_dir, trace_name) + export_svg(console, out_dir, trace_name) + export_csv(out_dir, trace_name) diff --git a/drishti/includes/module.py b/drishti/includes/module.py index f6f9fcf..9c2df16 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -1824,12 +1824,12 @@ def display_footer(console, insights_start_time, insights_end_time): ) -def export_html(console, export_dir, filename): +def export_html(console, export_dir, trace_name): if not args.export_html: return os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists - filepath = os.path.join(export_dir, f"{filename}.html") + filepath = os.path.join(export_dir, f"{trace_name}.html") console.save_html( filepath, @@ -1838,12 +1838,12 @@ def export_html(console, export_dir, filename): ) -def export_svg(console, export_dir, filename): +def export_svg(console, export_dir, trace_name): if not args.export_svg: return os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists - filepath = os.path.join(export_dir, f"{filename}.svg") + filepath = os.path.join(export_dir, f"{trace_name}.svg") console.save_svg( filepath, @@ -1853,7 +1853,7 @@ def export_svg(console, export_dir, filename): ) -def export_csv(export_dir, filename, jobid=None): +def export_csv(export_dir, trace_name, jobid=None): if not args.export_csv: return @@ -1903,7 +1903,7 @@ def export_csv(export_dir, filename, jobid=None): os.makedirs(export_dir, exist_ok=True) # Ensure export directory exists - filepath = os.path.join(export_dir, f"{filename}.csv") + filepath = os.path.join(export_dir, f"{trace_name}.csv") with open(filepath, 'w') as f: w = csv.writer(f) From bf6161aac0791024d575f64ed1dd0cfad50892a6 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 3 Jul 2024 17:57:08 +0530 Subject: [PATCH 13/19] Add default call to main( --- drishti/reporter.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drishti/reporter.py b/drishti/reporter.py index 8455040..ce7a461 100644 --- a/drishti/reporter.py +++ b/drishti/reporter.py @@ -5,7 +5,6 @@ from subprocess import call from drishti.includes.parser import * - ''' |- handler_darshan -| | | @@ -17,7 +16,6 @@ |-----> /includes -> module -> config -> parser ''' - LOG_TYPE_DARSHAN = 0 LOG_TYPE_RECORDER = 1 @@ -34,22 +32,27 @@ def check_log_type(path): if not os.path.isfile(path): print('Unable to open .darshan file.') sys.exit(os.EX_NOINPUT) - else: return LOG_TYPE_DARSHAN - else: # check whether is a valid recorder log + else: + return LOG_TYPE_DARSHAN + else: # check whether is a valid recorder log if not os.path.isdir(path): print('Unable to open recorder folder.') sys.exit(os.EX_NOINPUT) - else: return LOG_TYPE_RECORDER + else: + return LOG_TYPE_RECORDER def main(): log_type = check_log_type(args.log_path) - + if log_type == LOG_TYPE_DARSHAN: from drishti.handlers.handle_darshan import handler elif log_type == LOG_TYPE_RECORDER: from drishti.handlers.handle_recorder import handler - + handler() + +if __name__ == '__main__': + main() From 11cf6fcdced964051ff16a427b3ae05920256089 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Fri, 5 Jul 2024 12:56:47 +0530 Subject: [PATCH 14/19] feat: parser takes multiple traces --- 
drishti/includes/parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drishti/includes/parser.py b/drishti/includes/parser.py index 28dcd63..842874e 100644 --- a/drishti/includes/parser.py +++ b/drishti/includes/parser.py @@ -5,7 +5,8 @@ ) parser.add_argument( - 'log_path', + 'log_paths', + nargs='+', help='Input .darshan file or recorder folder' ) From 19bdf0a8890c800360d5280ff5abe9dfb1804a95 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Fri, 5 Jul 2024 12:58:16 +0530 Subject: [PATCH 15/19] feat: check_log_type() processes multiple traces --- drishti/reporter.py | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/drishti/reporter.py b/drishti/reporter.py index ce7a461..426d80c 100644 --- a/drishti/reporter.py +++ b/drishti/reporter.py @@ -27,23 +27,45 @@ def clear(): _ = call('clear' if os.name == 'posix' else 'cls') -def check_log_type(path): - if path.endswith('.darshan'): - if not os.path.isfile(path): - print('Unable to open .darshan file.') - sys.exit(os.EX_NOINPUT) - else: +def check_log_type(paths: list[str]) -> int | None: + is_darshan = True + is_recorder = True + multiple_logs = len(paths) > 1 + + for path in paths: + if path.endswith('.darshan'): + if not os.path.isfile(path): + print('Unable to open .darshan file.') + sys.exit(os.EX_NOINPUT) + else: + is_darshan = True and is_darshan + is_recorder = False and is_recorder + else: # check whether is a valid recorder log + if not os.path.isdir(path): + print('Unable to open recorder folder.') + sys.exit(os.EX_NOINPUT) + else: + is_recorder = True and is_recorder + is_darshan = False and is_darshan + + if multiple_logs: + if is_darshan: return LOG_TYPE_DARSHAN - else: # check whether is a valid recorder log - if not os.path.isdir(path): - print('Unable to open recorder folder.') - sys.exit(os.EX_NOINPUT) else: + print('Only .darshan files are supported for multiple logs.') #TODO + sys.exit(os.EX_NOINPUT) + else: + if is_darshan and not is_recorder: + return LOG_TYPE_DARSHAN + elif is_recorder and not is_darshan: return LOG_TYPE_RECORDER + else: + print('Unable to reliably determine the log type.') + sys.exit(os.EX_NOINPUT) def main(): - log_type = check_log_type(args.log_path) + log_type = check_log_type(args.log_paths) if log_type == LOG_TYPE_DARSHAN: from drishti.handlers.handle_darshan import handler From eb8d3d843d64f1c28a23ad952b7a94540ba01ef7 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Wed, 24 Jul 2024 10:59:56 +0530 Subject: [PATCH 16/19] TEMP COMMIT 2 --- .gitignore | 2 + .idea/.gitignore | 8 + .idea/feat-multiple-darshan-files.iml | 23 + .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 7 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + drishti/handlers/handle_darshan.py | 613 ++++++++++++------ drishti/handlers/handle_recorder.py | 2 +- drishti/includes/config.py | 2 +- drishti/includes/module.py | 2 +- drishti/reporter.py | 10 +- 12 files changed, 492 insertions(+), 197 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/feat-multiple-darshan-files.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.gitignore b/.gitignore index 98aa388..29bd232 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +tensorflow_unet3d_darshan_per_rank_workload/ + # Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,pycharm # Edit at 
https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,pycharm diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/feat-multiple-darshan-files.iml b/.idea/feat-multiple-darshan-files.iml new file mode 100644 index 0000000..9127201 --- /dev/null +++ b/.idea/feat-multiple-darshan-files.iml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..e5748a9 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..2d7ac4e --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index ea690f3..fc4a673 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -1,18 +1,27 @@ #!/usr/bin/env python3 - +import collections +import dataclasses +from dataclasses import dataclass +import datetime import io import sys import time import shlex import shutil import subprocess +import typing + import pandas as pd import darshan import darshan.backend.cffi_backend as darshanll from rich import print from packaging import version -from drishti.includes.module import * +from includes.module import * +import includes.module as module +from includes.parser import args + +from pprint import pprint def is_available(name): @@ -70,38 +79,321 @@ def check_log_version(console, file, log_version, library_version): return use_file -def handler(): - console = init_console() +@dataclass +class TimestampPair: + start: datetime.date + end: datetime.date + + +@dataclass +class DarshanTrace: + # Trace metadata + path: str + jobid: str + log_ver: str + time: TimestampPair + exe: str + + # Report + report: darshan.DarshanReport + modules: typing.Iterable[str] + + stdio_df: pd.DataFrame = None + posix_df: pd.DataFrame = None + mpiio_df: pd.DataFrame = None + lustre_df: pd.DataFrame = None + + dxt_posix: pd.DataFrame = None + dxt_mpiio: pd.DataFrame = None + + dxt_posix_read_data: pd.DataFrame = None + dxt_posix_write_data: pd.DataFrame = None + + total_write_size_stdio: int + total_write_size_stdio: int + total_size_stdio: int + + total_write_size_posix: int + total_read_size_posix: int + total_size_posix: int + + total_write_size_mpiio: int + total_read_size_mpiio: int + total_size_mpiio: int + + total_size: int + total_files: int + + total_files_stdio: int = 0 + total_files_posix: int = 0 + total_files_mpiio: int = 0 + + files: dict[str, dict[str, int]] = dataclasses.field(default_factory=dict) + + total_reads: int = 0 + total_writes: int = 0 + total_operations: int = 0 + total_read_size: 
int = 0 + total_written_size: int = 0 + total_size: int = 0 + total_reads_small: int = 0 + total_writes_small: int = 0 + + def __init__(self, trace_path: str, job_information, report: darshan.DarshanReport): + self.path = trace_path + + self.jobid = job_information['jobid'] + self.log_ver = job_information['log_ver'] if 'log_ver' in job_information else job_information['metadata'][ + 'lib_ver'] + self.exe = report.metadata['exe'] + + _start_time = datetime.datetime.fromtimestamp(job_information['start_time_sec'], tz=datetime.timezone.utc) + _end_time = datetime.datetime.fromtimestamp(job_information['end_time_sec'], tz=datetime.timezone.utc) + self.time = TimestampPair(_start_time, _end_time) + + self.modules = report.modules.keys() + + # TODO: Should I search in self.modules or in report.records? + # ! All dfs are being materialised + self.report = report + self.posix_df = report.records['POSIX'].to_df() if 'POSIX' in self.modules else None + self.stdio_df = report.records['STDIO'].to_df() if 'STDIO' in self.modules else None + self.mpiio_df = report.records['MPI-IO'].to_df() if 'MPI-IO' in self.modules else None + + self.lustre_df = report.records['LUSTRE'].to_df() if 'LUSTRE' in self.modules else None + + self.dxt_posix = report.records['DXT_POSIX'].to_df() if 'DXT_POSIX' in self.modules else None + self.dxt_mpiio = report.records['DXT_MPIIO'].to_df() if 'DXT_MPIIO' in self.modules else None + + def generate_dxt_posix_rw_df(self) -> None: + if not args.backtrace: + return + if not self.dxt_posix: + return + if "address_line_mapping" not in self.dxt_posix: + args.backtrace = False + return + + read_id = [] + read_rank = [] + read_length = [] + read_offsets = [] + read_end_time = [] + read_start_time = [] + read_operation = [] + + write_id = [] + write_rank = [] + write_length = [] + write_offsets = [] + write_end_time = [] + write_start_time = [] + write_operation = [] + + for r in zip(self.dxt_posix['rank'], self.dxt_posix['read_segments'], self.dxt_posix['write_segments'], + self.dxt_posix['id']): + if not r[1].empty: + read_id.append([r[3]] * len((r[1]["length"].to_list()))) + read_rank.append([r[0]] * len((r[1]["length"].to_list()))) + read_length.append(r[1]["length"].to_list()) + read_end_time.append(r[1]["end_time"].to_list()) + read_start_time.append(r[1]["start_time"].to_list()) + read_operation.append(["read"] * len((r[1]["length"].to_list()))) + read_offsets.append(r[1]["offset"].to_list()) + + if not r[2].empty: + write_id.append([r[3]] * len((r[2]['length'].to_list()))) + write_rank.append([r[0]] * len((r[2]['length'].to_list()))) + write_length.append(r[2]['length'].to_list()) + write_end_time.append(r[2]['end_time'].to_list()) + write_start_time.append(r[2]['start_time'].to_list()) + write_operation.append(['write'] * len((r[2]['length'].to_list()))) + write_offsets.append(r[2]['offset'].to_list()) + + read_id = [element for nestedlist in read_id for element in nestedlist] + read_rank = [element for nestedlist in read_rank for element in nestedlist] + read_length = [element for nestedlist in read_length for element in nestedlist] + read_offsets = [element for nestedlist in read_offsets for element in nestedlist] + read_end_time = [element for nestedlist in read_end_time for element in nestedlist] + read_operation = [element for nestedlist in read_operation for element in nestedlist] + read_start_time = [element for nestedlist in read_start_time for element in nestedlist] + + write_id = [element for nestedlist in write_id for element in nestedlist] + write_rank = [element 
for nestedlist in write_rank for element in nestedlist] + write_length = [element for nestedlist in write_length for element in nestedlist] + write_offsets = [element for nestedlist in write_offsets for element in nestedlist] + write_end_time = [element for nestedlist in write_end_time for element in nestedlist] + write_operation = [element for nestedlist in write_operation for element in nestedlist] + write_start_time = [element for nestedlist in write_start_time for element in nestedlist] + + self.dxt_posix_read_data = pd.DataFrame( + { + "id": read_id, + "rank": read_rank, + "length": read_length, + "end_time": read_end_time, + "start_time": read_start_time, + "operation": read_operation, + "offsets": read_offsets, + } + ) - insights_start_time = time.time() + self.dxt_posix_write_data = pd.DataFrame( + { + "id": write_id, + "rank": write_rank, + "length": write_length, + "end_time": write_end_time, + "start_time": write_start_time, + "operation": write_operation, + "offsets": write_offsets, + } + ) + + def calculate_insights(self) -> None: + self.total_write_size_stdio = self.stdio_df['counters']['STDIO_BYTES_WRITTEN'].sum() if self.stdio_df else 0 + self.total_read_size_stdio = self.stdio_df['counters']['STDIO_BYTES_READ'].sum() if self.stdio_df else 0 + self.total_size_stdio = self.total_write_size_stdio + self.total_read_size_stdio + + self.total_write_size_posix = self.posix_df['counters']['POSIX_BYTES_WRITTEN'].sum() if self.posix_df else 0 + self.total_read_size_posix = self.posix_df['counters']['POSIX_BYTES_READ'].sum() if self.posix_df else 0 + self.total_size_posix = self.total_write_size_posix + self.total_read_size_posix + + self.total_write_size_mpiio = self.mpiio_df['counters']['MPIIO_BYTES_WRITTEN'].sum() if self.mpiio_df else 0 + self.total_read_size_mpiio = self.mpiio_df['counters']['MPIIO_BYTES_READ'].sum() if self.mpiio_df else 0 + self.total_size_mpiio = self.total_write_size_mpiio + self.total_read_size_mpiio + + # POSIX will capture POSIX-only and MPI-IO + if self.total_size_posix > 0 and self.total_size_posix >= self.total_size_mpiio: + self.total_size_posix -= self.total_size_mpiio + + self.total_size = self.total_size_stdio + self.total_size_posix + self.total_size_mpiio + + assert (self.total_size_stdio >= 0) + assert (self.total_size_posix >= 0) + assert (self.total_size_mpiio >= 0) + + def files_stuff(self) -> None: + file_map = self.report.name_records + + self.total_files = len(file_map) + + # files = dict() + + for id, path in file_map.items(): + uses_stdio = len( + self.stdio_df['counters'][self.stdio_df['counters']['id'] == id]) > 0 if self.stdio_df else 0 + uses_posix = len( + self.posix_df['counters'][self.posix_df['counters']['id'] == id]) > 0 if self.posix_df else 0 + uses_mpiio = len( + self.mpiio_df['counters'][self.mpiio_df['counters']['id'] == id]) > 0 if self.mpiio_df else 0 + + self.total_files_stdio += uses_stdio + self.total_files_posix += uses_posix + self.total_files_mpiio += uses_mpiio + + self.files[id] = { + 'path': path, + 'stdio': uses_stdio, + 'posix': uses_posix, + 'mpiio': uses_mpiio + } + + def check_stdio(self) -> None: + module.check_stdio(self.total_size, self.total_size_stdio) + + def check_mpiio(self) -> None: + module.check_mpiio(self.modules) + + def something(self) -> None: + if not self.posix_df: + return + + self.total_reads = self.posix_df['counters']['POSIX_READS'].sum() + self.total_writes = self.posix_df['counters']['POSIX_WRITES'].sum() + self.total_operations = self.total_writes + self.total_reads + + 
module.check_operation_intensive(self.total_operations, self.total_reads, self.total_writes) + + self.total_read_size = self.posix_df['counters']['POSIX_BYTES_READ'].sum() + self.total_written_size = self.posix_df['counters']['POSIX_BYTES_WRITTEN'].sum() + self.total_size = self.total_written_size + self.total_read_size + + module.check_size_intensive(self.total_size, self.total_read_size, self.total_written_size) + + self.total_reads_small = ( + self.posix_df['counters']['POSIX_SIZE_READ_0_100'].sum() + + self.posix_df['counters']['POSIX_SIZE_READ_100_1K'].sum() + + self.posix_df['counters']['POSIX_SIZE_READ_1K_10K'].sum() + + self.posix_df['counters']['POSIX_SIZE_READ_10K_100K'].sum() + + self.posix_df['counters']['POSIX_SIZE_READ_100K_1M'].sum() + ) + self.total_writes_small = ( + self.posix_df['counters']['POSIX_SIZE_WRITE_0_100'].sum() + + self.posix_df['counters']['POSIX_SIZE_WRITE_100_1K'].sum() + + self.posix_df['counters']['POSIX_SIZE_WRITE_1K_10K'].sum() + + self.posix_df['counters']['POSIX_SIZE_WRITE_10K_100K'].sum() + + self.posix_df['counters']['POSIX_SIZE_WRITE_100K_1M'].sum() + ) + def something2(self): + detected_files = pd.DataFrame(self.posix_df['counters'].groupby('id')[['INSIGHTS_POSIX_SMALL_READ', + 'INSIGHTS_POSIX_SMALL_WRITE']].sum()).reset_index() + detected_files.columns = ['id', 'total_reads', 'total_writes'] + detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) + + file_map = self.report.name_records + module.check_small_operation(self.total_reads, self.total_reads_small, self.total_writes, self.total_writes_small, + detected_files, + self.modules, file_map, self.dxt_posix, self.dxt_posix_read_data, + self.dxt_posix_write_data) + + +def file_reader(trace_path: str): log = darshanll.log_open(args.log_path) modules = darshanll.log_get_modules(log) information = darshanll.log_get_job(log) - if 'log_ver' in information: - log_version = information['log_ver'] - else: - log_version = information['metadata']['lib_ver'] + +def log_relation_check(): + # TODO: Ensure that all logs are from a single job, generated at the same time, from the same executable and using the same library version + pass + + +def handler(): + console = init_console() + + insights_start_time = time.time() + + # TODO: Break here for new fn + + trace_path = args.log_paths[0] # TODO: A single file rn + + darshan.enable_experimental() library_version = darshanll.get_lib_version() - # Make sure log format is of the same version - filename = args.log_path - # check_log_version(console, args.log_path, log_version, library_version) - + # TODO: Can this be put in a with block? + log = darshanll.log_open(trace_path) + information = darshanll.log_get_job(log) darshanll.log_close(log) - darshan.enable_experimental() + report = darshan.DarshanReport(trace_path) + current_trace = DarshanTrace(trace_path, information, report) # WIP: Implement this constructor + # - report = darshan.DarshanReport(filename) + # TODO: What to do here? 
+ # # Make sure log format is of the same version + # filename = args.log_path + # # check_log_version(console, args.log_path, log_version, library_version) + # - job = report.metadata + # TODO: Break here ######################################################################################################################################################################### - # Check usage of STDIO, POSIX, and MPI-IO per file + # TODO: Check usage of STDIO, POSIX, and MPI-IO per file if 'STDIO' in report.records: df_stdio = report.records['STDIO'].to_df() @@ -110,7 +402,7 @@ def handler(): total_write_size_stdio = df_stdio['counters']['STDIO_BYTES_WRITTEN'].sum() total_read_size_stdio = df_stdio['counters']['STDIO_BYTES_READ'].sum() - total_size_stdio = total_write_size_stdio + total_read_size_stdio + total_size_stdio = total_write_size_stdio + total_read_size_stdio else: total_size_stdio = 0 else: @@ -140,117 +432,28 @@ def handler(): total_write_size_mpiio = df_mpiio['counters']['MPIIO_BYTES_WRITTEN'].sum() total_read_size_mpiio = df_mpiio['counters']['MPIIO_BYTES_READ'].sum() - total_size_mpiio = total_write_size_mpiio + total_read_size_mpiio + total_size_mpiio = total_write_size_mpiio + total_read_size_mpiio else: total_size_mpiio = 0 else: df_mpiio = None total_size_mpiio = 0 - + dxt_posix = None dxt_posix_read_data = None dxt_posix_write_data = None dxt_mpiio = None - df_lustre = None - if "LUSTRE" in report.records: - df_lustre = report.records['LUSTRE'].to_df() - - if args.backtrace: - if "DXT_POSIX" in report.records: - dxt_posix = report.records["DXT_POSIX"].to_df() - dxt_posix = pd.DataFrame(dxt_posix) - if "address_line_mapping" not in dxt_posix: - args.backtrace = False - else: - read_id = [] - read_rank = [] - read_length = [] - read_offsets = [] - read_end_time = [] - read_start_time = [] - read_operation = [] - - write_id = [] - write_rank = [] - write_length = [] - write_offsets = [] - write_end_time = [] - write_start_time = [] - write_operation = [] - - for r in zip(dxt_posix['rank'], dxt_posix['read_segments'], dxt_posix['write_segments'], dxt_posix['id']): - if not r[1].empty: - read_id.append([r[3]] * len((r[1]['length'].to_list()))) - read_rank.append([r[0]] * len((r[1]['length'].to_list()))) - read_length.append(r[1]['length'].to_list()) - read_end_time.append(r[1]['end_time'].to_list()) - read_start_time.append(r[1]['start_time'].to_list()) - read_operation.append(['read'] * len((r[1]['length'].to_list()))) - read_offsets.append(r[1]['offset'].to_list()) - - if not r[2].empty: - write_id.append([r[3]] * len((r[2]['length'].to_list()))) - write_rank.append([r[0]] * len((r[2]['length'].to_list()))) - write_length.append(r[2]['length'].to_list()) - write_end_time.append(r[2]['end_time'].to_list()) - write_start_time.append(r[2]['start_time'].to_list()) - write_operation.append(['write'] * len((r[2]['length'].to_list()))) - write_offsets.append(r[2]['offset'].to_list()) - - read_id = [element for nestedlist in read_id for element in nestedlist] - read_rank = [element for nestedlist in read_rank for element in nestedlist] - read_length = [element for nestedlist in read_length for element in nestedlist] - read_offsets = [element for nestedlist in read_offsets for element in nestedlist] - read_end_time = [element for nestedlist in read_end_time for element in nestedlist] - read_operation = [element for nestedlist in read_operation for element in nestedlist] - read_start_time = [element for nestedlist in read_start_time for element in nestedlist] - - write_id = [element for 
nestedlist in write_id for element in nestedlist] - write_rank = [element for nestedlist in write_rank for element in nestedlist] - write_length = [element for nestedlist in write_length for element in nestedlist] - write_offsets = [element for nestedlist in write_offsets for element in nestedlist] - write_end_time = [element for nestedlist in write_end_time for element in nestedlist] - write_operation = [element for nestedlist in write_operation for element in nestedlist] - write_start_time = [element for nestedlist in write_start_time for element in nestedlist] - - dxt_posix_read_data = pd.DataFrame( - { - 'id': read_id, - 'rank': read_rank, - 'length': read_length, - 'end_time': read_end_time, - 'start_time': read_start_time, - 'operation': read_operation, - 'offsets': read_offsets, - }) - - dxt_posix_write_data = pd.DataFrame( - { - 'id': write_id, - 'rank': write_rank, - 'length': write_length, - 'end_time': write_end_time, - 'start_time': write_start_time, - 'operation': write_operation, - 'offsets': write_offsets, - }) - - if "DXT_MPIIO" in report.records: - dxt_mpiio = report.records["DXT_MPIIO"].to_df() - dxt_mpiio = pd.DataFrame(dxt_mpiio) - - # Since POSIX will capture both POSIX-only accesses and those comming from MPI-IO, we can subtract those if total_size_posix > 0 and total_size_posix >= total_size_mpiio: total_size_posix -= total_size_mpiio total_size = total_size_stdio + total_size_posix + total_size_mpiio - assert(total_size_stdio >= 0) - assert(total_size_posix >= 0) - assert(total_size_mpiio >= 0) + assert (total_size_stdio >= 0) + assert (total_size_posix >= 0) + assert (total_size_mpiio >= 0) files = {} @@ -268,7 +471,7 @@ def handler(): uses_stdio = len(df_stdio['counters'][(df_stdio['counters']['id'] == id)]) > 0 else: uses_stdio = 0 - + if df_posix: uses_posix = len(df_posix['counters'][(df_posix['counters']['id'] == id)]) > 0 else: @@ -305,7 +508,7 @@ def handler(): total_writes = df['counters']['POSIX_WRITES'].sum() # Get total number of I/O operations - total_operations = total_writes + total_reads + total_operations = total_writes + total_reads # To check whether the application is write-intersive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance check_operation_intensive(total_operations, total_reads, total_writes) @@ -321,45 +524,47 @@ def handler(): # Get the number of small I/O operations (less than 1 MB) total_reads_small = ( - df['counters']['POSIX_SIZE_READ_0_100'].sum() + - df['counters']['POSIX_SIZE_READ_100_1K'].sum() + - df['counters']['POSIX_SIZE_READ_1K_10K'].sum() + - df['counters']['POSIX_SIZE_READ_10K_100K'].sum() + - df['counters']['POSIX_SIZE_READ_100K_1M'].sum() + df['counters']['POSIX_SIZE_READ_0_100'].sum() + + df['counters']['POSIX_SIZE_READ_100_1K'].sum() + + df['counters']['POSIX_SIZE_READ_1K_10K'].sum() + + df['counters']['POSIX_SIZE_READ_10K_100K'].sum() + + df['counters']['POSIX_SIZE_READ_100K_1M'].sum() ) total_writes_small = ( - df['counters']['POSIX_SIZE_WRITE_0_100'].sum() + - df['counters']['POSIX_SIZE_WRITE_100_1K'].sum() + - df['counters']['POSIX_SIZE_WRITE_1K_10K'].sum() + - df['counters']['POSIX_SIZE_WRITE_10K_100K'].sum() + - df['counters']['POSIX_SIZE_WRITE_100K_1M'].sum() + df['counters']['POSIX_SIZE_WRITE_0_100'].sum() + + df['counters']['POSIX_SIZE_WRITE_100_1K'].sum() + + df['counters']['POSIX_SIZE_WRITE_1K_10K'].sum() + + df['counters']['POSIX_SIZE_WRITE_10K_100K'].sum() + + 
df['counters']['POSIX_SIZE_WRITE_100K_1M'].sum() ) # Get the files responsible for more than half of these accesses files = [] df['counters']['INSIGHTS_POSIX_SMALL_READ'] = ( - df['counters']['POSIX_SIZE_READ_0_100'] + - df['counters']['POSIX_SIZE_READ_100_1K'] + - df['counters']['POSIX_SIZE_READ_1K_10K'] + - df['counters']['POSIX_SIZE_READ_10K_100K'] + - df['counters']['POSIX_SIZE_READ_100K_1M'] + df['counters']['POSIX_SIZE_READ_0_100'] + + df['counters']['POSIX_SIZE_READ_100_1K'] + + df['counters']['POSIX_SIZE_READ_1K_10K'] + + df['counters']['POSIX_SIZE_READ_10K_100K'] + + df['counters']['POSIX_SIZE_READ_100K_1M'] ) df['counters']['INSIGHTS_POSIX_SMALL_WRITE'] = ( - df['counters']['POSIX_SIZE_WRITE_0_100'] + - df['counters']['POSIX_SIZE_WRITE_100_1K'] + - df['counters']['POSIX_SIZE_WRITE_1K_10K'] + - df['counters']['POSIX_SIZE_WRITE_10K_100K'] + - df['counters']['POSIX_SIZE_WRITE_100K_1M'] + df['counters']['POSIX_SIZE_WRITE_0_100'] + + df['counters']['POSIX_SIZE_WRITE_100_1K'] + + df['counters']['POSIX_SIZE_WRITE_1K_10K'] + + df['counters']['POSIX_SIZE_WRITE_10K_100K'] + + df['counters']['POSIX_SIZE_WRITE_100K_1M'] ) - detected_files = pd.DataFrame(df['counters'].groupby('id')[['INSIGHTS_POSIX_SMALL_READ', 'INSIGHTS_POSIX_SMALL_WRITE']].sum()).reset_index() + detected_files = pd.DataFrame(df['counters'].groupby('id')[['INSIGHTS_POSIX_SMALL_READ', + 'INSIGHTS_POSIX_SMALL_WRITE']].sum()).reset_index() detected_files.columns = ['id', 'total_reads', 'total_writes'] detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) - check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, + file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) ######################################################################################################################################################################### @@ -368,7 +573,8 @@ def handler(): total_mem_not_aligned = df['counters']['POSIX_MEM_NOT_ALIGNED'].sum() total_file_not_aligned = df['counters']['POSIX_FILE_NOT_ALIGNED'].sum() - check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, dxt_posix, dxt_posix_read_data) + check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, + dxt_posix, dxt_posix_read_data) ######################################################################################################################################################################### @@ -377,7 +583,8 @@ def handler(): max_read_offset = df['counters']['POSIX_MAX_BYTE_READ'].max() max_write_offset = df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() - check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, + dxt_posix_read_data, dxt_posix_write_data) ######################################################################################################################################################################### @@ -393,7 +600,6 @@ def handler(): read_random = total_reads - read_consecutive - read_sequential #print('READ Random: {} ({:.2f}%)'.format(read_random, read_random / total_reads * 100)) - 
write_consecutive = df['counters']['POSIX_CONSEC_WRITES'].sum() write_sequential = df['counters']['POSIX_SEQ_WRITES'].sum() @@ -402,7 +608,9 @@ def handler(): write_random = total_writes - write_consecutive - write_sequential #print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100)) - check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, + write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, + dxt_posix_write_data) ######################################################################################################################################################################### @@ -415,44 +623,45 @@ def handler(): if not shared_files.empty: total_shared_reads = shared_files['POSIX_READS'].sum() total_shared_reads_small = ( - shared_files['POSIX_SIZE_READ_0_100'].sum() + - shared_files['POSIX_SIZE_READ_100_1K'].sum() + - shared_files['POSIX_SIZE_READ_1K_10K'].sum() + - shared_files['POSIX_SIZE_READ_10K_100K'].sum() + - shared_files['POSIX_SIZE_READ_100K_1M'].sum() + shared_files['POSIX_SIZE_READ_0_100'].sum() + + shared_files['POSIX_SIZE_READ_100_1K'].sum() + + shared_files['POSIX_SIZE_READ_1K_10K'].sum() + + shared_files['POSIX_SIZE_READ_10K_100K'].sum() + + shared_files['POSIX_SIZE_READ_100K_1M'].sum() ) shared_files['INSIGHTS_POSIX_SMALL_READS'] = ( - shared_files['POSIX_SIZE_READ_0_100'] + - shared_files['POSIX_SIZE_READ_100_1K'] + - shared_files['POSIX_SIZE_READ_1K_10K'] + - shared_files['POSIX_SIZE_READ_10K_100K'] + - shared_files['POSIX_SIZE_READ_100K_1M'] + shared_files['POSIX_SIZE_READ_0_100'] + + shared_files['POSIX_SIZE_READ_100_1K'] + + shared_files['POSIX_SIZE_READ_1K_10K'] + + shared_files['POSIX_SIZE_READ_10K_100K'] + + shared_files['POSIX_SIZE_READ_100K_1M'] ) - total_shared_writes = shared_files['POSIX_WRITES'].sum() total_shared_writes_small = ( - shared_files['POSIX_SIZE_WRITE_0_100'].sum() + - shared_files['POSIX_SIZE_WRITE_100_1K'].sum() + - shared_files['POSIX_SIZE_WRITE_1K_10K'].sum() + - shared_files['POSIX_SIZE_WRITE_10K_100K'].sum() + - shared_files['POSIX_SIZE_WRITE_100K_1M'].sum() + shared_files['POSIX_SIZE_WRITE_0_100'].sum() + + shared_files['POSIX_SIZE_WRITE_100_1K'].sum() + + shared_files['POSIX_SIZE_WRITE_1K_10K'].sum() + + shared_files['POSIX_SIZE_WRITE_10K_100K'].sum() + + shared_files['POSIX_SIZE_WRITE_100K_1M'].sum() ) shared_files['INSIGHTS_POSIX_SMALL_WRITES'] = ( - shared_files['POSIX_SIZE_WRITE_0_100'] + - shared_files['POSIX_SIZE_WRITE_100_1K'] + - shared_files['POSIX_SIZE_WRITE_1K_10K'] + - shared_files['POSIX_SIZE_WRITE_10K_100K'] + - shared_files['POSIX_SIZE_WRITE_100K_1M'] + shared_files['POSIX_SIZE_WRITE_0_100'] + + shared_files['POSIX_SIZE_WRITE_100_1K'] + + shared_files['POSIX_SIZE_WRITE_1K_10K'] + + shared_files['POSIX_SIZE_WRITE_10K_100K'] + + shared_files['POSIX_SIZE_WRITE_100K_1M'] ) - check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, total_shared_writes_small, shared_files, file_map) + check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, + total_shared_writes_small, shared_files, file_map) 
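The shared-file check above follows the same pattern as the per-file small-request check: sum the sub-1 MB histogram bins and compare their share of all requests against a configured threshold. A simplified stand-alone sketch of that comparison; the 10% ratio below is a placeholder, the real value comes from Drishti's thresholds configuration:

# Sketch only: SMALL_REQUEST_RATIO is a placeholder, not the configured Drishti threshold.
SMALL_REQUEST_RATIO = 0.10

def small_requests_dominate(total_ops: int, small_ops: int, ratio: float = SMALL_REQUEST_RATIO) -> bool:
    """True when small (< 1 MB) requests account for more than `ratio` of all requests."""
    return total_ops > 0 and small_ops / total_ops > ratio

# Made-up example: 4,000 of 10,000 shared reads were smaller than 1 MB.
print(small_requests_dominate(10_000, 4_000))  # True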
######################################################################################################################################################################### - count_long_metadata = len(df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])]) + count_long_metadata = len( + df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])]) check_long_metadata(count_long_metadata, modules) @@ -473,16 +682,20 @@ def handler(): for index, row in shared_files.iterrows(): total_transfer_size = row['POSIX_BYTES_WRITTEN'] + row['POSIX_BYTES_READ'] - if total_transfer_size and abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > thresholds['imbalance_stragglers'][0]: + if total_transfer_size and abs( + row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > \ + thresholds['imbalance_stragglers'][0]: stragglers_count += 1 detected_files.append([ - row['id'], abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size * 100 + row['id'], + abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size * 100 ]) column_names = ['id', 'data_imbalance'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) + check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, + dxt_posix_write_data) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME @@ -501,11 +714,14 @@ def handler(): for index, row in shared_files_times.iterrows(): total_transfer_time = row['POSIX_F_WRITE_TIME'] + row['POSIX_F_READ_TIME'] + row['POSIX_F_META_TIME'] - if total_transfer_time and abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > thresholds['imbalance_stragglers'][0]: + if total_transfer_time and abs( + row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > \ + thresholds['imbalance_stragglers'][0]: stragglers_count += 1 detected_files.append([ - row['id'], abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time * 100 + row['id'], + abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time * 100 ]) column_names = ['id', 'time_imbalance'] @@ -530,11 +746,13 @@ def handler(): detected_files = [] for index, row in aggregated.iterrows(): - if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row['POSIX_BYTES_WRITTEN_max'] > thresholds['imbalance_size'][0]: + if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / \ + row['POSIX_BYTES_WRITTEN_max'] > thresholds['imbalance_size'][0]: imbalance_count += 1 detected_files.append([ - row['id'], abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row['POSIX_BYTES_WRITTEN_max'] * 100 + row['id'], abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row[ + 'POSIX_BYTES_WRITTEN_max'] * 100 ]) column_names = ['id', 'write_imbalance'] @@ -546,11 +764,13 @@ def handler(): detected_files = [] for index, row in aggregated.iterrows(): - if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] > thresholds['imbalance_size'][0]: + if 
row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row[ + 'POSIX_BYTES_READ_max'] > thresholds['imbalance_size'][0]: imbalance_count += 1 detected_files.append([ - row['id'], abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] * 100 + row['id'], + abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] * 100 ]) column_names = ['id', 'read_imbalance'] @@ -570,52 +790,62 @@ def handler(): df_mpiio_collective_reads = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_READS'] > 0)] - total_mpiio_read_operations = df_mpiio['counters']['MPIIO_INDEP_READS'].sum() + df_mpiio['counters']['MPIIO_COLL_READS'].sum() + total_mpiio_read_operations = df_mpiio['counters']['MPIIO_INDEP_READS'].sum() + df_mpiio['counters'][ + 'MPIIO_COLL_READS'].sum() mpiio_coll_reads = df_mpiio['counters']['MPIIO_COLL_READS'].sum() mpiio_indep_reads = df_mpiio['counters']['MPIIO_INDEP_READS'].sum() detected_files = [] - if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > thresholds['collective_operations_absolute'][0]: + if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > \ + thresholds['collective_operations_absolute'][0]: files = pd.DataFrame(df_mpiio_collective_reads.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_reads.iterrows(): if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and - row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations'][0] and - (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations_absolute'][0]): - + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + thresholds['collective_operations_absolute'][0]): detected_files.append([ - row['id'], row['MPIIO_INDEP_READS'], row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 + row['id'], row['MPIIO_INDEP_READS'], + row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 ]) - + column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, detected_files, file_map, dxt_mpiio) + check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, + detected_files, file_map, dxt_mpiio) df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] - total_mpiio_write_operations = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() + df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() + total_mpiio_write_operations = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() + df_mpiio['counters'][ + 'MPIIO_COLL_WRITES'].sum() mpiio_coll_writes = df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() mpiio_indep_writes = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() detected_files = [] - if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > thresholds['collective_operations_absolute'][0]: + if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > \ + thresholds['collective_operations_absolute'][0]: files = 
pd.DataFrame(df_mpiio_collective_writes.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_writes.iterrows(): - if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and - row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations'][0] and - (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > thresholds['collective_operations_absolute'][0]): - + if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + thresholds['collective_operations'][0] and + (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > + thresholds['collective_operations_absolute'][0]): detected_files.append([ - row['id'], row['MPIIO_INDEP_WRITES'], row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 + row['id'], row['MPIIO_INDEP_WRITES'], + row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 ]) column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] detected_files = pd.DataFrame(detected_files, columns=column_names) - check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, detected_files, file_map, dxt_mpiio) + check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, + detected_files, file_map, dxt_mpiio) ######################################################################################################################################################################### @@ -656,7 +886,7 @@ def handler(): for hint in hints: if hint != 'no': (key, value) = hint.split('=') - + if key == 'cb_nodes': cb_nodes = value @@ -686,7 +916,7 @@ def handler(): pass except FileNotFoundError: pass - + ######################################################################################################################################################################### insights_end_time = time.time() @@ -721,7 +951,8 @@ def handler(): ' [b]FILES[/b]: [white]{} files ({} use STDIO, {} use POSIX, {} use MPI-IO)[/white]'.format( total_files, total_files_stdio, - total_files_posix - total_files_mpiio, # Since MPI-IO files will always use POSIX, we can decrement to get a unique count + total_files_posix - total_files_mpiio, + # Since MPI-IO files will always use POSIX, we can decrement to get a unique count total_files_mpiio ), ' [b]COMPUTE NODES[/b] [white]{}[/white]'.format( diff --git a/drishti/handlers/handle_recorder.py b/drishti/handlers/handle_recorder.py index afccfce..7719aa9 100644 --- a/drishti/handlers/handle_recorder.py +++ b/drishti/handlers/handle_recorder.py @@ -7,7 +7,7 @@ from recorder_utils import RecorderReader from recorder_utils.build_offset_intervals import build_offset_intervals -from drishti.includes.module import * +from includes.module import * def get_accessed_files(reader): diff --git a/drishti/includes/config.py b/drishti/includes/config.py index 15097fd..f93146b 100644 --- a/drishti/includes/config.py +++ b/drishti/includes/config.py @@ -9,7 +9,7 @@ from rich.terminal_theme import TerminalTheme from rich.terminal_theme import MONOKAI -from drishti.includes.parser import * +from includes.parser import * RECOMMENDATIONS = 0 diff --git a/drishti/includes/module.py b/drishti/includes/module.py index 9c2df16..538c399 100644 --- a/drishti/includes/module.py +++ b/drishti/includes/module.py @@ -6,7 +6,7 @@ import pandas as pd from rich import box from 
rich.syntax import Syntax -from drishti.includes.config import * +from includes.config import * ''' Before calling the functions below diff --git a/drishti/reporter.py b/drishti/reporter.py index 426d80c..c22487b 100644 --- a/drishti/reporter.py +++ b/drishti/reporter.py @@ -3,7 +3,11 @@ import os import sys from subprocess import call -from drishti.includes.parser import * + +# from includes.parser import * # imports {'parser', 'args', 'argparse'} # TODO: Is next line enuf +from includes.parser import args + + ''' |- handler_darshan -| @@ -68,10 +72,10 @@ def main(): log_type = check_log_type(args.log_paths) if log_type == LOG_TYPE_DARSHAN: - from drishti.handlers.handle_darshan import handler + from handlers.handle_darshan import handler elif log_type == LOG_TYPE_RECORDER: - from drishti.handlers.handle_recorder import handler + from handlers.handle_recorder import handler handler() From 9d2fd1ffdcbc4fe663b2a81cec598ccc87bc5f89 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Fri, 2 Aug 2024 18:19:45 +0530 Subject: [PATCH 17/19] feat: Implement DarshanTrace dataclass # Conflicts: # drishti/handlers/handle_darshan.py --- drishti/handlers/handle_darshan.py | 840 +++++++++++++---------------- 1 file changed, 386 insertions(+), 454 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index fc4a673..72e8558 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -1,28 +1,27 @@ #!/usr/bin/env python3 -import collections import dataclasses -from dataclasses import dataclass -import datetime import io -import sys -import time import shlex import shutil import subprocess +import os +import sys import typing +from dataclasses import dataclass +from typing import Optional -import pandas as pd import darshan +import pandas as pd +import datetime import darshan.backend.cffi_backend as darshanll - -from rich import print from packaging import version -from includes.module import * +from rich import print + +import includes.config as config import includes.module as module +from includes.module import * from includes.parser import args -from pprint import pprint - def is_available(name): """Check whether `name` is on PATH and marked as executable.""" @@ -98,17 +97,7 @@ class DarshanTrace: report: darshan.DarshanReport modules: typing.Iterable[str] - stdio_df: pd.DataFrame = None - posix_df: pd.DataFrame = None - mpiio_df: pd.DataFrame = None - lustre_df: pd.DataFrame = None - - dxt_posix: pd.DataFrame = None - dxt_mpiio: pd.DataFrame = None - - dxt_posix_read_data: pd.DataFrame = None - dxt_posix_write_data: pd.DataFrame = None - + ### total_write_size_stdio: int total_write_size_stdio: int total_size_stdio: int @@ -123,6 +112,21 @@ class DarshanTrace: total_size: int total_files: int + ### + max_read_offset: int + max_write_offset: int + ### + + stdio_df: pd.DataFrame = None + posix_df: pd.DataFrame = None + mpiio_df: pd.DataFrame = None + lustre_df: pd.DataFrame = None + + dxt_posix: pd.DataFrame = None + dxt_mpiio: pd.DataFrame = None + + dxt_posix_read_data: pd.DataFrame = None + dxt_posix_write_data: pd.DataFrame = None total_files_stdio: int = 0 total_files_posix: int = 0 @@ -135,10 +139,46 @@ class DarshanTrace: total_operations: int = 0 total_read_size: int = 0 total_written_size: int = 0 - total_size: int = 0 + total_posix_size: int = 0 total_reads_small: int = 0 total_writes_small: int = 0 + total_mem_not_aligned: int = 0 + total_file_not_aligned: int = 0 + + read_consecutive: int = 0 + read_sequential: int = 0 + 
read_random: int = 0 + write_consecutive: int = 0 + write_sequential: int = 0 + write_random: int = 0 + + shared_files: pd.DataFrame = None + total_shared_reads: int = 0 + total_shared_reads_small: int = 0 + total_shared_writes: int = 0 + total_shared_writes_small: int = 0 + + count_long_metadata: int = 0 + + posix_shared_data_imbalance_stragglers_count: int = 0 + + # 2 functions (unsure ones) + + has_hdf5_extension: bool = False + + mpiio_nb_reads: int = 0 + mpiio_nb_writes: int = 0 + + cb_nodes: Optional[int] = None + number_of_compute_nodes: int = 0 + hints: list[str] = dataclasses.field(default_factory=list) + + job_start: Optional[datetime.datetime] = None + job_end: Optional[datetime.datetime] = None + + aggregated: pd.DataFrame = None + def __init__(self, trace_path: str, job_information, report: darshan.DarshanReport): self.path = trace_path @@ -165,6 +205,9 @@ def __init__(self, trace_path: str, job_information, report: darshan.DarshanRepo self.dxt_posix = report.records['DXT_POSIX'].to_df() if 'DXT_POSIX' in self.modules else None self.dxt_mpiio = report.records['DXT_MPIIO'].to_df() if 'DXT_MPIIO' in self.modules else None + self.hints = [] + self.files = {} + def generate_dxt_posix_rw_df(self) -> None: if not args.backtrace: return @@ -267,20 +310,20 @@ def calculate_insights(self) -> None: if self.total_size_posix > 0 and self.total_size_posix >= self.total_size_mpiio: self.total_size_posix -= self.total_size_mpiio - self.total_size = self.total_size_stdio + self.total_size_posix + self.total_size_mpiio + self.total_posix_size = self.total_size_stdio + self.total_size_posix + self.total_size_mpiio assert (self.total_size_stdio >= 0) assert (self.total_size_posix >= 0) assert (self.total_size_mpiio >= 0) def files_stuff(self) -> None: - file_map = self.report.name_records + self.report.name_records = self.report.name_records - self.total_files = len(file_map) + self.total_files = len(self.report.name_records) # files = dict() - for id, path in file_map.items(): + for id, path in self.report.name_records.items(): uses_stdio = len( self.stdio_df['counters'][self.stdio_df['counters']['id'] == id]) > 0 if self.stdio_df else 0 uses_posix = len( @@ -300,7 +343,7 @@ def files_stuff(self) -> None: } def check_stdio(self) -> None: - module.check_stdio(self.total_size, self.total_size_stdio) + module.check_stdio(self.total_posix_size, self.total_size_stdio) def check_mpiio(self) -> None: module.check_mpiio(self.modules) @@ -312,15 +355,15 @@ def something(self) -> None: self.total_reads = self.posix_df['counters']['POSIX_READS'].sum() self.total_writes = self.posix_df['counters']['POSIX_WRITES'].sum() self.total_operations = self.total_writes + self.total_reads - + # ---------------------------------------------------------------------------------------------------------------------- module.check_operation_intensive(self.total_operations, self.total_reads, self.total_writes) - self.total_read_size = self.posix_df['counters']['POSIX_BYTES_READ'].sum() - self.total_written_size = self.posix_df['counters']['POSIX_BYTES_WRITTEN'].sum() - self.total_size = self.total_written_size + self.total_read_size - - module.check_size_intensive(self.total_size, self.total_read_size, self.total_written_size) + total_read_size = self.posix_df['counters']['POSIX_BYTES_READ'].sum() + total_written_size = self.posix_df['counters']['POSIX_BYTES_WRITTEN'].sum() + total_size = total_written_size + total_read_size + module.check_size_intensive(total_size, total_read_size, total_written_size) + # ----- 
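The operation-intensity check called a few lines above follows the 10% balance rule described in the original comment, and check_size_intensive presumably applies the same idea to bytes read versus bytes written. A simplified illustration of that rule, not the code in includes/module.py, whose margin is configurable:

# Simplified illustration of the read/write balance heuristic.
def classify_operations(total_ops: int, reads: int, writes: int, margin: float = 0.10) -> str:
    if total_ops == 0:
        return 'no I/O'
    if (reads - writes) / total_ops > margin:
        return 'read-intensive'
    if (writes - reads) / total_ops > margin:
        return 'write-intensive'
    return 'balanced'

print(classify_operations(10_000, 7_000, 3_000))  # read-intensive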
self.total_reads_small = ( self.posix_df['counters']['POSIX_SIZE_READ_0_100'].sum() + self.posix_df['counters']['POSIX_SIZE_READ_100_1K'].sum() + @@ -336,378 +379,195 @@ def something(self) -> None: self.posix_df['counters']['POSIX_SIZE_WRITE_100K_1M'].sum() ) - def something2(self): - detected_files = pd.DataFrame(self.posix_df['counters'].groupby('id')[['INSIGHTS_POSIX_SMALL_READ', - 'INSIGHTS_POSIX_SMALL_WRITE']].sum()).reset_index() - detected_files.columns = ['id', 'total_reads', 'total_writes'] - detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) - - file_map = self.report.name_records - module.check_small_operation(self.total_reads, self.total_reads_small, self.total_writes, self.total_writes_small, - detected_files, - self.modules, file_map, self.dxt_posix, self.dxt_posix_read_data, - self.dxt_posix_write_data) - - -def file_reader(trace_path: str): - log = darshanll.log_open(args.log_path) - - modules = darshanll.log_get_modules(log) - - information = darshanll.log_get_job(log) - - -def log_relation_check(): - # TODO: Ensure that all logs are from a single job, generated at the same time, from the same executable and using the same library version - pass - - -def handler(): - console = init_console() - - insights_start_time = time.time() - - # TODO: Break here for new fn - - trace_path = args.log_paths[0] # TODO: A single file rn - - darshan.enable_experimental() - library_version = darshanll.get_lib_version() - - # TODO: Can this be put in a with block? - log = darshanll.log_open(trace_path) - information = darshanll.log_get_job(log) - darshanll.log_close(log) - - report = darshan.DarshanReport(trace_path) - current_trace = DarshanTrace(trace_path, information, report) # WIP: Implement this constructor - # - - # TODO: What to do here? 
- # # Make sure log format is of the same version - # filename = args.log_path - # # check_log_version(console, args.log_path, log_version, library_version) - # - - # TODO: Break here - - ######################################################################################################################################################################### - - # TODO: Check usage of STDIO, POSIX, and MPI-IO per file - - if 'STDIO' in report.records: - df_stdio = report.records['STDIO'].to_df() - - if df_stdio: - total_write_size_stdio = df_stdio['counters']['STDIO_BYTES_WRITTEN'].sum() - total_read_size_stdio = df_stdio['counters']['STDIO_BYTES_READ'].sum() - - total_size_stdio = total_write_size_stdio + total_read_size_stdio - else: - total_size_stdio = 0 - else: - df_stdio = None - - total_size_stdio = 0 - - if 'POSIX' in report.records: - df_posix = report.records['POSIX'].to_df() - - if df_posix: - total_write_size_posix = df_posix['counters']['POSIX_BYTES_WRITTEN'].sum() - total_read_size_posix = df_posix['counters']['POSIX_BYTES_READ'].sum() - - total_size_posix = total_write_size_posix + total_read_size_posix - else: - total_size_posix = 0 - else: - df_posix = None - - total_size_posix = 0 - - if 'MPI-IO' in report.records: - df_mpiio = report.records['MPI-IO'].to_df() - - if df_mpiio: - total_write_size_mpiio = df_mpiio['counters']['MPIIO_BYTES_WRITTEN'].sum() - total_read_size_mpiio = df_mpiio['counters']['MPIIO_BYTES_READ'].sum() - - total_size_mpiio = total_write_size_mpiio + total_read_size_mpiio - else: - total_size_mpiio = 0 - else: - df_mpiio = None - - total_size_mpiio = 0 - - dxt_posix = None - dxt_posix_read_data = None - dxt_posix_write_data = None - dxt_mpiio = None - - # Since POSIX will capture both POSIX-only accesses and those comming from MPI-IO, we can subtract those - if total_size_posix > 0 and total_size_posix >= total_size_mpiio: - total_size_posix -= total_size_mpiio - - total_size = total_size_stdio + total_size_posix + total_size_mpiio - - assert (total_size_stdio >= 0) - assert (total_size_posix >= 0) - assert (total_size_mpiio >= 0) - - files = {} - - # Check interface usage for each file - file_map = report.name_records - - total_files = len(file_map) - - total_files_stdio = 0 - total_files_posix = 0 - total_files_mpiio = 0 - - for id, path in file_map.items(): - if df_stdio: - uses_stdio = len(df_stdio['counters'][(df_stdio['counters']['id'] == id)]) > 0 - else: - uses_stdio = 0 - - if df_posix: - uses_posix = len(df_posix['counters'][(df_posix['counters']['id'] == id)]) > 0 - else: - uses_posix = 0 - - if df_mpiio: - uses_mpiio = len(df_mpiio['counters'][(df_mpiio['counters']['id'] == id)]) > 0 - else: - uses_mpiio = 0 - - total_files_stdio += uses_stdio - total_files_posix += uses_posix - total_files_mpiio += uses_mpiio - - files[id] = { - 'path': path, - 'stdio': uses_stdio, - 'posix': uses_posix, - 'mpiio': uses_mpiio - } - - check_stdio(total_size, total_size_stdio) - check_mpiio(modules) - - ######################################################################################################################################################################### - - if 'POSIX' in report.records: - df = report.records['POSIX'].to_df() - - ######################################################################################################################################################################### - - # Get number of write/read operations - total_reads = df['counters']['POSIX_READS'].sum() - total_writes = df['counters']['POSIX_WRITES'].sum() - 
- # Get total number of I/O operations - total_operations = total_writes + total_reads - - # To check whether the application is write-intersive or read-intensive we only look at the POSIX level and check if the difference between reads and writes is larger than 10% (for more or less), otherwise we assume a balance - check_operation_intensive(total_operations, total_reads, total_writes) - - total_read_size = df['counters']['POSIX_BYTES_READ'].sum() - total_written_size = df['counters']['POSIX_BYTES_WRITTEN'].sum() - - total_size = total_written_size + total_read_size - - check_size_intensive(total_size, total_read_size, total_written_size) - - ######################################################################################################################################################################### - - # Get the number of small I/O operations (less than 1 MB) - total_reads_small = ( - df['counters']['POSIX_SIZE_READ_0_100'].sum() + - df['counters']['POSIX_SIZE_READ_100_1K'].sum() + - df['counters']['POSIX_SIZE_READ_1K_10K'].sum() + - df['counters']['POSIX_SIZE_READ_10K_100K'].sum() + - df['counters']['POSIX_SIZE_READ_100K_1M'].sum() - ) - - total_writes_small = ( - df['counters']['POSIX_SIZE_WRITE_0_100'].sum() + - df['counters']['POSIX_SIZE_WRITE_100_1K'].sum() + - df['counters']['POSIX_SIZE_WRITE_1K_10K'].sum() + - df['counters']['POSIX_SIZE_WRITE_10K_100K'].sum() + - df['counters']['POSIX_SIZE_WRITE_100K_1M'].sum() - ) + def small_operation_calculation(self): + if not self.posix_df: + return - # Get the files responsible for more than half of these accesses files = [] - df['counters']['INSIGHTS_POSIX_SMALL_READ'] = ( - df['counters']['POSIX_SIZE_READ_0_100'] + - df['counters']['POSIX_SIZE_READ_100_1K'] + - df['counters']['POSIX_SIZE_READ_1K_10K'] + - df['counters']['POSIX_SIZE_READ_10K_100K'] + - df['counters']['POSIX_SIZE_READ_100K_1M'] + self.posix_df['counters']['INSIGHTS_POSIX_SMALL_READ'] = ( + self.posix_df['counters']['POSIX_SIZE_READ_0_100'] + + self.posix_df['counters']['POSIX_SIZE_READ_100_1K'] + + self.posix_df['counters']['POSIX_SIZE_READ_1K_10K'] + + self.posix_df['counters']['POSIX_SIZE_READ_10K_100K'] + + self.posix_df['counters']['POSIX_SIZE_READ_100K_1M'] ) - df['counters']['INSIGHTS_POSIX_SMALL_WRITE'] = ( - df['counters']['POSIX_SIZE_WRITE_0_100'] + - df['counters']['POSIX_SIZE_WRITE_100_1K'] + - df['counters']['POSIX_SIZE_WRITE_1K_10K'] + - df['counters']['POSIX_SIZE_WRITE_10K_100K'] + - df['counters']['POSIX_SIZE_WRITE_100K_1M'] + self.posix_df['counters']['INSIGHTS_POSIX_SMALL_WRITE'] = ( + self.posix_df['counters']['POSIX_SIZE_WRITE_0_100'] + + self.posix_df['counters']['POSIX_SIZE_WRITE_100_1K'] + + self.posix_df['counters']['POSIX_SIZE_WRITE_1K_10K'] + + self.posix_df['counters']['POSIX_SIZE_WRITE_10K_100K'] + + self.posix_df['counters']['POSIX_SIZE_WRITE_100K_1M'] ) - detected_files = pd.DataFrame(df['counters'].groupby('id')[['INSIGHTS_POSIX_SMALL_READ', - 'INSIGHTS_POSIX_SMALL_WRITE']].sum()).reset_index() - detected_files.columns = ['id', 'total_reads', 'total_writes'] + detected_files = pd.DataFrame(self.posix_df['counters'].groupby('id')[['INSIGHTS_POSIX_SMALL_READ', + 'INSIGHTS_POSIX_SMALL_WRITE']].sum()).reset_index() + detected_files.columns = ['id', 'total_reads', + 'total_writes'] # !: Rename later. 
total_small_reads, total_small_writes detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) - check_small_operation(total_reads, total_reads_small, total_writes, total_writes_small, detected_files, modules, - file_map, dxt_posix, dxt_posix_read_data, dxt_posix_write_data) - - ######################################################################################################################################################################### - - # How many requests are misaligned? - - total_mem_not_aligned = df['counters']['POSIX_MEM_NOT_ALIGNED'].sum() - total_file_not_aligned = df['counters']['POSIX_FILE_NOT_ALIGNED'].sum() + self.report.name_records = self.report.name_records + module.check_small_operation(self.total_reads, self.total_reads_small, self.total_writes, + self.total_writes_small, + detected_files, + self.modules, self.report.name_records, self.dxt_posix, self.dxt_posix_read_data, + self.dxt_posix_write_data) - check_misaligned(total_operations, total_mem_not_aligned, total_file_not_aligned, modules, file_map, df_lustre, - dxt_posix, dxt_posix_read_data) + def posix_alignment(self): + if not self.posix_df: + return - ######################################################################################################################################################################### + self.total_mem_not_aligned = self.posix_df['counters']['POSIX_MEM_NOT_ALIGNED'].sum() + self.total_file_not_aligned = self.posix_df['counters']['POSIX_FILE_NOT_ALIGNED'].sum() - # Redundant read-traffic (based on Phill) - # POSIX_MAX_BYTE_READ (Highest offset in the file that was read) - max_read_offset = df['counters']['POSIX_MAX_BYTE_READ'].max() - max_write_offset = df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() + self.report.name_records = self.report.name_records + module.check_misaligned(self.total_operations, self.total_mem_not_aligned, self.total_file_not_aligned, + self.modules, self.report.name_records, self.lustre_df, self.dxt_posix, + self.dxt_posix_read_data) - check_traffic(max_read_offset, total_read_size, max_write_offset, total_written_size, dxt_posix, - dxt_posix_read_data, dxt_posix_write_data) + def posix_redundant_reads(self): + if not self.posix_df: + return - ######################################################################################################################################################################### + self.max_read_offset = self.posix_df['counters']['POSIX_MAX_BYTE_READ'].max() + self.max_write_offset = self.posix_df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() - # Check for a lot of random operations + module.check_traffic(self.max_read_offset, self.total_read_size, self.max_write_offset, self.total_written_size, + self.dxt_posix, self.dxt_posix_read_data, self.dxt_posix_write_data) - read_consecutive = df['counters']['POSIX_CONSEC_READS'].sum() - #print('READ Consecutive: {} ({:.2f}%)'.format(read_consecutive, read_consecutive / total_reads * 100)) + def posix_random_check(self): + if not self.posix_df: + return - read_sequential = df['counters']['POSIX_SEQ_READS'].sum() - read_sequential -= read_consecutive - #print('READ Sequential: {} ({:.2f}%)'.format(read_sequential, read_sequential / total_reads * 100)) + self.read_consecutive = self.posix_df['counters']['POSIX_CONSEC_READS'].sum() - read_random = total_reads - read_consecutive - read_sequential - #print('READ Random: {} ({:.2f}%)'.format(read_random, read_random / total_reads * 100)) + self.read_sequential = self.posix_df['counters']['POSIX_SEQ_READS'].sum() + 
self.read_sequential -= self.read_consecutive - write_consecutive = df['counters']['POSIX_CONSEC_WRITES'].sum() + self.read_random = self.total_reads - self.read_consecutive - self.read_sequential - write_sequential = df['counters']['POSIX_SEQ_WRITES'].sum() - write_sequential -= write_consecutive + self.write_consecutive = self.posix_df['counters']['POSIX_CONSEC_WRITES'].sum() - write_random = total_writes - write_consecutive - write_sequential - #print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100)) + self.write_sequential = self.posix_df['counters']['POSIX_SEQ_WRITES'].sum() + self.write_sequential -= self.write_consecutive - check_random_operation(read_consecutive, read_sequential, read_random, total_reads, write_consecutive, - write_sequential, write_random, total_writes, dxt_posix, dxt_posix_read_data, - dxt_posix_write_data) + self.write_random = self.total_writes - self.write_consecutive - self.write_sequential - ######################################################################################################################################################################### + module.check_random_operation(self.read_consecutive, self.read_sequential, self.read_random, self.total_reads, + self.write_consecutive, self.write_sequential, self.write_random, + self.total_writes, self.dxt_posix, + self.dxt_posix_read_data, self.dxt_posix_write_data) - # Shared file with small operations + def posix_shared_file(self): + if not self.posix_df: + return - shared_files = df['counters'].loc[(df['counters']['rank'] == -1)] + self.shared_files = self.posix_df['counters'].loc[(self.posix_df['counters']['rank'] == -1)] - shared_files = shared_files.assign(id=lambda d: d['id'].astype(str)) + self.shared_files = self.shared_files.assign(id=lambda d: d['id'].astype(str)) - if not shared_files.empty: - total_shared_reads = shared_files['POSIX_READS'].sum() - total_shared_reads_small = ( - shared_files['POSIX_SIZE_READ_0_100'].sum() + - shared_files['POSIX_SIZE_READ_100_1K'].sum() + - shared_files['POSIX_SIZE_READ_1K_10K'].sum() + - shared_files['POSIX_SIZE_READ_10K_100K'].sum() + - shared_files['POSIX_SIZE_READ_100K_1M'].sum() + if not self.shared_files.empty: + self.total_shared_reads = self.shared_files['POSIX_READS'].sum() + self.total_shared_reads_small = ( + self.shared_files['POSIX_SIZE_READ_0_100'].sum() + + self.shared_files['POSIX_SIZE_READ_100_1K'].sum() + + self.shared_files['POSIX_SIZE_READ_1K_10K'].sum() + + self.shared_files['POSIX_SIZE_READ_10K_100K'].sum() + + self.shared_files['POSIX_SIZE_READ_100K_1M'].sum() ) - shared_files['INSIGHTS_POSIX_SMALL_READS'] = ( - shared_files['POSIX_SIZE_READ_0_100'] + - shared_files['POSIX_SIZE_READ_100_1K'] + - shared_files['POSIX_SIZE_READ_1K_10K'] + - shared_files['POSIX_SIZE_READ_10K_100K'] + - shared_files['POSIX_SIZE_READ_100K_1M'] + self.shared_files['INSIGHTS_POSIX_SMALL_READS'] = ( + self.shared_files['POSIX_SIZE_READ_0_100'] + + self.shared_files['POSIX_SIZE_READ_100_1K'] + + self.shared_files['POSIX_SIZE_READ_1K_10K'] + + self.shared_files['POSIX_SIZE_READ_10K_100K'] + + self.shared_files['POSIX_SIZE_READ_100K_1M'] ) - total_shared_writes = shared_files['POSIX_WRITES'].sum() - total_shared_writes_small = ( - shared_files['POSIX_SIZE_WRITE_0_100'].sum() + - shared_files['POSIX_SIZE_WRITE_100_1K'].sum() + - shared_files['POSIX_SIZE_WRITE_1K_10K'].sum() + - shared_files['POSIX_SIZE_WRITE_10K_100K'].sum() + - shared_files['POSIX_SIZE_WRITE_100K_1M'].sum() + self.total_shared_writes = 
self.shared_files['POSIX_WRITES'].sum() + self.total_shared_writes_small = ( + self.shared_files['POSIX_SIZE_WRITE_0_100'].sum() + + self.shared_files['POSIX_SIZE_WRITE_100_1K'].sum() + + self.shared_files['POSIX_SIZE_WRITE_1K_10K'].sum() + + self.shared_files['POSIX_SIZE_WRITE_10K_100K'].sum() + + self.shared_files['POSIX_SIZE_WRITE_100K_1M'].sum() ) - shared_files['INSIGHTS_POSIX_SMALL_WRITES'] = ( - shared_files['POSIX_SIZE_WRITE_0_100'] + - shared_files['POSIX_SIZE_WRITE_100_1K'] + - shared_files['POSIX_SIZE_WRITE_1K_10K'] + - shared_files['POSIX_SIZE_WRITE_10K_100K'] + - shared_files['POSIX_SIZE_WRITE_100K_1M'] + self.shared_files['INSIGHTS_POSIX_SMALL_WRITES'] = ( + self.shared_files['POSIX_SIZE_WRITE_0_100'] + + self.shared_files['POSIX_SIZE_WRITE_100_1K'] + + self.shared_files['POSIX_SIZE_WRITE_1K_10K'] + + self.shared_files['POSIX_SIZE_WRITE_10K_100K'] + + self.shared_files['POSIX_SIZE_WRITE_100K_1M'] ) - check_shared_small_operation(total_shared_reads, total_shared_reads_small, total_shared_writes, - total_shared_writes_small, shared_files, file_map) + self.report.name_records = self.report.name_records + check_shared_small_operation(self.total_shared_reads, self.total_shared_reads_small, + self.total_shared_writes, + self.total_shared_writes_small, self.shared_files, self.report.name_records) - ######################################################################################################################################################################### + def posix_long_metadata(self): + if not self.posix_df: + return - count_long_metadata = len( - df['fcounters'][(df['fcounters']['POSIX_F_META_TIME'] > thresholds['metadata_time_rank'][0])]) + self.count_long_metadata = len( + self.posix_df['fcounters'][ + (self.posix_df['fcounters']['POSIX_F_META_TIME'] > config.thresholds['metadata_time_rank'][0])]) - check_long_metadata(count_long_metadata, modules) + module.check_long_metadata(self.count_long_metadata, self.modules) + def posix_stragglers(self): + if not self.posix_df: + return # We already have a single line for each shared-file access - # To check for stragglers, we can check the difference between the + # To check for stragglers, we can check the difference between the # POSIX_FASTEST_RANK_BYTES # POSIX_SLOWEST_RANK_BYTES # POSIX_F_VARIANCE_RANK_BYTES - stragglers_count = 0 + self.shared_files = self.shared_files.assign(id=lambda d: d['id'].astype(str)) - shared_files = shared_files.assign(id=lambda d: d['id'].astype(str)) + posix_straggler_files = [] - # Get the files responsible - detected_files = [] - - for index, row in shared_files.iterrows(): + for index, row in self.shared_files.iterrows(): total_transfer_size = row['POSIX_BYTES_WRITTEN'] + row['POSIX_BYTES_READ'] if total_transfer_size and abs( row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > \ thresholds['imbalance_stragglers'][0]: - stragglers_count += 1 + self.posix_shared_data_imbalance_stragglers_count += 1 - detected_files.append([ + posix_straggler_files.append([ row['id'], abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size * 100 ]) column_names = ['id', 'data_imbalance'] - detected_files = pd.DataFrame(detected_files, columns=column_names) - check_shared_data_imblance(stragglers_count, detected_files, file_map, dxt_posix, dxt_posix_read_data, - dxt_posix_write_data) + posix_straggler_files = pd.DataFrame(posix_straggler_files, columns=column_names) + + self.report.name_records = self.report.name_records + 
module.check_shared_data_imblance(self.posix_shared_data_imbalance_stragglers_count, posix_straggler_files, + self.report.name_records, self.dxt_posix, + self.dxt_posix_read_data, + self.dxt_posix_write_data) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME # POSIX_F_VARIANCE_RANK_TIME - shared_files_times = df['fcounters'].loc[(df['fcounters']['rank'] == -1)] + ################################################################################################################# + def posix_stragglers2(self): # Get the files responsible - detected_files = [] - stragglers_count = 0 - stragglers_imbalance = {} + shared_files_times = self.posix_df['fcounters'].loc[(self.posix_df['fcounters']['rank'] == -1)] + + posix_shared_time_imbalance_detected_files = [] + + posix_stragglers_shared_file_time_imbalance_count = 0 + posix_stragglers_shared_file_time_imbalance = {} shared_files_times = shared_files_times.assign(id=lambda d: d['id'].astype(str)) @@ -716,85 +576,70 @@ def handler(): if total_transfer_time and abs( row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > \ - thresholds['imbalance_stragglers'][0]: - stragglers_count += 1 + config.thresholds['imbalance_stragglers'][0]: + posix_stragglers_shared_file_time_imbalance_count += 1 - detected_files.append([ + posix_shared_time_imbalance_detected_files.append([ row['id'], abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time * 100 ]) column_names = ['id', 'time_imbalance'] - detected_files = pd.DataFrame(detected_files, columns=column_names) - check_shared_time_imbalance(stragglers_count, detected_files, file_map) + posix_shared_time_imbalance_detected_files = pd.DataFrame(posix_shared_time_imbalance_detected_files, + columns=column_names) + module.check_shared_time_imbalance(posix_stragglers_shared_file_time_imbalance_count, + posix_shared_time_imbalance_detected_files, self.report.name_records) - aggregated = df['counters'].loc[(df['counters']['rank'] != -1)][ + def posix_imbalance(self): + aggregated = self.posix_df['counters'].loc[(self.posix_df['counters']['rank'] != -1)][ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] ].groupby('id', as_index=False).agg({ 'rank': 'nunique', 'POSIX_BYTES_WRITTEN': ['sum', 'min', 'max'], 'POSIX_BYTES_READ': ['sum', 'min', 'max'] }) - aggregated.columns = list(map('_'.join, aggregated.columns.values)) - aggregated = aggregated.assign(id=lambda d: d['id_'].astype(str)) + self.aggregated = aggregated # Get the files responsible imbalance_count = 0 - detected_files = [] + posix_shared_time_imbalance_detected_files = [] - for index, row in aggregated.iterrows(): + for index, row in self.aggregated.iterrows(): if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / \ - row['POSIX_BYTES_WRITTEN_max'] > thresholds['imbalance_size'][0]: + row['POSIX_BYTES_WRITTEN_max'] > config.thresholds['imbalance_size'][0]: imbalance_count += 1 - detected_files.append([ + posix_shared_time_imbalance_detected_files.append([ row['id'], abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row[ 'POSIX_BYTES_WRITTEN_max'] * 100 ]) column_names = ['id', 'write_imbalance'] - detected_files = pd.DataFrame(detected_files, columns=column_names) - check_individual_write_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_write_data) - - imbalance_count = 0 + posix_shared_time_imbalance_detected_files = 
pd.DataFrame(posix_shared_time_imbalance_detected_files, + columns=column_names) + module.check_individual_write_imbalance(imbalance_count, posix_shared_time_imbalance_detected_files, + self.report.name_records, self.dxt_posix, self.dxt_posix_write_data) - detected_files = [] - - for index, row in aggregated.iterrows(): - if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row[ - 'POSIX_BYTES_READ_max'] > thresholds['imbalance_size'][0]: - imbalance_count += 1 - - detected_files.append([ - row['id'], - abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] * 100 - ]) - - column_names = ['id', 'read_imbalance'] - detected_files = pd.DataFrame(detected_files, columns=column_names) - check_individual_read_imbalance(imbalance_count, detected_files, file_map, dxt_posix, dxt_posix_read_data) - - ######################################################################################################################################################################### - - if 'MPI-IO' in report.records: - # Check if application uses MPI-IO and collective operations - df_mpiio = report.records['MPI-IO'].to_df() + def mpiio_processing(self): + if not self.mpiio_df: + return - df_mpiio['counters'] = df_mpiio['counters'].assign(id=lambda d: d['id'].astype(str)) + self.mpiio_df['counters'] = self.mpiio_df['counters'].assign( + id=lambda d: d['id'].astype(str)) # What does this do? # Get the files responsible detected_files = [] - df_mpiio_collective_reads = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_READS'] > 0)] + df_mpiio_collective_reads = self.mpiio_df['counters'] # .loc[(df_mpiio['counters']['MPIIO_COLL_READS'] > 0)] - total_mpiio_read_operations = df_mpiio['counters']['MPIIO_INDEP_READS'].sum() + df_mpiio['counters'][ + total_mpiio_read_operations = self.mpiio_df['counters']['MPIIO_INDEP_READS'].sum() + self.mpiio_df['counters'][ 'MPIIO_COLL_READS'].sum() - mpiio_coll_reads = df_mpiio['counters']['MPIIO_COLL_READS'].sum() - mpiio_indep_reads = df_mpiio['counters']['MPIIO_INDEP_READS'].sum() + mpiio_coll_reads = self.mpiio_df['counters']['MPIIO_COLL_READS'].sum() + mpiio_indep_reads = self.mpiio_df['counters']['MPIIO_INDEP_READS'].sum() detected_files = [] if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > \ @@ -815,15 +660,16 @@ def handler(): detected_files = pd.DataFrame(detected_files, columns=column_names) check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, - detected_files, file_map, dxt_mpiio) + detected_files, self.report.name_records, self.dxt_mpiio) - df_mpiio_collective_writes = df_mpiio['counters'] #.loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] + df_mpiio_collective_writes = self.mpiio_df['counters'] # .loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] - total_mpiio_write_operations = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() + df_mpiio['counters'][ - 'MPIIO_COLL_WRITES'].sum() + total_mpiio_write_operations = self.mpiio_df['counters']['MPIIO_INDEP_WRITES'].sum() + \ + self.mpiio_df['counters'][ + 'MPIIO_COLL_WRITES'].sum() - mpiio_coll_writes = df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() - mpiio_indep_writes = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() + mpiio_coll_writes = self.mpiio_df['counters']['MPIIO_COLL_WRITES'].sum() + mpiio_indep_writes = self.mpiio_df['counters']['MPIIO_INDEP_WRITES'].sum() detected_files = [] if mpiio_coll_writes == 0 and 
total_mpiio_write_operations and total_mpiio_write_operations > \ @@ -845,89 +691,175 @@ def handler(): detected_files = pd.DataFrame(detected_files, columns=column_names) check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, - detected_files, file_map, dxt_mpiio) + detected_files, self.report.name_records, self.dxt_mpiio) - ######################################################################################################################################################################### + def posix_imbalance2(self): + imbalance_count = 0 - # Look for usage of non-block operations + posix_shared_time_imbalance_detected_files = [] - # Look for HDF5 file extension + for index, row in self.aggregated.iterrows(): + if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row[ + 'POSIX_BYTES_READ_max'] > thresholds['imbalance_size'][0]: + imbalance_count += 1 - has_hdf5_extension = False + posix_shared_time_imbalance_detected_files.append([ + row['id'], + abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] * 100 + ]) + + column_names = ['id', 'read_imbalance'] + posix_shared_time_imbalance_detected_files = pd.DataFrame(posix_shared_time_imbalance_detected_files, + columns=column_names) + module.check_individual_read_imbalance(imbalance_count, posix_shared_time_imbalance_detected_files, + self.report.name_records, self.dxt_posix, self.dxt_posix_read_data) - for index, row in df_mpiio['counters'].iterrows(): - if file_map[int(row['id'])].endswith('.h5') or file_map[int(row['id'])].endswith('.hdf5'): - has_hdf5_extension = True + def hdf5_check(self): + if not self.mpiio_df: + return - mpiio_nb_reads = df_mpiio['counters']['MPIIO_NB_READS'].sum() - mpiio_nb_writes = df_mpiio['counters']['MPIIO_NB_WRITES'].sum() + self.report.name_records = self.report.name_records # Will this be optimised via JIT? 
Nvm CPython doesn't have JIT lol + for index, row in self.mpiio_df['counters'].iterrows(): + if self.report.name_records[int(row['id'])].endswith('.h5') or self.report.name_records[ + int(row['id'])].endswith('.hdf5'): + self.has_hdf5_extension = True + break # Early exit - check_mpi_none_block_operation(mpiio_nb_reads, mpiio_nb_writes, has_hdf5_extension, modules) + def mpiio_non_blocking(self): + if not self.mpiio_df: + return - ######################################################################################################################################################################### + self.mpiio_nb_reads = self.mpiio_df['counters']['MPIIO_NB_READS'].sum() + self.mpiio_nb_writes = self.mpiio_df['counters']['MPIIO_NB_WRITES'].sum() - # Nodes and MPI-IO aggregators - # If the application uses collective reads or collective writes, look for the number of aggregators - hints = '' + module.check_mpi_none_block_operation(self.mpiio_nb_reads, self.mpiio_nb_writes, self.has_hdf5_extension, + self.modules) - if 'h' in job['job']['metadata']: - hints = job['job']['metadata']['h'] + def CHECKnumber_of_aggregators(self): + hints = '' - if hints: - hints = hints.split(';') + if 'h' in self.report.metadata['job']['metadata']: + hints = self.report.metadata['job']['metadata']['h'] - # print('Hints: ', hints) + if hints: + hints = hints.split(';') - NUMBER_OF_COMPUTE_NODES = 0 + self.hints = hints - if 'MPI-IO' in modules: - cb_nodes = None + if 'MPI-IO' in self.modules: - for hint in hints: - if hint != 'no': - (key, value) = hint.split('=') + for hint in hints: + if hint != 'no': + (key, value) = hint.split('=') - if key == 'cb_nodes': - cb_nodes = value + if key == 'cb_nodes': + self.cb_nodes = value - # Try to get the number of compute nodes from SLURM, if not found, set as information - command = 'sacct --job {} --format=JobID,JobIDRaw,NNodes,NCPUs --parsable2 --delimiter ","'.format( - job['job']['jobid'] - ) + # Try to get the number of compute nodes from SLURM, if not found, set as information + command = f'sacct --job {self.report.metadata["job"]["jobid"]} --format=JobID,JobIDRaw,NNodes,NCPUs --parsable2 --delimiter ","' - arguments = shlex.split(command) + arguments = shlex.split(command) - try: - result = subprocess.run(arguments, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + try: + result = subprocess.run(arguments, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if result.returncode == 0: - # We have successfully fetched the information from SLURM - db = csv.DictReader(io.StringIO(result.stdout.decode('utf-8'))) + if result.returncode == 0: + # We have successfully fetched the information from SLURM + db = csv.DictReader(io.StringIO(result.stdout.decode('utf-8'))) - try: - first = next(db) + try: + first = next(db) - if 'NNodes' in first: - NUMBER_OF_COMPUTE_NODES = first['NNodes'] + if 'NNodes' in first: + self.number_of_compute_nodes = first['NNodes'] + + # Do we have one MPI-IO aggregator per node? 
+ module.check_mpi_aggregator(self.cb_nodes, self.number_of_compute_nodes) + except StopIteration: + pass + except FileNotFoundError: + pass + + def something_else(self): + if 'start_time' in self.report.metadata['job']: + self.job_start = datetime.datetime.fromtimestamp(self.report.metadata['job']['start_time'], + datetime.timezone.utc) + self.job_end = datetime.datetime.fromtimestamp(self.report.metadata['job']['end_time'], + datetime.timezone.utc) + else: + self.job_start = datetime.datetime.fromtimestamp(self.report.metadata['job']['start_time_sec'], + datetime.timezone.utc) + self.job_end = datetime.datetime.fromtimestamp(self.report.metadata['job']['end_time_sec'], + datetime.timezone.utc) + + +def log_relation_check(): + # TODO: Ensure that all logs are from a single job, generated at the same time, from the same executable and using the same library version + pass + + +def handler(): + console = init_console() + + insights_start_time = time.time() + + log_path = args.log_paths[0] # TODO: A single file rn + log = darshanll.log_open(log_path) + + modules = darshanll.log_get_modules(log) + + information = darshanll.log_get_job(log) + + trace_path = args.log_paths[0] # TODO: A single file rn + + darshan.enable_experimental() + library_version = darshanll.get_lib_version() + + # TODO: Can this be put in a with block? + log = darshanll.log_open(trace_path) + information = darshanll.log_get_job(log) + darshanll.log_close(log) + + report = darshan.DarshanReport(trace_path) + current_trace = DarshanTrace(trace_path, information, report) + # + + # Leave this as is for now + # # Make sure log format is of the same version + # filename = args.log_path + # # check_log_version(console, args.log_path, log_version, library_version) + # - # Do we have one MPI-IO aggregator per node? 
- check_mpi_aggregator(cb_nodes, NUMBER_OF_COMPUTE_NODES) - except StopIteration: - pass - except FileNotFoundError: - pass - ######################################################################################################################################################################### + current_trace.generate_dxt_posix_rw_df() + current_trace.calculate_insights() + current_trace.files_stuff() + current_trace.check_stdio() + current_trace.check_mpiio() + current_trace.something() + current_trace.small_operation_calculation() + current_trace.posix_alignment() + current_trace.posix_redundant_reads() + current_trace.posix_random_check() + current_trace.posix_shared_file() + current_trace.posix_long_metadata() + current_trace.posix_stragglers() + current_trace.posix_stragglers2() + current_trace.posix_imbalance() + current_trace.hdf5_check() + current_trace.mpiio_non_blocking() + current_trace.CHECKnumber_of_aggregators() + current_trace.something_else() insights_end_time = time.time() # Version 3.4.1 of py-darshan changed the contents on what is reported in 'job' - if 'start_time' in job['job']: - job_start = datetime.datetime.fromtimestamp(job['job']['start_time'], datetime.timezone.utc) - job_end = datetime.datetime.fromtimestamp(job['job']['end_time'], datetime.timezone.utc) + if 'start_time' in report.metadata['job']: + job_start = datetime.datetime.fromtimestamp(report.metadata['job']['start_time'], datetime.timezone.utc) + job_end = datetime.datetime.fromtimestamp(report.metadata['job']['end_time'], datetime.timezone.utc) else: - job_start = datetime.datetime.fromtimestamp(job['job']['start_time_sec'], datetime.timezone.utc) - job_end = datetime.datetime.fromtimestamp(job['job']['end_time_sec'], datetime.timezone.utc) + job_start = datetime.datetime.fromtimestamp(report.metadata['job']['start_time_sec'], datetime.timezone.utc) + job_end = datetime.datetime.fromtimestamp(report.metadata['job']['end_time_sec'], datetime.timezone.utc) console.print() @@ -935,13 +867,13 @@ def handler(): Panel( '\n'.join([ ' [b]JOB[/b]: [white]{}[/white]'.format( - job['job']['jobid'] + report.metadata['job']['jobid'] ), ' [b]EXECUTABLE[/b]: [white]{}[/white]'.format( - job['exe'].split()[0] + report.metadata['exe'].split()[0] ), ' [b]DARSHAN[/b]: [white]{}[/white]'.format( - os.path.basename(args.log_path) + os.path.basename(log_path) ), ' [b]EXECUTION TIME[/b]: [white]{} to {} ({:.2f} hours)[/white]'.format( job_start, @@ -949,20 +881,20 @@ def handler(): (job_end - job_start).total_seconds() / 3600 ), ' [b]FILES[/b]: [white]{} files ({} use STDIO, {} use POSIX, {} use MPI-IO)[/white]'.format( - total_files, - total_files_stdio, - total_files_posix - total_files_mpiio, + current_trace.total_files_posix, + current_trace.total_files_stdio, + current_trace.total_files_posix - current_trace.total_files_mpiio, # Since MPI-IO files will always use POSIX, we can decrement to get a unique count - total_files_mpiio + current_trace.total_files_mpiio ), ' [b]COMPUTE NODES[/b] [white]{}[/white]'.format( - NUMBER_OF_COMPUTE_NODES + current_trace.number_of_compute_nodes ), ' [b]PROCESSES[/b] [white]{}[/white]'.format( - job['job']['nprocs'] + report.metadata['job']['nprocs'] ), ' [b]HINTS[/b]: [white]{}[/white]'.format( - ' '.join(hints) + ' '.join(current_trace.hints) ) ]), title='[b][slate_blue3]DRISHTI[/slate_blue3] v.0.5[/b]', @@ -984,9 +916,9 @@ def handler(): display_footer(console, insights_start_time, insights_end_time) # Export to HTML, SVG, and CSV - trace_name = 
os.path.basename(args.log_path).replace('.darshan', '') + trace_name = os.path.basename(log_path).replace('.darshan', '') out_dir = args.export_dir if args.export_dir != "" else os.getcwd() export_html(console, out_dir, trace_name) export_svg(console, out_dir, trace_name) - export_csv(out_dir, trace_name, job['job']['jobid']) + export_csv(out_dir, trace_name, report.metadata['job']['jobid']) From bf5a2ea1f63e176887853707706c6207fb0ca330 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Fri, 2 Aug 2024 18:20:01 +0530 Subject: [PATCH 18/19] chore: Add ruff.xml --- .idea/ruff.xml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .idea/ruff.xml diff --git a/.idea/ruff.xml b/.idea/ruff.xml new file mode 100644 index 0000000..916a850 --- /dev/null +++ b/.idea/ruff.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file From bb4d911a2333d5ae4d35cffb1e00bf741c43f0a1 Mon Sep 17 00:00:00 2001 From: Joel Tony Date: Sat, 31 Aug 2024 00:28:21 +0530 Subject: [PATCH 19/19] feat (part): Process an aggregated darshan trace --- drishti/handlers/handle_darshan.py | 623 +++++++++++++++++++---------- 1 file changed, 418 insertions(+), 205 deletions(-) diff --git a/drishti/handlers/handle_darshan.py b/drishti/handlers/handle_darshan.py index 72e8558..cf947ce 100644 --- a/drishti/handlers/handle_darshan.py +++ b/drishti/handlers/handle_darshan.py @@ -2,13 +2,16 @@ import dataclasses import io import shlex +import csv import shutil import subprocess import os import sys import typing -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional +import abc +from typing import List import darshan import pandas as pd @@ -16,11 +19,15 @@ import darshan.backend.cffi_backend as darshanll from packaging import version from rich import print +from rich.padding import Padding import includes.config as config import includes.module as module -from includes.module import * +from includes.module import Panel, insights_total +from includes.module import HIGH, WARN, RECOMMENDATIONS +# from includes.module import * from includes.parser import args +import time def is_available(name): @@ -83,56 +90,27 @@ class TimestampPair: start: datetime.date end: datetime.date - @dataclass -class DarshanTrace: +class AbstractDarshanTrace(abc.ABC): # Trace metadata - path: str jobid: str log_ver: str time: TimestampPair exe: str # Report - report: darshan.DarshanReport modules: typing.Iterable[str] + name_records: dict[str, str] = field(default_factory=dict) + max_read_offset: int = float('-inf') + max_write_offset: int = float('-inf') ### - total_write_size_stdio: int - total_write_size_stdio: int - total_size_stdio: int - - total_write_size_posix: int - total_read_size_posix: int - total_size_posix: int - - total_write_size_mpiio: int - total_read_size_mpiio: int - total_size_mpiio: int - - total_size: int - total_files: int - ### - max_read_offset: int - max_write_offset: int - ### - - stdio_df: pd.DataFrame = None - posix_df: pd.DataFrame = None - mpiio_df: pd.DataFrame = None - lustre_df: pd.DataFrame = None - - dxt_posix: pd.DataFrame = None - dxt_mpiio: pd.DataFrame = None - - dxt_posix_read_data: pd.DataFrame = None - dxt_posix_write_data: pd.DataFrame = None total_files_stdio: int = 0 total_files_posix: int = 0 total_files_mpiio: int = 0 - files: dict[str, dict[str, int]] = dataclasses.field(default_factory=dict) + files: dict[str, dict[str, int]] = None total_reads: int = 0 total_writes: int = 0 @@ -143,6 +121,23 @@ class DarshanTrace: total_reads_small: int = 0 
total_writes_small: int = 0 + ### + total_write_size_stdio: int = 0 + total_write_size_stdio: int = 0 + total_size_stdio: int = 0 + + total_write_size_posix: int = 0 + total_read_size_posix: int = 0 + total_size_posix: int = 0 + + total_write_size_mpiio: int = 0 + total_read_size_mpiio: int = 0 + total_size_mpiio: int = 0 + + total_size: int = 0 + total_files: int = 0 + ### + total_mem_not_aligned: int = 0 total_file_not_aligned: int = 0 @@ -153,7 +148,6 @@ class DarshanTrace: write_sequential: int = 0 write_random: int = 0 - shared_files: pd.DataFrame = None total_shared_reads: int = 0 total_shared_reads_small: int = 0 total_shared_writes: int = 0 @@ -165,11 +159,12 @@ class DarshanTrace: # 2 functions (unsure ones) - has_hdf5_extension: bool = False + has_hdf5_extension: bool = False # TODO: OR this mpiio_nb_reads: int = 0 mpiio_nb_writes: int = 0 + # TODO: Should be a list of CB nodes for agg cb_nodes: Optional[int] = None number_of_compute_nodes: int = 0 hints: list[str] = dataclasses.field(default_factory=list) @@ -179,34 +174,22 @@ class DarshanTrace: aggregated: pd.DataFrame = None - def __init__(self, trace_path: str, job_information, report: darshan.DarshanReport): - self.path = trace_path - - self.jobid = job_information['jobid'] - self.log_ver = job_information['log_ver'] if 'log_ver' in job_information else job_information['metadata'][ - 'lib_ver'] - self.exe = report.metadata['exe'] - - _start_time = datetime.datetime.fromtimestamp(job_information['start_time_sec'], tz=datetime.timezone.utc) - _end_time = datetime.datetime.fromtimestamp(job_information['end_time_sec'], tz=datetime.timezone.utc) - self.time = TimestampPair(_start_time, _end_time) - - self.modules = report.modules.keys() - - # TODO: Should I search in self.modules or in report.records? - # ! All dfs are being materialised - self.report = report - self.posix_df = report.records['POSIX'].to_df() if 'POSIX' in self.modules else None - self.stdio_df = report.records['STDIO'].to_df() if 'STDIO' in self.modules else None - self.mpiio_df = report.records['MPI-IO'].to_df() if 'MPI-IO' in self.modules else None - - self.lustre_df = report.records['LUSTRE'].to_df() if 'LUSTRE' in self.modules else None - - self.dxt_posix = report.records['DXT_POSIX'].to_df() if 'DXT_POSIX' in self.modules else None - self.dxt_mpiio = report.records['DXT_MPIIO'].to_df() if 'DXT_MPIIO' in self.modules else None - - self.hints = [] - self.files = {} + ## EXTRA from module being split + mpiio_coll_reads: int = 0 + mpiio_indep_reads: int = 0 + total_mpiio_read_operations: int = 0 + detected_files_mpi_coll_reads: pd.DataFrame = None + mpiio_coll_writes: int = 0 + mpiio_indep_writes: int = 0 + total_mpiio_write_operations: int = 0 + detected_files_mpiio_coll_writes: pd.DataFrame = None + imbalance_count_posix_shared_time: int = 0 + posix_shared_time_imbalance_detected_files1: pd.DataFrame = None + posix_shared_time_imbalance_detected_files2: pd.DataFrame = None + posix_shared_time_imbalance_detected_files3: pd.DataFrame = None + posix_total_size: int = 0 + posix_total_read_size: int = 0 + posix_total_written_size: int = 0 def generate_dxt_posix_rw_df(self) -> None: if not args.backtrace: @@ -342,12 +325,87 @@ def files_stuff(self) -> None: 'mpiio': uses_mpiio } - def check_stdio(self) -> None: + + def generate_insights(self): + # TODO: Check if module exists. Replicate from each function which calculates insights. 
+ self._check_stdio() + self._check_mpiio() + self._do_something() + self._small_operation_insight() + + + + def _check_stdio(self) -> None: module.check_stdio(self.total_posix_size, self.total_size_stdio) - def check_mpiio(self) -> None: + def _check_mpiio(self) -> None: module.check_mpiio(self.modules) + def _do_something(self): + module.check_operation_intensive(self.total_operations, self.total_reads, self.total_writes) + module.check_size_intensive(self.posix_total_size, self.posix_total_read_size, self.posix_total_written_size) + + # TODO: for trace in traces + for trace in self.traces: + pass + module.check_misaligned(self.total_operations, self.total_mem_not_aligned, self.total_file_not_aligned, + self.modules, self.name_records, self.lustre_df, self.dxt_posix, + self.dxt_posix_read_data) # posix alignment + + module.check_traffic(self.max_read_offset, self.total_read_size, self.max_write_offset, self.total_written_size, + self.dxt_posix, self.dxt_posix_read_data, self.dxt_posix_write_data) # redundant reads + + module.check_random_operation(self.read_consecutive, self.read_sequential, self.read_random, self.total_reads, + self.write_consecutive, self.write_sequential, self.write_random, + self.total_writes, self.dxt_posix, + self.dxt_posix_read_data, self.dxt_posix_write_data) # random check + + module.check_shared_small_operation(self.total_shared_reads, self.total_shared_reads_small, + self.total_shared_writes, + self.total_shared_writes_small, self.shared_files, self.report.name_records) + + module.check_long_metadata(self.count_long_metadata, self.modules) + + module.check_shared_data_imblance(self.posix_shared_data_imbalance_stragglers_count, + self.posix_data_straggler_files, + self.report.name_records, self.dxt_posix, + self.dxt_posix_read_data, + self.dxt_posix_write_data) + + module.check_shared_time_imbalance(self.posix_stragglers_shared_file_time_imbalance_count, + self.posix_shared_time_imbalance_detected_files1, self.report.name_records) + + module.check_individual_write_imbalance(self.posix_data_imbalance_count, + self.posix_shared_time_imbalance_detected_files2, + self.report.name_records, self.dxt_posix, self.dxt_posix_write_data) + + module.check_mpi_collective_read_operation(self.mpiio_coll_reads, self.mpiio_indep_reads, + self.total_mpiio_read_operations, + self.detected_files_mpi_coll_reads, self.report.name_records, + self.dxt_mpiio) + + module.check_mpi_collective_write_operation(self.mpiio_coll_writes, self.mpiio_indep_writes, + self.total_mpiio_write_operations, + self.detected_files_mpiio_coll_writes, self.report.name_records, self.dxt_mpiio) + + module.check_individual_read_imbalance(self.imbalance_count_posix_shared_time, + self.posix_shared_time_imbalance_detected_files3, + self.report.name_records, self.dxt_posix, self.dxt_posix_read_data) + + module.check_mpi_none_block_operation(self.mpiio_nb_reads, self.mpiio_nb_writes, self.has_hdf5_extension, + self.modules) + + + + def _small_operation_insight(self): + module.check_small_operation(self.total_reads, self.total_reads_small, self.total_writes, + self.total_writes_small, + self.small_operation_detected_files, + self.modules, self.report.name_records, self.dxt_posix, self.dxt_posix_read_data, + self.dxt_posix_write_data) + + + def something(self) -> None: if not self.posix_df: return @@ -356,13 +414,13 @@ def something(self) -> None: self.total_writes = self.posix_df['counters']['POSIX_WRITES'].sum() self.total_operations = self.total_writes + self.total_reads # 
---------------------------------------------------------------------------------------------------------------------- - module.check_operation_intensive(self.total_operations, self.total_reads, self.total_writes) + # module.check_operation_intensive(self.total_operations, self.total_reads, self.total_writes) - total_read_size = self.posix_df['counters']['POSIX_BYTES_READ'].sum() - total_written_size = self.posix_df['counters']['POSIX_BYTES_WRITTEN'].sum() - total_size = total_written_size + total_read_size + self.posix_total_read_size = self.posix_df['counters']['POSIX_BYTES_READ'].sum() + self.posix_total_written_size = self.posix_df['counters']['POSIX_BYTES_WRITTEN'].sum() + self.posix_total_size = self.posix_total_written_size + self.posix_total_read_size - module.check_size_intensive(total_size, total_read_size, total_written_size) + # module.check_size_intensive(self.posix_total_size, self.posix_total_read_size, self.posix_total_written_size) # ----- self.total_reads_small = ( self.posix_df['counters']['POSIX_SIZE_READ_0_100'].sum() + @@ -401,18 +459,18 @@ def small_operation_calculation(self): self.posix_df['counters']['POSIX_SIZE_WRITE_100K_1M'] ) - detected_files = pd.DataFrame(self.posix_df['counters'].groupby('id')[['INSIGHTS_POSIX_SMALL_READ', + self.small_operation_detected_files = pd.DataFrame(self.posix_df['counters'].groupby('id')[['INSIGHTS_POSIX_SMALL_READ', 'INSIGHTS_POSIX_SMALL_WRITE']].sum()).reset_index() - detected_files.columns = ['id', 'total_reads', + self.small_operation_detected_files.columns = ['id', 'total_reads', 'total_writes'] # !: Rename later. total_small_reads, total_small_writes - detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str) + self.small_operation_detected_files.loc[:, 'id'] = self.small_operation_detected_files.loc[:, 'id'].astype(str) self.report.name_records = self.report.name_records - module.check_small_operation(self.total_reads, self.total_reads_small, self.total_writes, - self.total_writes_small, - detected_files, - self.modules, self.report.name_records, self.dxt_posix, self.dxt_posix_read_data, - self.dxt_posix_write_data) + # module.check_small_operation(self.total_reads, self.total_reads_small, self.total_writes, + # self.total_writes_small, + # self.small_operation_detected_files, + # self.modules, self.report.name_records, self.dxt_posix, self.dxt_posix_read_data, + # self.dxt_posix_write_data) def posix_alignment(self): if not self.posix_df: @@ -422,9 +480,9 @@ def posix_alignment(self): self.total_file_not_aligned = self.posix_df['counters']['POSIX_FILE_NOT_ALIGNED'].sum() self.report.name_records = self.report.name_records - module.check_misaligned(self.total_operations, self.total_mem_not_aligned, self.total_file_not_aligned, - self.modules, self.report.name_records, self.lustre_df, self.dxt_posix, - self.dxt_posix_read_data) + # module.check_misaligned(self.total_operations, self.total_mem_not_aligned, self.total_file_not_aligned, + # self.modules, self.report.name_records, self.lustre_df, self.dxt_posix, + # self.dxt_posix_read_data) def posix_redundant_reads(self): if not self.posix_df: @@ -433,8 +491,8 @@ def posix_redundant_reads(self): self.max_read_offset = self.posix_df['counters']['POSIX_MAX_BYTE_READ'].max() self.max_write_offset = self.posix_df['counters']['POSIX_MAX_BYTE_WRITTEN'].max() - module.check_traffic(self.max_read_offset, self.total_read_size, self.max_write_offset, self.total_written_size, - self.dxt_posix, self.dxt_posix_read_data, self.dxt_posix_write_data) + # 
module.check_traffic(self.max_read_offset, self.total_read_size, self.max_write_offset, self.total_written_size, + # self.dxt_posix, self.dxt_posix_read_data, self.dxt_posix_write_data) def posix_random_check(self): if not self.posix_df: @@ -454,10 +512,10 @@ def posix_random_check(self): self.write_random = self.total_writes - self.write_consecutive - self.write_sequential - module.check_random_operation(self.read_consecutive, self.read_sequential, self.read_random, self.total_reads, - self.write_consecutive, self.write_sequential, self.write_random, - self.total_writes, self.dxt_posix, - self.dxt_posix_read_data, self.dxt_posix_write_data) + # module.check_random_operation(self.read_consecutive, self.read_sequential, self.read_random, self.total_reads, + # self.write_consecutive, self.write_sequential, self.write_random, + # self.total_writes, self.dxt_posix, + # self.dxt_posix_read_data, self.dxt_posix_write_data) def posix_shared_file(self): if not self.posix_df: @@ -503,9 +561,9 @@ def posix_shared_file(self): ) self.report.name_records = self.report.name_records - check_shared_small_operation(self.total_shared_reads, self.total_shared_reads_small, - self.total_shared_writes, - self.total_shared_writes_small, self.shared_files, self.report.name_records) + # module.check_shared_small_operation(self.total_shared_reads, self.total_shared_reads_small, + # self.total_shared_writes, + # self.total_shared_writes_small, self.shared_files, self.report.name_records) def posix_long_metadata(self): if not self.posix_df: @@ -515,7 +573,7 @@ def posix_long_metadata(self): self.posix_df['fcounters'][ (self.posix_df['fcounters']['POSIX_F_META_TIME'] > config.thresholds['metadata_time_rank'][0])]) - module.check_long_metadata(self.count_long_metadata, self.modules) + # module.check_long_metadata(self.count_long_metadata, self.modules) def posix_stragglers(self): if not self.posix_df: @@ -529,29 +587,29 @@ def posix_stragglers(self): self.shared_files = self.shared_files.assign(id=lambda d: d['id'].astype(str)) - posix_straggler_files = [] + self.posix_data_straggler_files = [] for index, row in self.shared_files.iterrows(): total_transfer_size = row['POSIX_BYTES_WRITTEN'] + row['POSIX_BYTES_READ'] if total_transfer_size and abs( row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size > \ - thresholds['imbalance_stragglers'][0]: + module.thresholds['imbalance_stragglers'][0]: self.posix_shared_data_imbalance_stragglers_count += 1 - posix_straggler_files.append([ + self.posix_data_straggler_files.append([ row['id'], abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size * 100 ]) column_names = ['id', 'data_imbalance'] - posix_straggler_files = pd.DataFrame(posix_straggler_files, columns=column_names) + self.posix_data_straggler_files = pd.DataFrame(self.posix_data_straggler_files, columns=column_names) self.report.name_records = self.report.name_records - module.check_shared_data_imblance(self.posix_shared_data_imbalance_stragglers_count, posix_straggler_files, - self.report.name_records, self.dxt_posix, - self.dxt_posix_read_data, - self.dxt_posix_write_data) + # module.check_shared_data_imblance(self.posix_shared_data_imbalance_stragglers_count, self.posix_data_straggler_files, + # self.report.name_records, self.dxt_posix, + # self.dxt_posix_read_data, + # self.dxt_posix_write_data) # POSIX_F_FASTEST_RANK_TIME # POSIX_F_SLOWEST_RANK_TIME @@ -560,14 +618,16 @@ def posix_stragglers(self): 
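The shared-file straggler detection above reduces to one test: flag a file when the gap between the slowest and the fastest rank, relative to the file's total traffic, exceeds the configured threshold. Below is a minimal, self-contained sketch of that test. The column names mirror the Darshan POSIX counters used in the patch; the threshold constant and the sample numbers are illustrative stand-ins for config.thresholds['imbalance_stragglers'][0] and a real report.

    import pandas as pd

    IMBALANCE_THRESHOLD = 0.15  # stand-in for config.thresholds['imbalance_stragglers'][0]

    # Toy stand-in for the shared-file (rank == -1) rows of the POSIX counters DataFrame
    shared_files = pd.DataFrame({
        'id': ['101', '102'],
        'POSIX_BYTES_READ': [1_000_000, 2_000_000],
        'POSIX_BYTES_WRITTEN': [1_000_000, 0],
        'POSIX_FASTEST_RANK_BYTES': [600_000, 1_000_000],
        'POSIX_SLOWEST_RANK_BYTES': [1_400_000, 1_000_000],
    })

    detected = []
    for _, row in shared_files.iterrows():
        total_transfer_size = row['POSIX_BYTES_READ'] + row['POSIX_BYTES_WRITTEN']
        if not total_transfer_size:
            continue
        imbalance = abs(row['POSIX_SLOWEST_RANK_BYTES'] - row['POSIX_FASTEST_RANK_BYTES']) / total_transfer_size
        if imbalance > IMBALANCE_THRESHOLD:
            detected.append([row['id'], imbalance * 100])

    detected = pd.DataFrame(detected, columns=['id', 'data_imbalance'])
    print(detected)  # file 101 is flagged (40% imbalance); file 102 is perfectly balanced
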
################################################################################################################# def posix_stragglers2(self): - # Get the files responsible + if not self.posix_df: + return + # Get the files responsible shared_files_times = self.posix_df['fcounters'].loc[(self.posix_df['fcounters']['rank'] == -1)] - posix_shared_time_imbalance_detected_files = [] + self.posix_shared_time_imbalance_detected_files1 = [] - posix_stragglers_shared_file_time_imbalance_count = 0 - posix_stragglers_shared_file_time_imbalance = {} + self.posix_stragglers_shared_file_time_imbalance_count = 0 + # posix_stragglers_shared_file_time_imbalance = {} # UNUSED? shared_files_times = shared_files_times.assign(id=lambda d: d['id'].astype(str)) @@ -577,20 +637,23 @@ def posix_stragglers2(self): if total_transfer_time and abs( row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time > \ config.thresholds['imbalance_stragglers'][0]: - posix_stragglers_shared_file_time_imbalance_count += 1 + self.posix_stragglers_shared_file_time_imbalance_count += 1 - posix_shared_time_imbalance_detected_files.append([ + self.posix_shared_time_imbalance_detected_files1.append([ row['id'], abs(row['POSIX_F_SLOWEST_RANK_TIME'] - row['POSIX_F_FASTEST_RANK_TIME']) / total_transfer_time * 100 ]) column_names = ['id', 'time_imbalance'] - posix_shared_time_imbalance_detected_files = pd.DataFrame(posix_shared_time_imbalance_detected_files, + self.posix_shared_time_imbalance_detected_files1 = pd.DataFrame(self.posix_shared_time_imbalance_detected_files1, columns=column_names) - module.check_shared_time_imbalance(posix_stragglers_shared_file_time_imbalance_count, - posix_shared_time_imbalance_detected_files, self.report.name_records) + # module.check_shared_time_imbalance(self.posix_stragglers_shared_file_time_imbalance_count, + # self.posix_shared_time_imbalance_detected_files1, self.report.name_records) def posix_imbalance(self): + if not self.posix_df: + return + aggregated = self.posix_df['counters'].loc[(self.posix_df['counters']['rank'] != -1)][ ['rank', 'id', 'POSIX_BYTES_WRITTEN', 'POSIX_BYTES_READ'] ].groupby('id', as_index=False).agg({ @@ -603,25 +666,25 @@ def posix_imbalance(self): self.aggregated = aggregated # Get the files responsible - imbalance_count = 0 + self.posix_data_imbalance_count = 0 - posix_shared_time_imbalance_detected_files = [] + self.posix_shared_time_imbalance_detected_files2 = [] for index, row in self.aggregated.iterrows(): if row['POSIX_BYTES_WRITTEN_max'] and abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / \ row['POSIX_BYTES_WRITTEN_max'] > config.thresholds['imbalance_size'][0]: - imbalance_count += 1 + self.posix_data_imbalance_count += 1 - posix_shared_time_imbalance_detected_files.append([ + self.posix_shared_time_imbalance_detected_files2.append([ row['id'], abs(row['POSIX_BYTES_WRITTEN_max'] - row['POSIX_BYTES_WRITTEN_min']) / row[ 'POSIX_BYTES_WRITTEN_max'] * 100 ]) column_names = ['id', 'write_imbalance'] - posix_shared_time_imbalance_detected_files = pd.DataFrame(posix_shared_time_imbalance_detected_files, + self.posix_shared_time_imbalance_detected_files2 = pd.DataFrame(self.posix_shared_time_imbalance_detected_files2, columns=column_names) - module.check_individual_write_imbalance(imbalance_count, posix_shared_time_imbalance_detected_files, - self.report.name_records, self.dxt_posix, self.dxt_posix_write_data) + # module.check_individual_write_imbalance(self.posix_data_imbalance_count, 
self.posix_shared_time_imbalance_detected_files2, + # self.report.name_records, self.dxt_posix, self.dxt_posix_write_data) def mpiio_processing(self): if not self.mpiio_df: @@ -630,94 +693,99 @@ def mpiio_processing(self): self.mpiio_df['counters'] = self.mpiio_df['counters'].assign( id=lambda d: d['id'].astype(str)) # What does this do? - # Get the files responsible - detected_files = [] df_mpiio_collective_reads = self.mpiio_df['counters'] # .loc[(df_mpiio['counters']['MPIIO_COLL_READS'] > 0)] - total_mpiio_read_operations = self.mpiio_df['counters']['MPIIO_INDEP_READS'].sum() + self.mpiio_df['counters'][ + self.total_mpiio_read_operations = self.mpiio_df['counters']['MPIIO_INDEP_READS'].sum() + self.mpiio_df['counters'][ 'MPIIO_COLL_READS'].sum() - mpiio_coll_reads = self.mpiio_df['counters']['MPIIO_COLL_READS'].sum() - mpiio_indep_reads = self.mpiio_df['counters']['MPIIO_INDEP_READS'].sum() + self.mpiio_coll_reads = self.mpiio_df['counters']['MPIIO_COLL_READS'].sum() + self.mpiio_indep_reads = self.mpiio_df['counters']['MPIIO_INDEP_READS'].sum() - detected_files = [] - if mpiio_coll_reads == 0 and total_mpiio_read_operations and total_mpiio_read_operations > \ - thresholds['collective_operations_absolute'][0]: + self.detected_files_mpi_coll_reads = [] + if self.mpiio_coll_reads == 0 and self.total_mpiio_read_operations and self.total_mpiio_read_operations > \ + module.thresholds['collective_operations_absolute'][0]: files = pd.DataFrame(df_mpiio_collective_reads.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_reads.iterrows(): if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > - thresholds['collective_operations'][0] and + module.thresholds['collective_operations'][0] and (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > - thresholds['collective_operations_absolute'][0]): - detected_files.append([ + module.thresholds['collective_operations_absolute'][0]): + self.detected_files_mpi_coll_reads.append([ row['id'], row['MPIIO_INDEP_READS'], row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 ]) column_names = ['id', 'absolute_indep_reads', 'percent_indep_reads'] - detected_files = pd.DataFrame(detected_files, columns=column_names) + self.detected_files_mpi_coll_reads = pd.DataFrame(self.detected_files_mpi_coll_reads, columns=column_names) + + # module.check_mpi_collective_read_operation(self.mpiio_coll_reads, self.mpiio_indep_reads, self.total_mpiio_read_operations, + # self.detected_files_mpi_coll_reads, self.report.name_records, self.dxt_mpiio) + + # TODO: Split this into 2 functions for each module insight - check_mpi_collective_read_operation(mpiio_coll_reads, mpiio_indep_reads, total_mpiio_read_operations, - detected_files, self.report.name_records, self.dxt_mpiio) df_mpiio_collective_writes = self.mpiio_df['counters'] # .loc[(df_mpiio['counters']['MPIIO_COLL_WRITES'] > 0)] - total_mpiio_write_operations = self.mpiio_df['counters']['MPIIO_INDEP_WRITES'].sum() + \ + self.total_mpiio_write_operations = self.mpiio_df['counters']['MPIIO_INDEP_WRITES'].sum() + \ self.mpiio_df['counters'][ 'MPIIO_COLL_WRITES'].sum() - mpiio_coll_writes = self.mpiio_df['counters']['MPIIO_COLL_WRITES'].sum() - mpiio_indep_writes = self.mpiio_df['counters']['MPIIO_INDEP_WRITES'].sum() + self.mpiio_coll_writes = self.mpiio_df['counters']['MPIIO_COLL_WRITES'].sum() + self.mpiio_indep_writes = self.mpiio_df['counters']['MPIIO_INDEP_WRITES'].sum() - 
detected_files = [] - if mpiio_coll_writes == 0 and total_mpiio_write_operations and total_mpiio_write_operations > \ - thresholds['collective_operations_absolute'][0]: + self.detected_files_mpiio_coll_writes = [] + if self.mpiio_coll_writes == 0 and self.total_mpiio_write_operations and self.total_mpiio_write_operations > \ + module.thresholds['collective_operations_absolute'][0]: files = pd.DataFrame(df_mpiio_collective_writes.groupby('id').sum()).reset_index() for index, row in df_mpiio_collective_writes.iterrows(): if ((row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > - thresholds['collective_operations'][0] and + module.thresholds['collective_operations'][0] and (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > - thresholds['collective_operations_absolute'][0]): - detected_files.append([ + module.thresholds['collective_operations_absolute'][0]): + self.detected_files_mpiio_coll_writes.append([ row['id'], row['MPIIO_INDEP_WRITES'], row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) * 100 ]) column_names = ['id', 'absolute_indep_writes', 'percent_indep_writes'] - detected_files = pd.DataFrame(detected_files, columns=column_names) + self.detected_files_mpiio_coll_writes = pd.DataFrame(self.detected_files_mpiio_coll_writes, columns=column_names) - check_mpi_collective_write_operation(mpiio_coll_writes, mpiio_indep_writes, total_mpiio_write_operations, - detected_files, self.report.name_records, self.dxt_mpiio) + # module.check_mpi_collective_write_operation(self.mpiio_coll_writes, self.mpiio_indep_writes, self.total_mpiio_write_operations, + # detected_files_mpiio_coll_writes, self.report.name_records, self.dxt_mpiio) def posix_imbalance2(self): - imbalance_count = 0 + if not self.posix_df: + return - posix_shared_time_imbalance_detected_files = [] + self.imbalance_count_posix_shared_time = 0 + + self.posix_shared_time_imbalance_detected_files3 = [] for index, row in self.aggregated.iterrows(): if row['POSIX_BYTES_READ_max'] and abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row[ - 'POSIX_BYTES_READ_max'] > thresholds['imbalance_size'][0]: - imbalance_count += 1 + 'POSIX_BYTES_READ_max'] > module.thresholds['imbalance_size'][0]: + self.imbalance_count_posix_shared_time += 1 - posix_shared_time_imbalance_detected_files.append([ + self.posix_shared_time_imbalance_detected_files3.append([ row['id'], abs(row['POSIX_BYTES_READ_max'] - row['POSIX_BYTES_READ_min']) / row['POSIX_BYTES_READ_max'] * 100 ]) column_names = ['id', 'read_imbalance'] - posix_shared_time_imbalance_detected_files = pd.DataFrame(posix_shared_time_imbalance_detected_files, + self.posix_shared_time_imbalance_detected_files3 = pd.DataFrame(self.posix_shared_time_imbalance_detected_files3, columns=column_names) - module.check_individual_read_imbalance(imbalance_count, posix_shared_time_imbalance_detected_files, - self.report.name_records, self.dxt_posix, self.dxt_posix_read_data) + # module.check_individual_read_imbalance(self.imbalance_count_posix_shared_time, self.posix_shared_time_imbalance_detected_files3, + # self.report.name_records, self.dxt_posix, self.dxt_posix_read_data) def hdf5_check(self): if not self.mpiio_df: return + self.report.name_records = self.report.name_records # Will this be optimised via JIT? 
Nvm CPython doesn't have JIT lol for index, row in self.mpiio_df['counters'].iterrows(): if self.report.name_records[int(row['id'])].endswith('.h5') or self.report.name_records[ @@ -732,8 +800,8 @@ def mpiio_non_blocking(self): self.mpiio_nb_reads = self.mpiio_df['counters']['MPIIO_NB_READS'].sum() self.mpiio_nb_writes = self.mpiio_df['counters']['MPIIO_NB_WRITES'].sum() - module.check_mpi_none_block_operation(self.mpiio_nb_reads, self.mpiio_nb_writes, self.has_hdf5_extension, - self.modules) + # module.check_mpi_none_block_operation(self.mpiio_nb_reads, self.mpiio_nb_writes, self.has_hdf5_extension, + # self.modules) def CHECKnumber_of_aggregators(self): hints = '' @@ -793,76 +861,231 @@ def something_else(self): datetime.timezone.utc) + +@dataclass +class DarshanTrace(AbstractDarshanTrace): + path: Optional[str] = None + report: Optional[darshan.DarshanReport] = None + + stdio_df: pd.DataFrame = None + posix_df: pd.DataFrame = None + mpiio_df: pd.DataFrame = None + lustre_df: pd.DataFrame = None + + dxt_posix: pd.DataFrame = None + dxt_mpiio: pd.DataFrame = None + + dxt_posix_read_data: pd.DataFrame = None + dxt_posix_write_data: pd.DataFrame = None + + shared_files: pd.DataFrame = None + + def __init__(self, trace_path: str, job_information, report: darshan.DarshanReport): + self.path = trace_path + + self.jobid = job_information['jobid'] + self.log_ver = job_information['log_ver'] if 'log_ver' in job_information else job_information['metadata'][ + 'lib_ver'] + self.exe = report.metadata['exe'] + + _start_time = datetime.datetime.fromtimestamp(job_information['start_time_sec'], tz=datetime.timezone.utc) + _end_time = datetime.datetime.fromtimestamp(job_information['end_time_sec'], tz=datetime.timezone.utc) + self.time = TimestampPair(_start_time, _end_time) + + self.modules = report.modules.keys() + + # TODO: Should I search in self.modules or in report.records? + # ! 
+        #   All DataFrames are materialised eagerly here
+        self.report = report
+        self.posix_df = report.records['POSIX'].to_df() if 'POSIX' in self.modules else None
+        self.stdio_df = report.records['STDIO'].to_df() if 'STDIO' in self.modules else None
+        self.mpiio_df = report.records['MPI-IO'].to_df() if 'MPI-IO' in self.modules else None
+
+        self.lustre_df = report.records['LUSTRE'].to_df() if 'LUSTRE' in self.modules else None
+
+        self.dxt_posix = report.records['DXT_POSIX'].to_df() if 'DXT_POSIX' in self.modules else None
+        self.dxt_mpiio = report.records['DXT_MPIIO'].to_df() if 'DXT_MPIIO' in self.modules else None
+
+        self.hints = []
+        self.files = {}
+
+
+@dataclass
+class AggregatedDarshanTraces(AbstractDarshanTrace):
+
+    traces: List[DarshanTrace] = field(default_factory=list)
+    # reports: List[darshan.DarshanReport] = field(default_factory=list)
+
+    def __init__(self, traces: List[DarshanTrace]):
+        assert len(traces) > 0
+        self.traces = traces
+
+        reports = [current_trace.report for current_trace in traces]
+        self.name_records = dict()
+        for report in reports:
+            self.name_records |= report.name_records
+
+    def aggregate_traces(self):
+        self.modules = set()
+        self.files = dict()
+        for current_trace in self.traces:
+            # merge this trace's modules into the aggregate in place
+            self.modules.update(current_trace.modules)
+
+            self.total_write_size_stdio += current_trace.total_write_size_stdio
+            self.total_size_stdio += current_trace.total_size_stdio
+
+            self.total_write_size_posix += current_trace.total_write_size_posix
+            self.total_read_size_posix += current_trace.total_read_size_posix
+            self.total_size_posix += current_trace.total_size_posix
+
+            self.total_write_size_mpiio += current_trace.total_write_size_mpiio
+            self.total_read_size_mpiio += current_trace.total_read_size_mpiio
+            self.total_size_mpiio += current_trace.total_size_mpiio
+
+            self.total_size += current_trace.total_size
+            self.total_files += current_trace.total_files
+            ###
+            self.max_read_offset = max(self.max_read_offset, current_trace.max_read_offset)
+            self.max_write_offset = max(self.max_write_offset, current_trace.max_write_offset)
+            ###
+
+            self.total_files_stdio += current_trace.total_files_stdio
+            self.total_files_posix += current_trace.total_files_posix
+            self.total_files_mpiio += current_trace.total_files_mpiio
+
+            self.files |= current_trace.files
+
+            self.total_reads += current_trace.total_reads
+            self.total_writes += current_trace.total_writes
+            self.total_operations += current_trace.total_operations
+            self.total_read_size += current_trace.total_read_size
+            self.total_written_size += current_trace.total_written_size
+            self.total_posix_size += current_trace.total_posix_size
+            self.total_reads_small += current_trace.total_reads_small
+            self.total_writes_small += current_trace.total_writes_small
+
+            self.total_mem_not_aligned += current_trace.total_mem_not_aligned
+            self.total_file_not_aligned += current_trace.total_file_not_aligned
+
+            self.read_consecutive += current_trace.read_consecutive
+            self.read_sequential += current_trace.read_sequential
+            self.read_random += current_trace.read_random
+            self.write_consecutive += current_trace.write_consecutive
+            self.write_sequential += current_trace.write_sequential
+            self.write_random += current_trace.write_random
+
+            self.total_shared_reads += current_trace.total_shared_reads
+            self.total_shared_reads_small += current_trace.total_shared_reads_small
+            self.total_shared_writes += current_trace.total_shared_writes
+            self.total_shared_writes_small +=
current_trace.total_shared_writes_small + + self.count_long_metadata += current_trace.count_long_metadata + + self.posix_shared_data_imbalance_stragglers_count += current_trace.posix_shared_data_imbalance_stragglers_count + + self.has_hdf5_extension = self.has_hdf5_extension or current_trace.has_hdf5_extension + + self.mpiio_nb_reads += current_trace.mpiio_nb_reads + self.mpiio_nb_writes += current_trace.mpiio_nb_writes + def log_relation_check(): # TODO: Ensure that all logs are from a single job, generated at the same time, from the same executable and using the same library version pass def handler(): - console = init_console() + console = config.init_console() insights_start_time = time.time() - log_path = args.log_paths[0] # TODO: A single file rn - log = darshanll.log_open(log_path) - - modules = darshanll.log_get_modules(log) - - information = darshanll.log_get_job(log) - - trace_path = args.log_paths[0] # TODO: A single file rn - darshan.enable_experimental() library_version = darshanll.get_lib_version() - # TODO: Can this be put in a with block? - log = darshanll.log_open(trace_path) - information = darshanll.log_get_job(log) - darshanll.log_close(log) + # trace_path = args.log_paths[0] # TODO: A single file rn + darshan_traces = [] + - report = darshan.DarshanReport(trace_path) - current_trace = DarshanTrace(trace_path, information, report) + for trace_path in args.log_paths: + log = darshanll.log_open(trace_path) + information = darshanll.log_get_job(log) + darshanll.log_close(log) + + report = darshan.DarshanReport(trace_path) + current_trace = DarshanTrace(trace_path, information, report) + darshan_traces.append(current_trace) # # Leave this as is for now # # Make sure log format is of the same version - # filename = args.log_path - # # check_log_version(console, args.log_path, log_version, library_version) + # filename = args.trace_path + # # check_log_version(console, args.trace_path, log_version, library_version) # + # Compute values for each trace + for current_trace in darshan_traces: + current_trace.generate_dxt_posix_rw_df() + current_trace.calculate_insights() + current_trace.files_stuff() + # current_trace.check_stdio() + # current_trace.check_mpiio() + current_trace.something() + current_trace.small_operation_calculation() + current_trace.posix_alignment() + current_trace.posix_redundant_reads() + current_trace.posix_random_check() + current_trace.posix_shared_file() + current_trace.posix_long_metadata() + current_trace.posix_stragglers() + current_trace.posix_stragglers2() + current_trace.posix_imbalance() + current_trace.hdf5_check() + current_trace.mpiio_non_blocking() + current_trace.CHECKnumber_of_aggregators() + current_trace.something_else() + # current_trace.generate_insights() + + # Create aggregated trace + aggregated_trace = AggregatedDarshanTraces(traces=darshan_traces) + aggregated_trace.aggregate_traces() + aggregated_trace.generate_insights() + - current_trace.generate_dxt_posix_rw_df() - current_trace.calculate_insights() - current_trace.files_stuff() - current_trace.check_stdio() - current_trace.check_mpiio() - current_trace.something() - current_trace.small_operation_calculation() - current_trace.posix_alignment() - current_trace.posix_redundant_reads() - current_trace.posix_random_check() - current_trace.posix_shared_file() - current_trace.posix_long_metadata() - current_trace.posix_stragglers() - current_trace.posix_stragglers2() - current_trace.posix_imbalance() - current_trace.hdf5_check() - current_trace.mpiio_non_blocking() - 
current_trace.CHECKnumber_of_aggregators() - current_trace.something_else() insights_end_time = time.time() # Version 3.4.1 of py-darshan changed the contents on what is reported in 'job' + job_end, job_start = set_job_time(report) + + print_insights(console, current_trace, insights_end_time, insights_start_time, job_end, job_start, trace_path, + report) + + export_results(console, trace_path, report) + + +def set_job_time(report): if 'start_time' in report.metadata['job']: job_start = datetime.datetime.fromtimestamp(report.metadata['job']['start_time'], datetime.timezone.utc) job_end = datetime.datetime.fromtimestamp(report.metadata['job']['end_time'], datetime.timezone.utc) else: job_start = datetime.datetime.fromtimestamp(report.metadata['job']['start_time_sec'], datetime.timezone.utc) job_end = datetime.datetime.fromtimestamp(report.metadata['job']['end_time_sec'], datetime.timezone.utc) + return job_end, job_start - console.print() +def export_results(console, log_path, report): + # Export to HTML, SVG, and CSV + trace_name = os.path.basename(log_path).replace('.darshan', '') + out_dir = args.export_dir if args.export_dir != "" else os.getcwd() + module.export_html(console, out_dir, trace_name) + module.export_svg(console, out_dir, trace_name) + module.export_csv(out_dir, trace_name, report.metadata['job']['jobid']) + + +def print_insights(console, current_trace, insights_end_time, insights_start_time, job_end, job_start, log_path, + report): + console.print() console.print( Panel( '\n'.join([ @@ -908,17 +1131,7 @@ def handler(): padding=1 ) ) - console.print() - - display_content(console) - display_thresholds(console) - display_footer(console, insights_start_time, insights_end_time) - - # Export to HTML, SVG, and CSV - trace_name = os.path.basename(log_path).replace('.darshan', '') - out_dir = args.export_dir if args.export_dir != "" else os.getcwd() - - export_html(console, out_dir, trace_name) - export_svg(console, out_dir, trace_name) - export_csv(out_dir, trace_name, report.metadata['job']['jobid']) + module.display_content(console) + module.display_thresholds(console) + module.display_footer(console, insights_start_time, insights_end_time)
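
Two of the TODOs left open in this patch lend themselves to short sketches. First, log_relation_check() is still a stub; one possible implementation (not part of the patch) compares the job metadata that DarshanTrace already carries and refuses to aggregate traces that come from different jobs, executables, or Darshan log versions:

    from typing import List

    def log_relation_check(traces: List['DarshanTrace']) -> None:
        """Refuse to aggregate traces that do not belong to the same job."""
        if not traces:
            raise ValueError('no traces to check')
        first = traces[0]
        for trace in traces[1:]:
            if trace.jobid != first.jobid:
                raise ValueError(f'{trace.path}: job id {trace.jobid} != {first.jobid}')
            if trace.exe != first.exe:
                raise ValueError(f'{trace.path}: executable differs from {first.exe}')
            if trace.log_ver != first.log_ver:
                raise ValueError(f'{trace.path}: log version {trace.log_ver} != {first.log_ver}')

Second, the handler still opens and closes the low-level log by hand ("TODO: Can this be put in a with block?"). A small context manager around darshanll.log_open/log_close would answer that question; again, this is a sketch rather than part of the patch:

    import contextlib
    import darshan.backend.cffi_backend as darshanll

    @contextlib.contextmanager
    def open_darshan_log(path):
        log = darshanll.log_open(path)
        try:
            yield log
        finally:
            darshanll.log_close(log)

    # with open_darshan_log(trace_path) as log:
    #     information = darshanll.log_get_job(log)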