From 18c345d37810b1252be8449f3a67a66f2849c4c4 Mon Sep 17 00:00:00 2001
From: Sebastian Bordt <sbordt@posteo.de>
Date: Thu, 4 Apr 2024 17:42:47 +0200
Subject: [PATCH] documentation

---
 docs/conf.py                     |   1 +
 docs/tabmemcheck.rst             |  27 +++++---
 tabmemcheck/__init__.py          |   8 +--
 tabmemcheck/analysis.py          |   2 +-
 tabmemcheck/datasets/__init__.py |   4 +-
 tabmemcheck/datasets/load.py     |  28 +++++---
 tabmemcheck/functions.py         | 111 +++++++++++++++++++------------
 tabmemcheck/llm.py               |   2 +-
 tabmemcheck/utils.py             |   4 +-
 9 files changed, 118 insertions(+), 69 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index c34d78e..f8d5615 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -14,6 +14,7 @@
 import sys
 
 sys.path.insert(0, os.path.abspath("../tabmemcheck/"))
+sys.path.insert(0, os.path.abspath("../tabmemcheck/datasets"))
 
 import tabmemcheck
 
diff --git a/docs/tabmemcheck.rst b/docs/tabmemcheck.rst
index f2e3f74..244f6c3 100644
--- a/docs/tabmemcheck.rst
+++ b/docs/tabmemcheck.rst
@@ -1,20 +1,28 @@
 Documentation
 =============
 
-Main Package Documentation
-------------------------------------------
+This is the documentation for the tabmemcheck package.
+
+Tests for tabular datasets (based on csv files)
+-----------------------------------------------
 
 .. automodule:: tabmemcheck
-   :members:
-   :undoc-members:
+   :members: run_all_tests, header_test, feature_names_test, row_completion_test, feature_completion_test, first_token_test, sample
    :show-inheritance:
 
-Datasets
----------------
+Dataset loading (original, perturbed, task, statistical)
+--------------------------------------------------------
 
 .. automodule:: tabmemcheck.datasets
-   :members:
-   :undoc-members:
+   :members: load_dataset, load_iris, load_wine, load_adult, load_housing, load_openml_diabetes
+   :show-inheritance:
+
+
+LLM Interface
+----------------------
+
+.. automodule:: tabmemcheck
+   :members: LLM_Interface, openai_setup, send_chat_completion, send_completion, set_logging_task, read_chatlog
    :show-inheritance:
 
 Analysis
@@ -22,15 +30,14 @@ Analysis
 
 .. automodule:: tabmemcheck.analysis
    :members:
-   :undoc-members:
    :show-inheritance:
 
+
 Utilities
 ------------------------
 
 .. autoclass:: tabmemcheck.utils
    :members:
-   :undoc-members:
    :show-inheritance:
 
 
diff --git a/tabmemcheck/__init__.py b/tabmemcheck/__init__.py
index c85785c..bd5d733 100644
--- a/tabmemcheck/__init__.py
+++ b/tabmemcheck/__init__.py
@@ -27,7 +27,7 @@
 from .version import __version__
 
 
-def load_default_system_prompts():
+def __load_default_system_prompts():
     """Load the default system prompts from the resources folder."""
     import importlib.resources as resources
     import yaml
@@ -40,7 +40,7 @@ def load_default_system_prompts():
 
 
 # global config object for the module
-class DotDict(dict):
+class __DotDict(dict):
     def __getattr__(self, attr):
         return self.get(attr)
 
@@ -52,10 +52,10 @@ def __delattr__(self, key):
             del self[key]
 
 
-config = DotDict({})
+config = __DotDict({})
 
 # default system prompts from yaml file
-config.system_prompts = load_default_system_prompts()
+config.system_prompts = __load_default_system_prompts()
 
 # default llm options
 config.temperature = 0
diff --git a/tabmemcheck/analysis.py b/tabmemcheck/analysis.py
index ecf6dae..35cc930 100644
--- a/tabmemcheck/analysis.py
+++ b/tabmemcheck/analysis.py
@@ -205,7 +205,7 @@ def conditional_completion_analysis(csv_file, completions_df):
 
 
 def levenshtein_distance_t_test(x, y, z, alternative="two-sided", return_dist=False):
-    """Test whether |x-y| < |x-z| in Levenshtein distance using a t-test.
+    """Test whether x is closer to y than to z in Levenshtein distance using a t-test.
 
     x must be a list of stings.
     y and z can be either a list of strings or a list of lists of strings.
diff --git a/tabmemcheck/datasets/__init__.py b/tabmemcheck/datasets/__init__.py
index 0bdde3f..cdf5803 100644
--- a/tabmemcheck/datasets/__init__.py
+++ b/tabmemcheck/datasets/__init__.py
@@ -5,8 +5,8 @@
 from .load import (
     load_dataset,
     load_iris,
-    # load_titanic,
-    load_openml_diabetes,
+    load_wine,
     load_adult,
     load_housing,
+    load_openml_diabetes,
 )
diff --git a/tabmemcheck/datasets/load.py b/tabmemcheck/datasets/load.py
index 80d3515..ad7e53e 100644
--- a/tabmemcheck/datasets/load.py
+++ b/tabmemcheck/datasets/load.py
@@ -196,12 +196,24 @@ def report_feature_variation(df_original, df_variation):
 def load_dataset(
     csv_file: str,
     yaml_config: str = None,
-    transform=DATASET_PLAIN,
+    transform: str = DATASET_PLAIN,
     permute_columns=False,  # for perturbed transform
     print_stats=False,
     seed=None,
 ):
-    """Generic dataset loading function. Dataset tranformations are specified in a yaml configuration file."""
+    """Load a dataset from a CSV file and apply transformations as specified in a YAML configuration file.
+
+    Args:
+        csv_file (str): The path to the CSV file.
+        yaml_config (str, optional): The path to the YAML configuration file. Defaults to None.
+        transform (str, optional): The type of transformation to apply ('original', 'perturbed', 'task', 'statistical'). Defaults to DATASET_PLAIN.
+        permute_columns (bool, optional): Whether to permute the columns in the perturbed version. Defaults to False.
+        print_stats (bool, optional): Whether to print statistics about the transformation. Defaults to False.
+        seed (optional): The seed for the numpy random number generator. Defaults to None.
+
+    Returns:
+        pandas.DataFrame: The transformed dataset.
+    """
     __validate_inputs(transform)
     rng = np.random.default_rng(seed=seed)
 
@@ -311,25 +323,25 @@ def load_dataset(
 
 
 def load_iris(csv_file: str = "iris.csv", *args, **kwargs):
-    """The Iris dataset. https://archive.ics.uci.edu/ml/datasets/iris"""
+    """Load the Iris dataset (https://archive.ics.uci.edu/ml/datasets/iris)."""
     return load_dataset(csv_file, "iris.yaml", *args, **kwargs)
 
 
 def load_wine(csv_file: str = "iris.csv", *args, **kwargs):
-    """The UCI Wine dataset. https://archive.ics.uci.edu/dataset/109/wine"""
+    """Load the UCI Wine dataset (https://archive.ics.uci.edu/dataset/109/wine)."""
     return load_dataset(csv_file, "wine.yaml", *args, **kwargs)
 
 
 def load_adult(csv_file: str = "adult-train.csv", *args, **kwargs):
-    """The Adult Income dataset. http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html"""
+    """Load the Adult Income dataset (http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html)."""
     return load_dataset(csv_file, "adult.yaml", *args, **kwargs)
 
 
 def load_housing(csv_file: str = "california-housing.csv", *args, **kwargs):
-    """California Housing dataset."""
+    """Load the California Housing dataset (https://inria.github.io/scikit-learn-mooc/python_scripts/datasets_california_housing.html)."""
     return load_dataset(csv_file, "housing.yaml", *args, **kwargs)
 
 
 def load_openml_diabetes(csv_file: str = "openml-diabetes.csv", *args, **kwargs):
-    """The OpenML Diabetes dataset. https://www.openml.org/d/37"""
-    return load_dataset("openml-diabetes.csv", "openml-diabetes.yaml", *args, **kwargs)
+    """Load the OpenML Diabetes dataset (https://www.openml.org/d/37)."""
+    return load_dataset(csv_file, "openml-diabetes.yaml", *args, **kwargs)
diff --git a/tabmemcheck/functions.py b/tabmemcheck/functions.py
index e7a01d5..97e17e9 100644
--- a/tabmemcheck/functions.py
+++ b/tabmemcheck/functions.py
@@ -104,8 +104,15 @@ def run_all_tests(
     csv_file: str,
     llm: Union[LLM_Interface, str],
     few_shot_csv_files=DEFAULT_FEW_SHOT_CSV_FILES,
-    feature_name=None,
+    unique_feature: str = None,
 ):
+    """Run different tests for memorization and prior experience with the content of the csv file.
+
+    :param csv_file: The path to the csv file.
+    :param llm: The language model to be tested.
+    :param few_shot_csv_files: A list of other csv files to be used as few-shot examples.
+    :param unique_feature: The name of the feature to be used for the feature completion test.
+    """
     llm = __llm_setup(llm)
     few_shot_csv_files = __validate_few_shot_files(csv_file, few_shot_csv_files)
     __print_info(csv_file, llm, few_shot_csv_files)
@@ -138,7 +145,7 @@ def run_all_tests(
     tabmem.config.temperature = temp
 
     row_completion_test(csv_file, llm, num_queries=25)
-    feature_completion_test(csv_file, llm, num_queries=25, feature_name=feature_name)
+    feature_completion_test(csv_file, llm, num_queries=25, feature_name=unique_feature)
     first_token_test(csv_file, llm, num_queries=25)
 
 
@@ -154,17 +161,15 @@ def feature_names_test(
     few_shot_csv_files=DEFAULT_FEW_SHOT_CSV_FILES,
     system_prompt: str = "default",
 ):
-    """Test if the model knows the names of the features.
+    """Test if the model knows the names of the features in a csv file.
 
-    The prompt format is:
-        System: <system_prompt>
-        User: Dataset: <dataset_name>
-              Feature 1, Feature 2, ..., Feature n
-        Response: Feature n+1, Feature n+2, ..., Feature m
-
-    This can be modified in the following ways:
-    - Include few-shot examples from other csv files.
+    :param csv_file: The path to the csv file.
+    :param llm: The language model to be tested.
+    :param num_prefix_features: The number of features given to the model as part of the prompt (defaults to 1/4 of the features).
+    :param few_shot_csv_files: A list of other csv files to be used as few-shot examples.
+    :param system_prompt: The system prompt to be used.
     """
+
     llm = __llm_setup(llm)
     few_shot_csv_files = __validate_few_shot_files(csv_file, few_shot_csv_files)
 
@@ -260,10 +265,6 @@ def feature_names_test(
         + response
     )
 
-    # TODO do some sort of evaluation
-    # for example, return true if it completes all but X of the feature names, correcting for upper/lower case
-    # at least do formatted printing of the results
-
 
 ####################################################################################
 # Feature Values
@@ -284,12 +285,16 @@ def header_test(
     system_prompt: str = "default",
     verbose: bool = True,
 ):
-    """Header test, using other csv files as few-shot examples.
+    """Header test for memorization.
 
-    Splits the csv file at random positions in rows 2, 4, 6, and 8. Performs 1 query for each split. Reports the best completion.
+    We split the csv file at random positions in the rows given by split_rows and perform one query for each split. Then we compare the best completion with the actual header.
 
-    NOTE: This test might fail if the header and rows of the csv file are very long, and the model has a small context window.
-    NOTE: in the end, this is the case for all of our tests :)
+    :param csv_file: The path to the csv file.
+    :param llm: The language model to be tested.
+    :param split_rows: The rows at which the csv file is split for the test.
+    :param completion_length: The length of the completions in the few-shot examples (reduce for LLMs with small context windows).
+    :param few_shot_csv_files: A list of other csv files to be used as few-shot examples.
+    :param system_prompt: The system prompt to be used.
     """
     llm = __llm_setup(llm)
     few_shot_csv_files = __validate_few_shot_files(csv_file, few_shot_csv_files)
@@ -372,9 +377,6 @@ def header_test(
 
     return header_prompt, header_completion, llm_completion
 
-    # TODO return true if it completes the given row, as well as the next row.
-    # TODO count the number of correctly completed rows and print this number
-
 
 ####################################################################################
 # Row Completion
@@ -385,12 +387,21 @@ def row_completion_test(
     csv_file: str,
     llm: Union[LLM_Interface, str],
     num_prefix_rows=10,
-    num_queries=50,
+    num_queries=25,
     few_shot=7,
     out_file=None,
     system_prompt: str = "default",
 ):
-    """Row completion test: Complete the next row of the csv file, given the previous rows."""
+    """Row completion test for memorization. The test reports the number of correctly completed rows.
+
+    :param csv_file: The path to the csv file.
+    :param llm: The language model to be tested.
+    :param num_prefix_rows: The number of rows given to the model as part of the prompt.
+    :param num_queries: The number of rows that we test the model on.
+    :param few_shot: The number of few-shot examples to be used.
+    :param out_file: Optionally save all queries and responses to a csv file.
+    :param system_prompt: The system prompt to be used.
+    """
     llm = __llm_setup(llm)
 
     if system_prompt == "default":  # default system prompt?
@@ -437,7 +448,7 @@ def row_completion_test(
         if test_suffix.strip() in response.strip():
             num_exact_matches += 1
 
-    # the statistical test using the levenshtein distance TODO taken out of current version although it works
+    # The statistical test using the Levenshtein distance. Taken out of the current version, although it seems to work in practice.
     # test_prefix_rows = [prefix.split("\n") for prefix in test_prefixes]
     # test_result = analysis.levenshtein_distance_t_test(
     #    responses, test_suffixes, test_prefix_rows
@@ -467,21 +478,20 @@ def feature_completion_test(
     csv_file: str,
     llm: Union[LLM_Interface, str],
     feature_name: str = None,
-    num_queries=100,
+    num_queries=25,
     few_shot=5,
     out_file=None,
     system_prompt: str = "default",
 ):
-    """Feature completion test where we attempt to predict a single rare feature & count the number of exact matches.
-
-    The basic prompt format is the following:
-        System: <system_prompt>
-        User: Feature 1 = value 1, Feature 2 = value 2, ..., Feature n = value n
-        Response: Feature {feature_name} = value
-
-    This can be modified in the following ways:
-        - Include few-shot examples from other csv files.
-        - Don't use the feature names, but only the values.
+    """Feature completion test for memorization. The test reports the number of correctly completed features.
+
+    :param csv_file: The path to the csv file.
+    :param llm: The language model to be tested.
+    :param feature_name: The name of the feature to be used for the test.
+    :param num_queries: The number of feature values that we test the model on.
+    :param few_shot: The number of few-shot examples to be used.
+    :param out_file: Optionally save all queries and responses to a csv file.
+    :param system_prompt: The system prompt to be used.
     """
     llm = __llm_setup(llm)
 
@@ -558,12 +568,23 @@ def first_token_test(
     csv_file: str,
     llm: Union[LLM_Interface, str],
     num_prefix_rows=10,
-    num_queries=100,
+    num_queries=25,
     few_shot=7,
     out_file=None,
     system_prompt: str = "default",
 ):
-    """First token test: Complete the first token of the next row of the csv file, given the previous rows."""
+    """First token test for memorization. We ask the model to complete the first token of the next row of the csv file, given the previous rows. The test reports the number of correctly completed tokens.
+
+    Note that the "first token" is not actually the first token produced by the LLM, but consists of the first n digits of the row. The number of digits is determined by the function build_first_token.
+
+    :param csv_file: The path to the csv file.
+    :param llm: The language model to be tested.
+    :param num_prefix_rows: The number of rows given to the model as part of the prompt.
+    :param num_queries: The number of rows that we test the model on.
+    :param few_shot: The number of few-shot examples to be used.
+    :param out_file: Optionally save all queries and responses to a csv file.
+    :param system_prompt: The system prompt to be used.
+    """
     llm = __llm_setup(llm)
 
     if (
@@ -654,7 +675,7 @@ def first_token_test(
 
 
 ####################################################################################
-# Zero-Knowledge Sampling
+# Sampling
 ####################################################################################
 
 
@@ -680,7 +701,15 @@ def sample(
     out_file=None,
     system_prompt: str = "default",
 ):
-    """zero-shot sampling from the csv file, using few-shot examples from other csv files."""
+    """Ask the model to provide random samples from the csv file.
+
+    :param csv_file: The path to the csv file.
+    :param llm: The language model to be tested.
+    :param num_queries: The desired number of samples.
+    :param few_shot_csv_files: A list of other csv files to be used as few-shot examples.
+    :param out_file: Optionally save all queries and responses to a csv file.
+    :param system_prompt: The system prompt to be used.
+    """
     llm = __llm_setup(llm)
     few_shot_csv_files = __validate_few_shot_files(csv_file, few_shot_csv_files)
 
@@ -703,7 +732,7 @@ def sample(
     )
 
     if len(cond_feature_names) > 0:
-        pass
+        raise NotImplementedError("Conditional sampling not yet supported.")
         # TODO handle the condtional case!
 
     # parse the model responses in a dataframe
diff --git a/tabmemcheck/llm.py b/tabmemcheck/llm.py
index f0c6834..2753455 100644
--- a/tabmemcheck/llm.py
+++ b/tabmemcheck/llm.py
@@ -33,7 +33,7 @@
 
 @dataclass
 class LLM_Interface:
-    """The interface to the language model."""
+    """Generic interface to a language model."""
 
     # if true, the tests use the chat_completion function, otherwise the completion function
     chat_mode = False
diff --git a/tabmemcheck/utils.py b/tabmemcheck/utils.py
index b5ae740..dbfd4df 100644
--- a/tabmemcheck/utils.py
+++ b/tabmemcheck/utils.py
@@ -518,7 +518,7 @@ class bcolors:
 
 
 def levenshtein_cmd(a: str, b: str):
-    """Visualization of the Lehvenshtein distance between a and b, using color codes to be printed in the console."""
+    """Visualization of the Levenshtein distance between a and b, using color codes to be printed in the console."""
     print_string = ""
     for opcode in levenshtein(a, b)[1]:
         op = opcode["type"]
@@ -536,7 +536,7 @@ def levenshtein_cmd(a: str, b: str):
 
 
 def levenshtein_html(a: str, b: str):
-    """HTML visualization of the lehvenshtein distance between a and b."""
+    """HTML visualization of the Levenshtein distance between a and b."""
     html_string = ""
     for opcode in levenshtein(a, b)[1]:
         op = opcode["type"]