
Commit

documentation
sbordt committed Apr 5, 2024
1 parent 18c345d commit 140fb11
Showing 10 changed files with 555 additions and 95 deletions.
5 changes: 5 additions & 0 deletions docs/_static/custom.css
@@ -0,0 +1,5 @@
/* Custom styles for sphinx_rtd_theme */
.wy-nav-content {
width: 50%;
max-width: 1200px; /* Adjust this value to your preference */
}
20 changes: 9 additions & 11 deletions docs/tabmemcheck.rst → docs/api_reference.rst
@@ -1,43 +1,41 @@
Documentation
API Reference
=============

This is the documentation for the tabmemcheck package.

Tests for tabular datasets (based on csv files)
-----------------------------------------------

.. automodule:: tabmemcheck
:members: run_all_tests, header_test, feature_names_test, row_completion_test, feature_completion_test, first_token_test, sample
:show-inheritance:

Dataset loading (original, perturbed, task, statistical)
--------------------------------------------------------
Tabular dataset loading (original, perturbed, task, statistical)
----------------------------------------------------------------

.. automodule:: tabmemcheck.datasets
:members: load_dataset, load_iris, load_wine, load_adult, load_housing, load_openml_diabetes
:show-inheritance:


LLM Interface
----------------------
LLM
---

.. automodule:: tabmemcheck
:members: LLM_Interface, openai_setup, send_chat_completion, send_completion, set_logging_task, read_chatlog
:members: LLM_Interface, openai_setup, send_chat_completion, send_completion
:show-inheritance:

Analysis
------------------------

.. automodule:: tabmemcheck.analysis
:members:
:members: find_matches, is_in_df, build_first_token, find_most_unique_feature
:show-inheritance:


Utilities
------------------------

.. autoclass:: tabmemcheck.utils
:members:
.. automodule:: tabmemcheck.utils
:members: get_dataset_name, get_delimiter, get_feature_names, load_csv_df, load_csv_rows, load_csv_string, load_csv_array, load_samples, parse_feature_string, parse_feature_stings, levenshtein_cmd, levenshtein_html
:show-inheritance:


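For orientation, here is a minimal usage sketch of the entry points documented above. The csv path and model name are illustrative, and passing a plain model-name string assumes it is resolved to an LLM_Interface internally (as the __llm_setup calls in functions.py below suggest):

import tabmemcheck

csv_file = "iris.csv"  # illustrative path to a tabular dataset
llm = "gpt-3.5-turbo"  # assumption: a model-name string is accepted and resolved internally

# run the full battery of memorization tests documented above
tabmemcheck.run_all_tests(csv_file, llm)

# or run a single test, e.g. the header test
tabmemcheck.header_test(csv_file, llm)
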
7 changes: 6 additions & 1 deletion docs/conf.py
@@ -15,6 +15,7 @@

sys.path.insert(0, os.path.abspath("../tabmemcheck/"))
sys.path.insert(0, os.path.abspath("../tabmemcheck/datasets"))
sys.path.insert(0, os.path.abspath("../examples/"))

import tabmemcheck

@@ -57,4 +58,8 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
html_static_path = ["_static"]


def setup(app):
app.add_css_file("custom.css")
113 changes: 104 additions & 9 deletions docs/index.rst

Large diffs are not rendered by default.

322 changes: 322 additions & 0 deletions examples/tabular-datasets.ipynb

Large diffs are not rendered by default.

43 changes: 24 additions & 19 deletions tabmemcheck/analysis.py
@@ -21,7 +21,7 @@


def string_strip(x):
""" "We always convert all objects (data frames and responses) to strings and strip them of trailing whitespaces."""
"""Convert the input (dataframe, series, or string) to string and strip trailing whitespaces."""
# if x is data frame
if isinstance(x, pd.DataFrame):
x = x.astype(str)
@@ -55,23 +55,25 @@ def validate_partial_row(x, feature_names):


def find_matches(
df,
df: pd.DataFrame,
x,
string_dist_fn=utils.levenshtein_distances,
match_floating_point=True,
strip_quotation_marks=True,
):
"""Find the closest matches between x and all rows in df. By default, we use the levenshtein distance as the distance metric.
"""Find the closest matches between a row x and all rows in the dataframe df. By default, we use the levenshtein distance as the distance metric.
This function can handle a variety of formatting differences between the values in the original data
and LLM responses that should still be counted as equal.
match_floating_point: if True, handes floating point formatting differences, e.g. 0.28 vs. .280 or 172 vs 172.0 (default: True).
strip_quotation_marks: if True, strips quotation marks from the values in df and x (to handle the case where a model responds with "23853", and the value in the data is 23853) (default: True).
x: A string, a pandas dataframe or a pandas Series.
:param df: a pandas dataframe.
:param x: a string, a pandas dataframe or a pandas Series.
:param string_dist_fn: a function that computes the distance between two strings. By default, this is the levenshtein distance.
:param match_floating_point: if True, handles floating point formatting differences, e.g. 0.28 vs. .280 or 172 vs 172.0 (default: True).
:param strip_quotation_marks: if True, strips quotation marks from the values in df and x (to handle the case where a model responds with "23853", and the value in the data is 23853) (default: True).
Returns: the minimum distance and the matching rows in df.
:return: the minimum distance and the matching rows in df.
"""
# x should be a dataframe with a single row, or be convertible to this format
x = validate_partial_row(x, df.columns)
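For illustration, a minimal sketch of calling the revised find_matches. The data values are made up, and x is passed as a single-row dataframe because the exact parsing format for string inputs is not visible in this diff:

import pandas as pd
from tabmemcheck.analysis import find_matches

df = pd.DataFrame({"age": [39, 50], "workclass": ["State-gov", "Private"]})
x = pd.DataFrame({"age": ["39.0"], "workclass": ['"State-gov"']})

# with match_floating_point and strip_quotation_marks enabled (the defaults),
# "39.0" should match 39 and '"State-gov"' should match State-gov
min_dist, matches = find_matches(df, x)
print(min_dist, matches)
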
@@ -207,12 +209,10 @@ def conditional_completion_analysis(csv_file, completions_df):
def levenshtein_distance_t_test(x, y, z, alternative="two-sided", return_dist=False):
"""Test whether x is closer to y than z in Levenshtein distance using a t-test.
x must be a list of stings.
y and z can be either a list of strings or a list of lists of strings.
alternative: argument to scipy.stats.ttest_ind (default: two-sided)
Returns: scipy.stats._result_classes.TtestResult"""
:param x, y, z: a list of strings.
:param alternative: the alternative hypothesis, either 'two-sided', 'greater', or 'less'.
:param return_dist: if True, also return the distances between x and y, and x and z.
:return: scipy.stats._result_classes.TtestResult"""
# convert numpy arrays to lists
if isinstance(x, np.ndarray):
x = x.tolist()
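A sketch of the intended use: testing whether model responses are closer to the true rows than to other rows from the dataset. All strings are made up, and alternative="less" assumes the first sample in the underlying scipy.stats.ttest_ind holds the x-to-y distances:

from tabmemcheck.analysis import levenshtein_distance_t_test

responses = ["5.1,3.5,1.4,0.2", "4.9,3.0,1.4,0.2", "4.7,3.2,1.3,0.2"]   # model completions
true_rows = ["5.1,3.5,1.4,0.2", "4.9,3.1,1.5,0.1", "4.7,3.2,1.3,0.3"]   # rows from the csv file
other_rows = ["6.3,2.9,5.6,1.8", "7.1,3.0,5.9,2.1", "6.5,3.0,5.8,2.2"]  # unrelated rows

result = levenshtein_distance_t_test(responses, true_rows, other_rows, alternative="less")
print(result.pvalue)  # a small p-value suggests the responses are closer to the true rows
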
@@ -254,13 +254,16 @@ def levenshtein_distance_t_test(x, y, z, alternative="two-sided", return_dist=Fa


def build_first_token(csv_file, verbose=False):
"""We construct the first token using the first n digits of every row. The usefulness of this approach comes from the fact that in some datasets, the first token might always be the same.
"""Given a csv file, build a first token that can be used in the first token test.
NOTE: this does not work if the first token is the id of the row, because the id is not always the same. IS THIS ACTUALLY TRUE?
The first token is constructed by taking the first n digits of every row in the csv file (that is, this function determines n).
Using the first n digits improves upon using the first digit on datasets where the first digit is always the same or contains few distinct values.
NOTE: we should always do a prediction test with a gbtree / logistic regression to see if the first token is actually random
Note: This function does NOT check if the constructed first token is random.
Returns: the number of digits that make up the first token.
:param csv_file: the path to the csv file.
:param verbose: if True, print the first tokens and their counts.
:return: the number of digits that make up the first token.
"""
csv_rows = utils.load_csv_rows(csv_file, header=False)
num_rows = len(csv_rows)
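A usage sketch with an illustrative file path:

from tabmemcheck.analysis import build_first_token

# determine how many leading digits of each csv row make up the first token
num_digits = build_first_token("adult.csv", verbose=True)
print(f"first token = first {num_digits} digits of each row")
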
@@ -278,8 +281,10 @@ def find_most_unique_feature(csv_file):


def find_most_unique_feature(csv_file):
"""Find the feature that have the most unique values. This is useful for the feature completion test.
Returns: feature name, fraction of unique values
"""Given a csv file, find the feature that has the most unique values. This is the default feature used for the feature completion test.
:param csv_file: the path to the csv file.
:return: the name of the most unique feature and the fraction of unique values.
"""
feature_names = utils.get_feature_names(csv_file)
df = utils.load_csv_df(csv_file)
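A usage sketch with an illustrative file path:

from tabmemcheck.analysis import find_most_unique_feature

feature_name, frac_unique = find_most_unique_feature("adult.csv")
print(f"{feature_name}: {frac_unique:.1%} unique values")
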
8 changes: 8 additions & 0 deletions tabmemcheck/chat_completion.py
@@ -142,6 +142,7 @@ def row_chat_completion(
num_queries=100,
few_shot=7,
out_file=None,
print_levenshtein=False,
):
"""Row chat completion task. This task ask the LLM to predict the next row in the
csv file, given the previous rows. This task is the basis for the row completion
@@ -167,6 +168,7 @@
few_shot=few_shot,
num_queries=num_queries,
out_file=out_file,
print_levenshtein=print_levenshtein,
)

return test_prefixes, test_suffixes, responses
@@ -252,6 +254,7 @@ def prefix_suffix_chat_completion(
system_prompt: str,
few_shot=None,
num_queries=100,
print_levenshtein=False,
out_file=None,
rng=None,
):
@@ -356,6 +359,11 @@
test_prefixes.append(test_prefix)
test_suffixes.append(test_suffix)
responses.append(response)
# print the levenshtein distance between the true suffix and the response
if print_levenshtein:
print(
utils.levenshtein_cmd(test_suffix, response),
)

# save the results to file
if out_file is not None:
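The new print_levenshtein flag prints utils.levenshtein_cmd(test_suffix, response) for every query. In isolation, that call might look as follows; the strings are made up, and the colored terminal rendering is an assumption based on the _cmd suffix:

import tabmemcheck.utils as utils

# compare a true row against a close-but-not-exact model response
print(utils.levenshtein_cmd("5.1,3.5,1.4,0.2", "5.1,3.5,1.4,0.4"))
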
28 changes: 21 additions & 7 deletions tabmemcheck/functions.py
@@ -295,6 +295,8 @@ def header_test(
:param completion_length: The length of the completions in the few-shot examples (reduce for LLMs with small context windows).
:param few_shot_csv_files: A list of other csv files to be used as few-shot examples.
:param system_prompt: The system prompt to be used.
:return: The header prompt, the actual header completion, and the model response.
"""
llm = __llm_setup(llm)
few_shot_csv_files = __validate_few_shot_files(csv_file, few_shot_csv_files)
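With the documented return value, a call might look like this (path and model name are illustrative; passing a model-name string assumes __llm_setup resolves it):

import tabmemcheck

header_prompt, header_completion, response = tabmemcheck.header_test(
    "adult.csv", "gpt-3.5-turbo"
)
print(response)
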
@@ -391,6 +393,7 @@ def row_completion_test(
few_shot=7,
out_file=None,
system_prompt: str = "default",
print_levenshtein: bool = True,
):
"""Row completion test for memorization. The test resports the number of correctly completed rows.
@@ -401,6 +404,9 @@
:param few_shot: The number of few-shot examples to be used.
:param out_file: Optionally save all queries and responses to a csv file.
:param system_prompt: The system prompt to be used.
:param print_levenshtein: Print a visualization of the Levenshtein distance between the model responses and the actual rows.
:return: the rows, the model responses.
"""
llm = __llm_setup(llm)

@@ -427,22 +433,22 @@

# ask the model to perform row chat completion (execute the prompt)
if llm.chat_mode:
test_prefixes, test_suffixes, responses = row_chat_completion(
_, test_suffixes, responses = row_chat_completion(
llm,
csv_file,
system_prompt,
num_prefix_rows,
num_queries,
few_shot,
out_file,
print_levenshtein,
)
else:
test_prefixes, test_suffixes, responses = row_completion(
_, test_suffixes, responses = row_completion(
llm, csv_file, num_prefix_rows, num_queries, out_file
)

# count the number of exact matches
# NOTE here we assume that the test suffix is a single row that is unique, i.e. no duplicate rows
# count the number of verbatim completed rows
num_exact_matches = 0
for test_suffix, response in zip(test_suffixes, responses):
if test_suffix.strip() in response.strip():
@@ -466,7 +472,7 @@
# + f"{test_result.pvalue:.3f}."
)

return test_prefixes, test_suffixes, responses
return test_suffixes, responses

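After this change, callers unpack two values instead of three. A sketch, with an illustrative path and model name and the argument order assumed from the docstring parameters:

import tabmemcheck

rows, responses = tabmemcheck.row_completion_test(
    "adult.csv", "gpt-3.5-turbo", num_queries=25
)

# the same verbatim-match criterion the test applies internally
num_matches = sum(row.strip() in resp.strip() for row, resp in zip(rows, responses))
print(f"{num_matches}/{len(rows)} rows completed verbatim")
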

####################################################################################
Expand All @@ -492,6 +498,8 @@ def feature_completion_test(
:param few_shot: The number of few-shot examples to be used.
:param out_file: Optionally save all queries and responses to a csv file.
:param system_prompt: The system prompt to be used.
:return: the feature values, the model responses.
"""
llm = __llm_setup(llm)

@@ -540,8 +548,12 @@ def build_prompt(messages):
)

# parse the model responses
response_df = utils.parse_feature_stings(responses, [feature_name])
test_suffix_df = utils.parse_feature_stings(test_suffixes, [feature_name])
response_df = utils.parse_feature_stings(
responses, [feature_name], final_delimiter="\n"
)
test_suffix_df = utils.parse_feature_stings(
test_suffixes, [feature_name], final_delimiter="\n"
)

# count number of exact matches
num_exact_matches = np.sum(
@@ -558,6 +570,8 @@ def build_prompt(messages):
+ bcolors.ENDC
)

return test_suffix_df[feature_name].to_list(), response_df[feature_name].to_list()

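A sketch of the updated return value (illustrative path and model name; by default the test completes the most unique feature, per find_most_unique_feature in analysis.py):

import tabmemcheck

feature_values, responses = tabmemcheck.feature_completion_test(
    "adult.csv", "gpt-3.5-turbo"
)
num_matches = sum(v == r for v, r in zip(feature_values, responses))
print(f"{num_matches}/{len(feature_values)} feature values completed exactly")
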

####################################################################################
# First Token Test
