
Commit

documentation
sbordt committed Apr 5, 2024
1 parent 18c345d commit 140fb11
Showing 10 changed files with 555 additions and 95 deletions.
5 changes: 5 additions & 0 deletions docs/_static/custom.css
@@ -0,0 +1,5 @@
/* Custom styles for sphinx_rtd_theme */
.wy-nav-content {
width: 50%;
max-width: 1200px; /* Adjust this value to your preference */
}
20 changes: 9 additions & 11 deletions docs/tabmemcheck.rst → docs/api_reference.rst
@@ -1,43 +1,41 @@
Documentation
API Reference
=============

This is the documentation for the tabmemcheck package.

Tests for tabular datasets (based on csv files)
-----------------------------------------------

.. automodule:: tabmemcheck
:members: run_all_tests, header_test, feature_names_test, row_completion_test, feature_completion_test, first_token_test, sample
:show-inheritance:

Dataset loading (original, perturbed, task, statistical)
--------------------------------------------------------
Tabular dataset loading (original, perturbed, task, statistical)
----------------------------------------------------------------

.. automodule:: tabmemcheck.datasets
:members: load_dataset, load_iris, load_wine, load_adult, load_housing, load_openml_diabetes
:show-inheritance:


LLM Interface
----------------------
LLM
---

.. automodule:: tabmemcheck
:members: LLM_Interface, openai_setup, send_chat_completion, send_completion, set_logging_task, read_chatlog
:members: LLM_Interface, openai_setup, send_chat_completion, send_completion
:show-inheritance:

Analysis
------------------------

.. automodule:: tabmemcheck.analysis
:members:
:members: find_matches, is_in_df, build_first_token, find_most_unique_feature
:show-inheritance:


Utilities
------------------------

.. autoclass:: tabmemcheck.utils
:members:
.. automodule:: tabmemcheck.utils
:members: get_dataset_name, get_delimiter, get_feature_names, load_csv_df, load_csv_rows, load_csv_string, load_csv_array, load_samples, parse_feature_string, parse_feature_stings, levenshtein_cmd, levenshtein_html
:show-inheritance:


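For orientation, here is a minimal usage sketch of the entry points documented above. The csv path and model name are illustrative, and passing a plain model-name string assumes it is resolved to an LLM_Interface internally (as the __llm_setup calls in functions.py below suggest):

import tabmemcheck

csv_file = "iris.csv"  # illustrative path to a tabular dataset
llm = "gpt-3.5-turbo"  # assumption: a model-name string is accepted and resolved internally

# run the full battery of memorization tests documented above
tabmemcheck.run_all_tests(csv_file, llm)

# or run a single test, e.g. the header test
tabmemcheck.header_test(csv_file, llm)
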
7 changes: 6 additions & 1 deletion docs/conf.py
@@ -15,6 +15,7 @@

sys.path.insert(0, os.path.abspath("../tabmemcheck/"))
sys.path.insert(0, os.path.abspath("../tabmemcheck/datasets"))
sys.path.insert(0, os.path.abspath("../examples/"))

import tabmemcheck

@@ -57,4 +58,8 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
html_static_path = ["_static"]


def setup(app):
app.add_css_file("custom.css")
113 changes: 104 additions & 9 deletions docs/index.rst

Large diffs are not rendered by default.

322 changes: 322 additions & 0 deletions examples/tabular-datasets.ipynb

Large diffs are not rendered by default.

43 changes: 24 additions & 19 deletions tabmemcheck/analysis.py
@@ -21,7 +21,7 @@


def string_strip(x):
""" "We always convert all objects (data frames and responses) to strings and strip them of trailing whitespaces."""
"""Convert the input (dataframe, series, or string) to string and strip trailing whitespaces."""
# if x is data frame
if isinstance(x, pd.DataFrame):
x = x.astype(str)
@@ -55,23 +55,25 @@ def validate_partial_row(x, feature_names):


def find_matches(
df,
df: pd.DataFrame,
x,
string_dist_fn=utils.levenshtein_distances,
match_floating_point=True,
strip_quotation_marks=True,
):
"""Find the closest matches between x and all rows in df. By default, we use the levenshtein distance as the distance metric.
"""Find the closest matches between a row x and all rows in the dataframe df. By default, we use the levenshtein distance as the distance metric.
This function can handle a variety of formatting differences between the values in the original data
and LLM responses that should still be counted as equal.
match_floating_point: if True, handes floating point formatting differences, e.g. 0.28 vs. .280 or 172 vs 172.0 (default: True).
strip_quotation_marks: if True, strips quotation marks from the values in df and x (to handle the case where a model responds with "23853", and the value in the data is 23853) (default: True).
x: A string, a pandas dataframe or a pandas Series.
:param df: a pandas dataframe.
:param x: a string, a pandas dataframe or a pandas Series.
:param string_dist_fn: a function that computes the distance between two strings. By default, this is the levenshtein distance.
:param match_floating_point: if True, handles floating point formatting differences, e.g. 0.28 vs. .280 or 172 vs 172.0 (default: True).
:param strip_quotation_marks: if True, strips quotation marks from the values in df and x (to handle the case where a model responds with "23853", and the value in the data is 23853) (default: True).
Returns: the minimum distance and the matching rows in df.
:return: the minimum distance and the matching rows in df.
"""
# x should be a dataframe with a single row, or be convertible to this format
x = validate_partial_row(x, df.columns)
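For illustration, a minimal sketch of calling the revised find_matches. The data values are made up, and x is passed as a single-row dataframe because the exact parsing format for string inputs is not visible in this diff:

import pandas as pd
from tabmemcheck.analysis import find_matches

df = pd.DataFrame({"age": [39, 50], "workclass": ["State-gov", "Private"]})
x = pd.DataFrame({"age": ["39.0"], "workclass": ['"State-gov"']})

# with match_floating_point and strip_quotation_marks enabled (the defaults),
# "39.0" should match 39 and '"State-gov"' should match State-gov
min_dist, matches = find_matches(df, x)
print(min_dist, matches)
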
@@ -207,12 +209,10 @@ def conditional_completion_analysis(csv_file, completions_df):
def levenshtein_distance_t_test(x, y, z, alternative="two-sided", return_dist=False):
"""Test whether x is closer to y than z in Levenshtein distance using a t-test.
x must be a list of stings.
y and z can be either a list of strings or a list of lists of strings.
alternative: argument to scipy.stats.ttest_ind (default: two-sided)
Returns: scipy.stats._result_classes.TtestResult"""
:param x, y, z: a list of strings.
:param alternative: the alternative hypothesis, either 'two-sided', 'greater', or 'less'.
:param return_dist: if True, also return the distances between x and y, and x and z.
:return: scipy.stats._result_classes.TtestResult"""
# convert numpy arrays to lists
if isinstance(x, np.ndarray):
x = x.tolist()
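A sketch of the intended use: testing whether model responses are closer to the true rows than to other rows from the dataset. All strings are made up, and alternative="less" assumes the first sample in the underlying scipy.stats.ttest_ind holds the x-to-y distances:

from tabmemcheck.analysis import levenshtein_distance_t_test

responses = ["5.1,3.5,1.4,0.2", "4.9,3.0,1.4,0.2", "4.7,3.2,1.3,0.2"]   # model completions
true_rows = ["5.1,3.5,1.4,0.2", "4.9,3.1,1.5,0.1", "4.7,3.2,1.3,0.3"]   # rows from the csv file
other_rows = ["6.3,2.9,5.6,1.8", "7.1,3.0,5.9,2.1", "6.5,3.0,5.8,2.2"]  # unrelated rows

result = levenshtein_distance_t_test(responses, true_rows, other_rows, alternative="less")
print(result.pvalue)  # a small p-value suggests the responses are closer to the true rows
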
@@ -254,13 +254,16 @@ def levenshtein_distance_t_test(x, y, z, alternative="two-sided", return_dist=Fa


def build_first_token(csv_file, verbose=False):
"""We construct the first token using the first n digits of every row. The usefulness of this approach comes from the fact that in some datasets, the first token might always be the same.
"""Given a csv file, build a first token that can be used in the first token test.
NOTE: this does not work if the first token is the id of the row, because the id is not always the same. IS THIS ACTUALLY TRUE?
The first token is constructed by taking the first n digits of every row in the csv file (that is, this function determines n).
Using the first n digits improves upon using the first digit on datasets where the first digit is always the same or contains few distinct values.
NOTE: we should always do a prediction test with a gbtree / logistic regression to see if the first token is actually random
Note: This function does NOT check if the constructed first token is random.
Returns: the number of digits that make up the first token.
:param csv_file: the path to the csv file.
:param verbose: if True, print the first tokens and their counts.
:return: the number of digits that make up the first token.
"""
csv_rows = utils.load_csv_rows(csv_file, header=False)
num_rows = len(csv_rows)
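A usage sketch with an illustrative file path:

from tabmemcheck.analysis import build_first_token

# determine how many leading digits of each csv row make up the first token
num_digits = build_first_token("adult.csv", verbose=True)
print(f"first token = first {num_digits} digits of each row")
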
@@ -278,8 +281,10 @@ def find_most_unique_feature(csv_file):


def find_most_unique_feature(csv_file):
"""Find the feature that have the most unique values. This is useful for the feature completion test.
Returns: feature name, fraction of unique values
"""Given a csv file, find the feature that has the most unique values. This is the default feature used for the feature completion test.
:param csv_file: the path to the csv file.
:return: the name of the most unique feature and the fraction of unique values.
"""
feature_names = utils.get_feature_names(csv_file)
df = utils.load_csv_df(csv_file)
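A usage sketch with an illustrative file path:

from tabmemcheck.analysis import find_most_unique_feature

feature_name, frac_unique = find_most_unique_feature("adult.csv")
print(f"{feature_name}: {frac_unique:.1%} unique values")
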
8 changes: 8 additions & 0 deletions tabmemcheck/chat_completion.py
@@ -142,6 +142,7 @@ def row_chat_completion(
num_queries=100,
few_shot=7,
out_file=None,
print_levenshtein=False,
):
"""Row chat completion task. This task ask the LLM to predict the next row in the
csv file, given the previous rows. This task is the basis for the row completion
@@ -167,6 +168,7 @@
few_shot=few_shot,
num_queries=num_queries,
out_file=out_file,
print_levenshtein=print_levenshtein,
)

return test_prefixes, test_suffixes, responses
@@ -252,6 +254,7 @@ def prefix_suffix_chat_completion(
system_prompt: str,
few_shot=None,
num_queries=100,
print_levenshtein=False,
out_file=None,
rng=None,
):
@@ -356,6 +359,11 @@
test_prefixes.append(test_prefix)
test_suffixes.append(test_suffix)
responses.append(response)
# print the levenshtein distance between the true suffix and the response
if print_levenshtein:
print(
utils.levenshtein_cmd(test_suffix, response),
)

# save the results to file
if out_file is not None:
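The new print_levenshtein flag prints utils.levenshtein_cmd(test_suffix, response) for every query. In isolation, that call might look as follows; the strings are made up, and the colored terminal rendering is an assumption based on the _cmd suffix:

import tabmemcheck.utils as utils

# compare a true row against a close-but-not-exact model response
print(utils.levenshtein_cmd("5.1,3.5,1.4,0.2", "5.1,3.5,1.4,0.4"))
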
28 changes: 21 additions & 7 deletions tabmemcheck/functions.py
@@ -295,6 +295,8 @@ def header_test(
:param completion_length: The length of the completions in the few-shot examples (reduce for LLMs with small context windows).
:param few_shot_csv_files: A list of other csv files to be used as few-shot examples.
:param system_prompt: The system prompt to be used.
:return: The header prompt, the actual header completion, and the model response.
"""
llm = __llm_setup(llm)
few_shot_csv_files = __validate_few_shot_files(csv_file, few_shot_csv_files)
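With the documented return value, a call might look like this (path and model name are illustrative; passing a model-name string assumes __llm_setup resolves it):

import tabmemcheck

header_prompt, header_completion, response = tabmemcheck.header_test(
    "adult.csv", "gpt-3.5-turbo"
)
print(response)
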
@@ -391,6 +393,7 @@ def row_completion_test(
few_shot=7,
out_file=None,
system_prompt: str = "default",
print_levenshtein: bool = True,
):
"""Row completion test for memorization. The test resports the number of correctly completed rows.
@@ -401,6 +404,9 @@
:param few_shot: The number of few-shot examples to be used.
:param out_file: Optionally save all queries and responses to a csv file.
:param system_prompt: The system prompt to be used.
:param print_levenshtein: Print a visualization of the Levenshtein distance between the model responses and the actual rows.
:return: the rows, the model responses.
"""
llm = __llm_setup(llm)

@@ -427,22 +433,22 @@

# ask the model to perform row chat completion (execute the prompt)
if llm.chat_mode:
test_prefixes, test_suffixes, responses = row_chat_completion(
_, test_suffixes, responses = row_chat_completion(
llm,
csv_file,
system_prompt,
num_prefix_rows,
num_queries,
few_shot,
out_file,
print_levenshtein,
)
else:
test_prefixes, test_suffixes, responses = row_completion(
_, test_suffixes, responses = row_completion(
llm, csv_file, num_prefix_rows, num_queries, out_file
)

# count the number of exact matches
# NOTE here we assume that the test suffix is a single row that is unique, i.e. no duplicate rows
# count the number of verbatim completed rows
num_exact_matches = 0
for test_suffix, response in zip(test_suffixes, responses):
if test_suffix.strip() in response.strip():
@@ -466,7 +472,7 @@
# + f"{test_result.pvalue:.3f}."
)

return test_prefixes, test_suffixes, responses
return test_suffixes, responses

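After this change, callers unpack two values instead of three. A sketch, with an illustrative path and model name and the argument order assumed from the docstring parameters:

import tabmemcheck

rows, responses = tabmemcheck.row_completion_test(
    "adult.csv", "gpt-3.5-turbo", num_queries=25
)

# the same verbatim-match criterion the test applies internally
num_matches = sum(row.strip() in resp.strip() for row, resp in zip(rows, responses))
print(f"{num_matches}/{len(rows)} rows completed verbatim")
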

####################################################################################
Expand All @@ -492,6 +498,8 @@ def feature_completion_test(
:param few_shot: The number of few-shot examples to be used.
:param out_file: Optionally save all queries and responses to a csv file.
:param system_prompt: The system prompt to be used.
:return: the feature values, the model responses.
"""
llm = __llm_setup(llm)

@@ -540,8 +548,12 @@ def build_prompt(messages):
)

# parse the model responses
response_df = utils.parse_feature_stings(responses, [feature_name])
test_suffix_df = utils.parse_feature_stings(test_suffixes, [feature_name])
response_df = utils.parse_feature_stings(
responses, [feature_name], final_delimiter="\n"
)
test_suffix_df = utils.parse_feature_stings(
test_suffixes, [feature_name], final_delimiter="\n"
)

# count number of exact matches
num_exact_matches = np.sum(
@@ -558,6 +570,8 @@ def build_prompt(messages):
+ bcolors.ENDC
)

return test_suffix_df[feature_name].to_list(), response_df[feature_name].to_list()

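A sketch of the updated return value (illustrative path and model name; by default the test completes the most unique feature, per find_most_unique_feature in analysis.py):

import tabmemcheck

feature_values, responses = tabmemcheck.feature_completion_test(
    "adult.csv", "gpt-3.5-turbo"
)
num_matches = sum(v == r for v, r in zip(feature_values, responses))
print(f"{num_matches}/{len(feature_values)} feature values completed exactly")
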

####################################################################################
# First Token Test
