From e208d5677c6837d590b81cb03847c0b9de100765 Mon Sep 17 00:00:00 2001 From: Clayton Thorrez Date: Sun, 22 Sep 2024 23:26:55 -0700 Subject: [PATCH] Accelerate Bradley Terry MLE model fitting (#3523) --- fastchat/serve/monitor/elo_analysis.py | 261 ++------------- fastchat/serve/monitor/rating_systems.py | 385 +++++++++++++++++++++++ 2 files changed, 404 insertions(+), 242 deletions(-) create mode 100644 fastchat/serve/monitor/rating_systems.py diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py index bea808fc5d..6a16c92184 100644 --- a/fastchat/serve/monitor/elo_analysis.py +++ b/fastchat/serve/monitor/elo_analysis.py @@ -17,120 +17,18 @@ from fastchat.model.model_registry import get_model_info from fastchat.serve.monitor.basic_stats import get_log_files from fastchat.serve.monitor.clean_battle_data import clean_battle_data +from fastchat.serve.monitor.rating_systems import ( + compute_elo, + compute_bt, + compute_style_control, + compute_bootstrap_elo, + compute_bootstrap_bt, + compute_bootstrap_style_control, +) pd.options.display.float_format = "{:.2f}".format -STYLE_CONTROL_ELEMENTS_V1 = [ - "sum_assistant_a_tokens", - "header_count_a", - "list_count_a", - "bold_count_a", - "sum_assistant_b_tokens", - "header_count_b", - "list_count_b", - "bold_count_b", -] - - -def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000): - rating = defaultdict(lambda: INIT_RATING) - - for rd, model_a, model_b, winner in battles[ - ["model_a", "model_b", "winner"] - ].itertuples(): - ra = rating[model_a] - rb = rating[model_b] - ea = 1 / (1 + BASE ** ((rb - ra) / SCALE)) - eb = 1 / (1 + BASE ** ((ra - rb) / SCALE)) - if winner == "model_a": - sa = 1 - elif winner == "model_b": - sa = 0 - elif winner == "tie" or winner == "tie (bothbad)": - sa = 0.5 - else: - raise Exception(f"unexpected vote {winner}") - rating[model_a] += K * (sa - ea) - rating[model_b] += K * (1 - sa - eb) - - return dict(rating) - - -def get_bootstrap_result(battles, func_compute_elo, num_round=1000): - rows = [] - for i in tqdm(range(num_round), desc="bootstrap"): - tmp_battles = battles.sample(frac=1.0, replace=True) - rows.append(func_compute_elo(tmp_battles)) - df = pd.DataFrame(rows) - return df[df.median().sort_values(ascending=False).index] - - -def compute_elo_mle_with_tie( - df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None -): - from sklearn.linear_model import LogisticRegression - - ptbl_a_win = pd.pivot_table( - df[df["winner"] == "model_a"], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) - ptbl_tie = pd.pivot_table( - df[df["winner"].isin(["tie", "tie (bothbad)"])], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) - ptbl_tie = ptbl_tie + ptbl_tie.T - ptbl_b_win = pd.pivot_table( - df[df["winner"] == "model_b"], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) - ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie - - models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index) - - p = len(models) - X = np.zeros([p * (p - 1) * 2, p]) - Y = np.zeros(p * (p - 1) * 2) - - cur_row = 0 - sample_weights = [] - for m_a in ptbl_win.index: - for m_b in ptbl_win.columns: - if m_a == m_b: - continue - # if nan skip - if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]): - continue - X[cur_row, models[m_a]] = +math.log(BASE) - X[cur_row, models[m_b]] = -math.log(BASE) - Y[cur_row] = 1.0 - sample_weights.append(ptbl_win.loc[m_a, m_b]) - - X[cur_row + 
1, models[m_a]] = math.log(BASE) - X[cur_row + 1, models[m_b]] = -math.log(BASE) - Y[cur_row + 1] = 0.0 - sample_weights.append(ptbl_win.loc[m_b, m_a]) - cur_row += 2 - X = X[:cur_row] - Y = Y[:cur_row] - - lr = LogisticRegression(fit_intercept=False, penalty=None) - lr.fit(X, Y, sample_weight=sample_weights) - elo_scores = SCALE * lr.coef_[0] + INIT_RATING - if "mixtral-8x7b-instruct-v0.1" in models.index: - elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]] - return pd.Series(elo_scores, index=models.index).sort_values(ascending=False) - - def get_median_elo_from_bootstrap(bootstrap_df): median = dict(bootstrap_df.quantile(0.5)) median = {k: int(v + 0.5) for k, v in median.items()} @@ -411,129 +309,6 @@ def outlier_detect( return battles -def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000): - from sklearn.linear_model import LogisticRegression - - p = len(models.index) - - lr = LogisticRegression(fit_intercept=False) - if indices: - lr.fit(X[indices], Y[indices]) - else: - lr.fit(X, Y) - - elo_scores = SCALE * lr.coef_[0] + INIT_RATING - # calibrate llama-13b to 800 if applicable - if "mixtral-8x7b-instruct-v0.1" in models.index: - elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]] - return ( - pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False), - lr.coef_[0][p:], - ) - - -def construct_style_matrices( - df, - BASE=10, - apply_ratio=[1, 1, 1, 1], - style_elements=STYLE_CONTROL_ELEMENTS_V1, - add_one=True, -): - models = pd.concat([df["model_a"], df["model_b"]]).unique() - models = pd.Series(np.arange(len(models)), index=models) - - # duplicate battles - df = pd.concat([df, df], ignore_index=True) - p = len(models.index) - n = df.shape[0] - assert len(style_elements) % 2 == 0 - k = int(len(style_elements) / 2) - - X = np.zeros([n, p + k]) - X[np.arange(n), models[df["model_a"]]] = +math.log(BASE) - X[np.arange(n), models[df["model_b"]]] = -math.log(BASE) - - # creates turn each of the specified column in "conv_metadata" into a vector - style_vector = np.array( - [ - df.conv_metadata.map( - lambda x: x[element] - if type(x[element]) is int - else sum(x[element].values()) - ).tolist() - for element in style_elements - ] - ) - - style_diff = (style_vector[:k] - style_vector[k:]).astype(float) - style_sum = (style_vector[:k] + style_vector[k:]).astype(float) - - if add_one: - style_sum = style_sum + np.ones(style_diff.shape) - - apply_ratio = np.flatnonzero(apply_ratio) - - style_diff[apply_ratio] /= style_sum[ - apply_ratio - ] # Apply ratio where necessary (length, etc) - - style_mean = np.mean(style_diff, axis=1) - style_std = np.std(style_diff, axis=1) - - X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T - - # one A win => two A win - Y = np.zeros(n) - Y[df["winner"] == "model_a"] = 1.0 - - # one tie => one A win + one B win - # find tie + tie (both bad) index - tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)") - tie_idx[len(tie_idx) // 2 :] = False - Y[tie_idx] = 1.0 - - return X, Y, models - - -def get_bootstrap_result_style_control( - X, Y, battles, models, func_compute_elo, num_round=1000 -): - elos = [] - coefs = [] - assert X.shape[0] % 2 == 0 and X.shape[0] == Y.shape[0] - k = int( - X.shape[0] / 2 - ) # Since we duplicate the battles when constructing X and Y, we don't want to sample the duplicates - - battles_tie_idx = (battles["winner"] == "tie") | ( - battles["winner"] == "tie (bothbad)" - ) - for _ in tqdm(range(num_round), 
desc="bootstrap"): - indices = np.random.choice(list(range(k)), size=(k), replace=True) - - index2tie = np.zeros(k, dtype=bool) - index2tie[battles_tie_idx] = True - - nontie_indices = indices[~index2tie[indices]] - tie_indices = np.concatenate( - [indices[index2tie[indices]], indices[index2tie[indices]] + k] - ) - - _X = np.concatenate([X[nontie_indices], X[nontie_indices], X[tie_indices]]) - _Y = np.concatenate([Y[nontie_indices], Y[nontie_indices], Y[tie_indices]]) - - assert _X.shape == X.shape and _Y.shape == Y.shape - - states = ~_X[:, : len(models)].any(axis=0) - - elo, coef = func_compute_elo(_X, _Y, models=models[~states]) - elos.append(elo) - coefs.append(coef) - - df = pd.DataFrame(elos) - return df[df.median().sort_values(ascending=False).index], coefs - - def filter_long_conv(row): threshold = 768 for conversation_type in ["conversation_a", "conversation_b"]: @@ -557,6 +332,7 @@ def report_elo_analysis_results( scale=1, filter_func=lambda x: True, style_control=False, + num_cpu=None, ): battles = pd.DataFrame(battles_json) @@ -598,19 +374,18 @@ def report_elo_analysis_results( if rating_system == "bt": if style_control: - X, Y, models = construct_style_matrices(battles) - bootstrap_df, boostrap_coef = get_bootstrap_result_style_control( - X, Y, battles, models, fit_mle_elo, num_round=num_bootstrap + bootstrap_df, boostrap_coef = compute_bootstrap_style_control( + battles, num_round=num_bootstrap ) - elo_rating_final, coef_final = fit_mle_elo(X, Y, models) + elo_rating_final, coef_final = compute_style_control(battles) else: - bootstrap_df = get_bootstrap_result( - battles, compute_elo_mle_with_tie, num_round=num_bootstrap + bootstrap_df = compute_bootstrap_bt( + battles, num_round=num_bootstrap, num_cpu=num_cpu ) - elo_rating_final = compute_elo_mle_with_tie(battles) + elo_rating_final = compute_bt(battles) elif rating_system == "elo": - bootstrap_df = get_bootstrap_result( - battles, compute_elo, num_round=num_bootstrap + bootstrap_df = compute_bootstrap_elo( + battles, num_round=num_bootstrap, num_cpu=num_cpu ) elo_rating_median = get_median_elo_from_bootstrap(bootstrap_df) elo_rating_final = elo_rating_median @@ -715,6 +490,7 @@ def pretty_print_elo_rating(rating): parser.add_argument("--category", nargs="+", default=["full"]) parser.add_argument("--scale", type=float, default=1) parser.add_argument("--style-control", action="store_true") + parser.add_argument("--num-cpu", type=int, default=12) args = parser.parse_args() np.random.seed(42) @@ -753,6 +529,7 @@ def pretty_print_elo_rating(rating): scale=args.scale, filter_func=filter_func, style_control=args.style_control, + num_cpu=args.num_cpu, ) for cat in args.category: diff --git a/fastchat/serve/monitor/rating_systems.py b/fastchat/serve/monitor/rating_systems.py new file mode 100644 index 0000000000..6dda5b5e62 --- /dev/null +++ b/fastchat/serve/monitor/rating_systems.py @@ -0,0 +1,385 @@ +import os +import math +import multiprocessing as mp +from functools import partial +import numpy as np +from scipy.special import expit +from scipy.optimize import minimize +import pandas as pd +from tqdm import tqdm + + +STYLE_CONTROL_ELEMENTS_V1 = [ + "sum_assistant_a_tokens", + "header_count_a", + "list_count_a", + "bold_count_a", + "sum_assistant_b_tokens", + "header_count_b", + "list_count_b", + "bold_count_b", +] + + +def get_matchups_models(df): + n_rows = len(df) + model_indices, models = pd.factorize(pd.concat([df["model_a"], df["model_b"]])) + matchups = np.column_stack([model_indices[:n_rows], model_indices[n_rows:]]) + 
return matchups, models.to_list()
+
+
+def preprocess_for_elo(df):
+    """
+    in Elo we want numpy arrays for matchups and outcomes
+      matchups: int32 (N,2) contains model ids for the competitors in a match
+      outcomes: float64 (N,) contains 1.0, 0.5, or 0.0 representing win, tie, or loss for model_a
+    """
+    matchups, models = get_matchups_models(df)
+    outcomes = np.full(len(df), 0.5)
+    outcomes[df["winner"] == "model_a"] = 1.0
+    outcomes[df["winner"] == "model_b"] = 0.0
+    return matchups, outcomes, models
+
+
+def preprocess_for_bt(df):
+    """in BT we only need the unique (matchup, outcome) sets along with the weights of how often they occur"""
+    n_rows = len(df)
+    # the 3 columns of schedule represent: model_a id, model_b id, outcome_id
+    schedule = np.full((n_rows, 3), fill_value=1, dtype=np.int32)
+    # set the two model cols by mapping the model names to their int ids
+    schedule[:, [0, 1]], models = get_matchups_models(df)
+    # map outcomes to integers (must be same dtype as model ids so it can be in the same array)
+    # model_a win -> 2, tie -> 1 (prefilled by default), model_b win -> 0
+    schedule[df["winner"] == "model_a", 2] = 2
+    schedule[df["winner"] == "model_b", 2] = 0
+    # count the number of occurrences of each observed result
+    matchups_outcomes, weights = np.unique(schedule, return_counts=True, axis=0)
+    matchups = matchups_outcomes[:, [0, 1]]
+    # map 2 -> 1.0, 1 -> 0.5, 0 -> 0.0 which will be used as labels during optimization
+    outcomes = matchups_outcomes[:, 2].astype(np.float64) / 2.0
+    weights = weights.astype(np.float64)
+    # each possible result is weighted according to the number of times it occurred in the dataset
+    return matchups, outcomes, models, weights
+
+
+def preprocess_for_style(
+    df,
+    apply_ratio=[1, 1, 1, 1],
+    style_elements=STYLE_CONTROL_ELEMENTS_V1,
+    add_one=True,
+):
+    matchups, outcomes, models = preprocess_for_elo(
+        df
+    )  # this can use the same preprocessing as Elo
+
+    n = matchups.shape[0]
+    k = int(len(style_elements) / 2)
+
+    def extract_style_feature(x, feature):
+        val = x[feature]
+        if isinstance(val, int):
+            return val
+        else:
+            return sum(val.values())
+
+    style_vector = np.zeros(shape=(2 * k, n), dtype=np.int32)
+    for idx, element in enumerate(style_elements):
+        style_vector[idx, :] = df.conv_metadata.map(
+            partial(extract_style_feature, feature=element)
+        ).values
+    style_vector = np.ascontiguousarray(style_vector)
+
+    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
+    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)
+
+    if add_one:
+        style_sum = style_sum + np.ones(style_diff.shape)
+
+    apply_ratio = np.flatnonzero(apply_ratio)
+
+    # Apply ratio where necessary (length, etc)
+    style_diff[apply_ratio] /= style_sum[apply_ratio]
+
+    style_mean = np.mean(style_diff, axis=1)
+    style_std = np.std(style_diff, axis=1)
+    features = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T
+
+    return matchups, features, outcomes, models
+
+
+def fit_vectorized_elo(
+    matchups,
+    outcomes,
+    sample_indices,
+    num_models,
+    k=4.0,
+    base=10.0,
+    init_rating=1000.0,
+    scale=400.0,
+):
+    """fit multiple sets of Elo ratings on different samples of the data at the same time"""
+    alpha = math.log(base) / scale
+    num_samples = sample_indices.shape[1]
+    ratings = np.zeros(shape=(num_samples, num_models), dtype=np.float64)
+    # iterate over the rows of sample_indices; each row holds one matchup index per bootstrap sample
+    sample_range = np.arange(num_samples)
+    for matchup_indices in sample_indices:
+        model_a_indices = 
matchups[matchup_indices, 0] + model_b_indices = matchups[matchup_indices, 1] + model_a_ratings = ratings[sample_range, model_a_indices] + model_b_ratings = ratings[sample_range, model_b_indices] + sample_outcomes = outcomes[matchup_indices] + probs = expit(alpha * (model_a_ratings - model_b_ratings)) + updates = k * (sample_outcomes - probs) + ratings[sample_range, model_a_indices] += updates + ratings[sample_range, model_b_indices] -= updates + return ratings + init_rating + + +def compute_elo(df, k=4.0, base=10.0, init_rating=1000.0, scale=400.0): + matchups, outcomes, models = preprocess_for_elo(df) + alpha = math.log(base) / scale + ratings = np.full(shape=(len(models),), fill_value=init_rating) + for (model_a_idx, model_b_idx), outcome in zip(matchups, outcomes): + prob = 1.0 / ( + 1.0 + math.exp(alpha * (ratings[model_b_idx] - ratings[model_a_idx])) + ) + update = k * (outcome - prob) + ratings[model_a_idx] += update + ratings[model_b_idx] -= update + return {model: ratings[idx] for idx, model in enumerate(models)} + + +def compute_bootstrap_elo( + df, num_round=100, k=4.0, base=10.0, init_rating=1000.0, scale=400.0 +): + matchups, outcomes, models = preprocess_for_elo(df) + sample_indices = np.random.randint(low=0, high=len(df), size=(len(df), num_round)) + ratings = fit_vectorized_elo( + matchups, outcomes, sample_indices, len(models), k, base, init_rating, scale + ) + df = pd.DataFrame(data=ratings, columns=models) + return df[df.median().sort_values(ascending=False).index] + + +def bt_loss_and_grad(ratings, matchups, outcomes, weights, alpha=1.0): + matchup_ratings = ratings[matchups] + logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1]) + probs = expit(logits) + # this form naturally counts a draw as half a win and half a loss + loss = -( + (np.log(probs) * outcomes + np.log(1.0 - probs) * (1.0 - outcomes)) * weights + ).sum() + matchups_grads = -alpha * (outcomes - probs) * weights + model_grad = np.zeros_like(ratings) + # aggregate gradients at the model level using the indices in matchups + np.add.at( + model_grad, + matchups[:, [0, 1]], + matchups_grads[:, None] * np.array([1.0, -1.0], dtype=np.float64), + ) + return loss, model_grad + + +def fit_bt(matchups, outcomes, weights, n_models, alpha, tol=1e-6): + initial_ratings = np.zeros(n_models, dtype=np.float64) + result = minimize( + fun=bt_loss_and_grad, + x0=initial_ratings, + args=(matchups, outcomes, weights, alpha), + jac=True, + method="L-BFGS-B", + options={"disp": False, "maxiter": 100, "gtol": tol}, + ) + return result["x"] + + +def scale_and_offset( + ratings, + models, + scale=400, + init_rating=1000, + baseline_model="mixtral-8x7b-instruct-v0.1", + baseline_rating=1114, +): + """convert ratings from the natural scale to the Elo rating scale with an anchored baseline""" + scaled_ratings = (ratings * scale) + init_rating + if baseline_model in models: + baseline_idx = models.index(baseline_model) + scaled_ratings += baseline_rating - scaled_ratings[..., [baseline_idx]] + return scaled_ratings + + +def compute_bt(df, base=10.0, scale=400.0, init_rating=1000, tol=1e-6): + matchups, outcomes, models, weights = preprocess_for_bt(df) + ratings = fit_bt(matchups, outcomes, weights, len(models), math.log(base), tol) + scaled_ratings = scale_and_offset(ratings, models, scale, init_rating=init_rating) + return pd.Series(scaled_ratings, index=models).sort_values(ascending=False) + + +def compute_bootstrap_bt( + battles, + num_round, + base=10.0, + scale=400.0, + init_rating=1000.0, + tol=1e-6, + num_cpu=None, 
+):
+    matchups, outcomes, models, weights = preprocess_for_bt(battles)
+    # bootstrap sample the unique outcomes and their counts directly using the multinomial distribution
+    rng = np.random.default_rng(seed=0)
+    idxs = rng.multinomial(
+        n=len(battles), pvals=weights / weights.sum(), size=(num_round)
+    )
+    # only the distribution over their occurrence counts changes between samples (and a count can be 0)
+    boot_weights = idxs.astype(np.float64) / len(battles)
+
+    # the only thing different across samples is the distribution of weights
+    bt_fn = partial(
+        fit_bt, matchups, outcomes, n_models=len(models), alpha=np.log(base), tol=tol
+    )
+    with mp.Pool(num_cpu if num_cpu else os.cpu_count()) as pool:
+        results = list(tqdm(pool.imap_unordered(bt_fn, boot_weights), total=num_round))
+
+    ratings = np.array(results)
+    scaled_ratings = scale_and_offset(ratings, models, scale, init_rating)
+    df = pd.DataFrame(scaled_ratings, columns=models)
+    return df[df.median().sort_values(ascending=False).index]
+
+
+DIFF_MASK = np.array(
+    [1.0, -1.0], dtype=np.float64
+)  # created globally to avoid the instantiation cost on each call
+
+
+def contextual_bt_loss_and_grad(
+    params,
+    n_competitors,
+    matchups,
+    features,
+    outcomes,
+    alpha=1.0,
+    reg=1.0,
+    half_reg=0.5,
+):
+    reg_loss = half_reg * np.inner(params, params)
+
+    # Split params into ratings and feature parameters
+    ratings = params[:n_competitors]
+    feature_params = params[n_competitors:]
+
+    matchup_ratings = ratings[matchups]
+    bt_logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
+    context_logits = np.dot(features, feature_params)
+    probs = expit(bt_logits + context_logits)
+    loss = (
+        -((np.log(probs) * outcomes + np.log(1.0 - probs) * (1.0 - outcomes))).sum()
+        + reg_loss
+    )
+
+    error = outcomes - probs
+    grad = reg * params  # initialize the grad as the regularization grad
+    matchups_grads = -alpha * error
+    np.add.at(
+        grad[:n_competitors], matchups[:, [0, 1]], matchups_grads[:, None] * DIFF_MASK
+    )
+    grad[n_competitors:] -= np.dot(features.T, error)
+    return loss, grad
+
+
+# note on regularization:
+# the default reg is 0.5 rather than the LogisticRegression default of 1.0 because
+# the original implementation duplicated every matchup, which doubled the ratio of
+# log loss to reg loss; halving reg in this non-duplicated version keeps the fits equivalent
+def fit_contextual_bt(
+    matchups,
+    features,
+    outcomes,
+    models,
+    idxs=None,
+    alpha=math.log(10.0),
+    reg=0.5,
+    tol=1e-6,
+):
+    n_features = features.shape[1]
+    n_models = len(models)
+    initial_params = np.zeros(n_models + n_features, dtype=np.float64)
+    half_reg = reg / 2.0
+
+    # the optional sample idxs allow fitting on a bootstrap sample of the dataset
+    if idxs is not None:
+        matchups, features, outcomes = matchups[idxs], features[idxs], outcomes[idxs]
+
+    result = minimize(
+        fun=contextual_bt_loss_and_grad,
+        x0=initial_params,
+        args=(n_models, matchups, features, outcomes, alpha, reg, half_reg),
+        jac=True,
+        method="L-BFGS-B",
+        options={"disp": False, "maxiter": 100, "gtol": tol},
+    )
+    return result["x"]
+
+
+def compute_style_control(
+    df, alpha=math.log(10.0), reg=0.5, init_rating=1000.0, scale=400.0, tol=1e-6
+):
+    matchups, features, outcomes, models = preprocess_for_style(df)
+    ratings_params = fit_contextual_bt(
+        matchups,
+        features,
+        outcomes,
+        models=models,
+        alpha=alpha,
+        reg=reg,
+        tol=tol,
+    )
+    ratings = ratings_params[: len(models)]
+    params = ratings_params[len(models) :]
+    scaled_ratings = 
scale_and_offset(ratings, models, scale, init_rating) + scaled_ratings = pd.Series(scaled_ratings, index=models).sort_values( + ascending=False + ) + return scaled_ratings, params + + +def compute_bootstrap_style_control( + df, + num_round, + alpha=math.log(10.0), + reg=0.5, + init_rating=1000.0, + scale=400.0, + tol=1e-6, + num_cpu=None, +): + matchups, features, outcomes, models = preprocess_for_style(df) + + contextual_bt_fn = partial( + fit_contextual_bt, + matchups, + features, + outcomes, + models, + alpha=alpha, + reg=reg, + tol=tol, + ) + + boot_idxs = np.random.randint( + low=0, high=matchups.shape[0], size=(num_round, matchups.shape[0]) + ) + + with mp.Pool(num_cpu if num_cpu else os.cpu_count()) as pool: + results = list( + tqdm(pool.imap_unordered(contextual_bt_fn, boot_idxs), total=num_round) + ) + + ratings_params = np.array(results) + ratings = ratings_params[:, : len(models)] + params = ratings_params[:, len(models) :] + scaled_ratings = scale_and_offset(ratings, models, scale, init_rating) + df = pd.DataFrame(scaled_ratings, columns=models) + return df[df.median().sort_values(ascending=False).index], params
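
Usage sketch (illustrative only, not part of the patch above): the new module is driven entirely by a battles DataFrame with model_a, model_b, and winner columns (plus conv_metadata for the style-control variants). The model names and battle records below are made up for the example; the bootstrap call starts a multiprocessing pool, so on spawn-based platforms it should run under a __main__ guard.

import pandas as pd

from fastchat.serve.monitor.rating_systems import compute_bt, compute_bootstrap_bt

if __name__ == "__main__":
    # toy battle log: the win pattern is cyclic so the unregularized BT fit stays bounded
    battles = pd.DataFrame(
        {
            "model_a": ["model-x", "model-y", "model-z", "model-x"],
            "model_b": ["model-y", "model-z", "model-x", "model-y"],
            "winner": ["model_a", "model_a", "model_a", "tie"],
        }
    )

    ratings = compute_bt(battles)  # pd.Series of Elo-scaled ratings, sorted descending
    boot = compute_bootstrap_bt(battles, num_round=100, num_cpu=2)  # one row per bootstrap sample
    print(ratings)
    print(boot.quantile([0.025, 0.5, 0.975]))  # rough per-model confidence intervals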
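
A quick way to sanity-check the analytic gradient that the L-BFGS-B fit relies on is a central finite-difference comparison against bt_loss_and_grad; the tiny schedule, weights, and ratings below are made up for the illustration.

import math

import numpy as np

from fastchat.serve.monitor.rating_systems import bt_loss_and_grad

matchups = np.array([[0, 1], [1, 2], [2, 0]], dtype=np.int32)  # model id pairs
outcomes = np.array([1.0, 0.5, 0.0])  # win, tie, loss for the first model in each pair
weights = np.array([3.0, 1.0, 2.0])  # how often each unique result occurred
ratings = np.array([0.10, -0.20, 0.05])
alpha = math.log(10.0)

loss, grad = bt_loss_and_grad(ratings, matchups, outcomes, weights, alpha)

eps = 1e-6
for i in range(len(ratings)):
    bump = np.zeros_like(ratings)
    bump[i] = eps
    lo, _ = bt_loss_and_grad(ratings - bump, matchups, outcomes, weights, alpha)
    hi, _ = bt_loss_and_grad(ratings + bump, matchups, outcomes, weights, alpha)
    assert abs((hi - lo) / (2 * eps) - grad[i]) < 1e-4  # analytic and numeric gradients agree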