From e208d5677c6837d590b81cb03847c0b9de100765 Mon Sep 17 00:00:00 2001 From: Clayton Thorrez Date: Sun, 22 Sep 2024 23:26:55 -0700 Subject: [PATCH] Accelerate Bradley Terry MLE model fitting (#3523) --- fastchat/serve/monitor/elo_analysis.py | 261 ++------------- fastchat/serve/monitor/rating_systems.py | 385 +++++++++++++++++++++++ 2 files changed, 404 insertions(+), 242 deletions(-) create mode 100644 fastchat/serve/monitor/rating_systems.py diff --git a/fastchat/serve/monitor/elo_analysis.py b/fastchat/serve/monitor/elo_analysis.py index bea808fc5d..6a16c92184 100644 --- a/fastchat/serve/monitor/elo_analysis.py +++ b/fastchat/serve/monitor/elo_analysis.py @@ -17,120 +17,18 @@ from fastchat.model.model_registry import get_model_info from fastchat.serve.monitor.basic_stats import get_log_files from fastchat.serve.monitor.clean_battle_data import clean_battle_data +from fastchat.serve.monitor.rating_systems import ( + compute_elo, + compute_bt, + compute_style_control, + compute_bootstrap_elo, + compute_bootstrap_bt, + compute_bootstrap_style_control, +) pd.options.display.float_format = "{:.2f}".format -STYLE_CONTROL_ELEMENTS_V1 = [ - "sum_assistant_a_tokens", - "header_count_a", - "list_count_a", - "bold_count_a", - "sum_assistant_b_tokens", - "header_count_b", - "list_count_b", - "bold_count_b", -] - - -def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000): - rating = defaultdict(lambda: INIT_RATING) - - for rd, model_a, model_b, winner in battles[ - ["model_a", "model_b", "winner"] - ].itertuples(): - ra = rating[model_a] - rb = rating[model_b] - ea = 1 / (1 + BASE ** ((rb - ra) / SCALE)) - eb = 1 / (1 + BASE ** ((ra - rb) / SCALE)) - if winner == "model_a": - sa = 1 - elif winner == "model_b": - sa = 0 - elif winner == "tie" or winner == "tie (bothbad)": - sa = 0.5 - else: - raise Exception(f"unexpected vote {winner}") - rating[model_a] += K * (sa - ea) - rating[model_b] += K * (1 - sa - eb) - - return dict(rating) - - -def get_bootstrap_result(battles, func_compute_elo, num_round=1000): - rows = [] - for i in tqdm(range(num_round), desc="bootstrap"): - tmp_battles = battles.sample(frac=1.0, replace=True) - rows.append(func_compute_elo(tmp_battles)) - df = pd.DataFrame(rows) - return df[df.median().sort_values(ascending=False).index] - - -def compute_elo_mle_with_tie( - df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None -): - from sklearn.linear_model import LogisticRegression - - ptbl_a_win = pd.pivot_table( - df[df["winner"] == "model_a"], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) - ptbl_tie = pd.pivot_table( - df[df["winner"].isin(["tie", "tie (bothbad)"])], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) - ptbl_tie = ptbl_tie + ptbl_tie.T - ptbl_b_win = pd.pivot_table( - df[df["winner"] == "model_b"], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) - ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie - - models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index) - - p = len(models) - X = np.zeros([p * (p - 1) * 2, p]) - Y = np.zeros(p * (p - 1) * 2) - - cur_row = 0 - sample_weights = [] - for m_a in ptbl_win.index: - for m_b in ptbl_win.columns: - if m_a == m_b: - continue - # if nan skip - if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]): - continue - X[cur_row, models[m_a]] = +math.log(BASE) - X[cur_row, models[m_b]] = -math.log(BASE) - Y[cur_row] = 1.0 - sample_weights.append(ptbl_win.loc[m_a, m_b]) - - X[cur_row + 
1, models[m_a]] = math.log(BASE) - X[cur_row + 1, models[m_b]] = -math.log(BASE) - Y[cur_row + 1] = 0.0 - sample_weights.append(ptbl_win.loc[m_b, m_a]) - cur_row += 2 - X = X[:cur_row] - Y = Y[:cur_row] - - lr = LogisticRegression(fit_intercept=False, penalty=None) - lr.fit(X, Y, sample_weight=sample_weights) - elo_scores = SCALE * lr.coef_[0] + INIT_RATING - if "mixtral-8x7b-instruct-v0.1" in models.index: - elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]] - return pd.Series(elo_scores, index=models.index).sort_values(ascending=False) - - def get_median_elo_from_bootstrap(bootstrap_df): median = dict(bootstrap_df.quantile(0.5)) median = {k: int(v + 0.5) for k, v in median.items()} @@ -411,129 +309,6 @@ def outlier_detect( return battles -def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000): - from sklearn.linear_model import LogisticRegression - - p = len(models.index) - - lr = LogisticRegression(fit_intercept=False) - if indices: - lr.fit(X[indices], Y[indices]) - else: - lr.fit(X, Y) - - elo_scores = SCALE * lr.coef_[0] + INIT_RATING - # calibrate llama-13b to 800 if applicable - if "mixtral-8x7b-instruct-v0.1" in models.index: - elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]] - return ( - pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False), - lr.coef_[0][p:], - ) - - -def construct_style_matrices( - df, - BASE=10, - apply_ratio=[1, 1, 1, 1], - style_elements=STYLE_CONTROL_ELEMENTS_V1, - add_one=True, -): - models = pd.concat([df["model_a"], df["model_b"]]).unique() - models = pd.Series(np.arange(len(models)), index=models) - - # duplicate battles - df = pd.concat([df, df], ignore_index=True) - p = len(models.index) - n = df.shape[0] - assert len(style_elements) % 2 == 0 - k = int(len(style_elements) / 2) - - X = np.zeros([n, p + k]) - X[np.arange(n), models[df["model_a"]]] = +math.log(BASE) - X[np.arange(n), models[df["model_b"]]] = -math.log(BASE) - - # creates turn each of the specified column in "conv_metadata" into a vector - style_vector = np.array( - [ - df.conv_metadata.map( - lambda x: x[element] - if type(x[element]) is int - else sum(x[element].values()) - ).tolist() - for element in style_elements - ] - ) - - style_diff = (style_vector[:k] - style_vector[k:]).astype(float) - style_sum = (style_vector[:k] + style_vector[k:]).astype(float) - - if add_one: - style_sum = style_sum + np.ones(style_diff.shape) - - apply_ratio = np.flatnonzero(apply_ratio) - - style_diff[apply_ratio] /= style_sum[ - apply_ratio - ] # Apply ratio where necessary (length, etc) - - style_mean = np.mean(style_diff, axis=1) - style_std = np.std(style_diff, axis=1) - - X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T - - # one A win => two A win - Y = np.zeros(n) - Y[df["winner"] == "model_a"] = 1.0 - - # one tie => one A win + one B win - # find tie + tie (both bad) index - tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)") - tie_idx[len(tie_idx) // 2 :] = False - Y[tie_idx] = 1.0 - - return X, Y, models - - -def get_bootstrap_result_style_control( - X, Y, battles, models, func_compute_elo, num_round=1000 -): - elos = [] - coefs = [] - assert X.shape[0] % 2 == 0 and X.shape[0] == Y.shape[0] - k = int( - X.shape[0] / 2 - ) # Since we duplicate the battles when constructing X and Y, we don't want to sample the duplicates - - battles_tie_idx = (battles["winner"] == "tie") | ( - battles["winner"] == "tie (bothbad)" - ) - for _ in tqdm(range(num_round), 
desc="bootstrap"): - indices = np.random.choice(list(range(k)), size=(k), replace=True) - - index2tie = np.zeros(k, dtype=bool) - index2tie[battles_tie_idx] = True - - nontie_indices = indices[~index2tie[indices]] - tie_indices = np.concatenate( - [indices[index2tie[indices]], indices[index2tie[indices]] + k] - ) - - _X = np.concatenate([X[nontie_indices], X[nontie_indices], X[tie_indices]]) - _Y = np.concatenate([Y[nontie_indices], Y[nontie_indices], Y[tie_indices]]) - - assert _X.shape == X.shape and _Y.shape == Y.shape - - states = ~_X[:, : len(models)].any(axis=0) - - elo, coef = func_compute_elo(_X, _Y, models=models[~states]) - elos.append(elo) - coefs.append(coef) - - df = pd.DataFrame(elos) - return df[df.median().sort_values(ascending=False).index], coefs - - def filter_long_conv(row): threshold = 768 for conversation_type in ["conversation_a", "conversation_b"]: @@ -557,6 +332,7 @@ def report_elo_analysis_results( scale=1, filter_func=lambda x: True, style_control=False, + num_cpu=None, ): battles = pd.DataFrame(battles_json) @@ -598,19 +374,18 @@ def report_elo_analysis_results( if rating_system == "bt": if style_control: - X, Y, models = construct_style_matrices(battles) - bootstrap_df, boostrap_coef = get_bootstrap_result_style_control( - X, Y, battles, models, fit_mle_elo, num_round=num_bootstrap + bootstrap_df, boostrap_coef = compute_bootstrap_style_control( + battles, num_round=num_bootstrap ) - elo_rating_final, coef_final = fit_mle_elo(X, Y, models) + elo_rating_final, coef_final = compute_style_control(battles) else: - bootstrap_df = get_bootstrap_result( - battles, compute_elo_mle_with_tie, num_round=num_bootstrap + bootstrap_df = compute_bootstrap_bt( + battles, num_round=num_bootstrap, num_cpu=num_cpu ) - elo_rating_final = compute_elo_mle_with_tie(battles) + elo_rating_final = compute_bt(battles) elif rating_system == "elo": - bootstrap_df = get_bootstrap_result( - battles, compute_elo, num_round=num_bootstrap + bootstrap_df = compute_bootstrap_elo( + battles, num_round=num_bootstrap, num_cpu=num_cpu ) elo_rating_median = get_median_elo_from_bootstrap(bootstrap_df) elo_rating_final = elo_rating_median @@ -715,6 +490,7 @@ def pretty_print_elo_rating(rating): parser.add_argument("--category", nargs="+", default=["full"]) parser.add_argument("--scale", type=float, default=1) parser.add_argument("--style-control", action="store_true") + parser.add_argument("--num-cpu", type=int, default=12) args = parser.parse_args() np.random.seed(42) @@ -753,6 +529,7 @@ def pretty_print_elo_rating(rating): scale=args.scale, filter_func=filter_func, style_control=args.style_control, + num_cpu=args.num_cpu, ) for cat in args.category: diff --git a/fastchat/serve/monitor/rating_systems.py b/fastchat/serve/monitor/rating_systems.py new file mode 100644 index 0000000000..6dda5b5e62 --- /dev/null +++ b/fastchat/serve/monitor/rating_systems.py @@ -0,0 +1,385 @@ +import os +import math +import multiprocessing as mp +from functools import partial +import numpy as np +from scipy.special import expit +from scipy.optimize import minimize +import pandas as pd +from tqdm import tqdm + + +STYLE_CONTROL_ELEMENTS_V1 = [ + "sum_assistant_a_tokens", + "header_count_a", + "list_count_a", + "bold_count_a", + "sum_assistant_b_tokens", + "header_count_b", + "list_count_b", + "bold_count_b", +] + + +def get_matchups_models(df): + n_rows = len(df) + model_indices, models = pd.factorize(pd.concat([df["model_a"], df["model_b"]])) + matchups = np.column_stack([model_indices[:n_rows], model_indices[n_rows:]]) + 
return matchups, models.to_list()
+
+
+def preprocess_for_elo(df):
+    """
+    in Elo we want numpy arrays for matchups and outcomes
+      matchups: int32 (N,2) contains model ids for the competitors in a match
+      outcomes: float64 (N,) contains 1.0, 0.5, or 0.0 representing win, tie, or loss for model_a
+    """
+    matchups, models = get_matchups_models(df)
+    outcomes = np.full(len(df), 0.5)
+    outcomes[df["winner"] == "model_a"] = 1.0
+    outcomes[df["winner"] == "model_b"] = 0.0
+    return matchups, outcomes, models
+
+
+def preprocess_for_bt(df):
+    """in BT we only need the unique (matchup, outcome) sets along with the weights of how often they occur"""
+    n_rows = len(df)
+    # the 3 columns of schedule represent: model_a id, model_b id, outcome_id
+    schedule = np.full((n_rows, 3), fill_value=1, dtype=np.int32)
+    # set the two model cols by mapping the model names to their int ids
+    schedule[:, [0, 1]], models = get_matchups_models(df)
+    # map outcomes to integers (must be same dtype as model ids so it can be in the same array)
+    # model_a win -> 2, tie -> 1 (prefilled by default), model_b win -> 0
+    schedule[df["winner"] == "model_a", 2] = 2
+    schedule[df["winner"] == "model_b", 2] = 0
+    # count the number of occurrences of each observed result
+    matchups_outcomes, weights = np.unique(schedule, return_counts=True, axis=0)
+    matchups = matchups_outcomes[:, [0, 1]]
+    # map 2 -> 1.0, 1 -> 0.5, 0 -> 0.0 which will be used as labels during optimization
+    outcomes = matchups_outcomes[:, 2].astype(np.float64) / 2.0
+    weights = weights.astype(np.float64)
+    # each possible result is weighted according to the number of times it occurred in the dataset
+    return matchups, outcomes, models, weights
+
+
+def preprocess_for_style(
+    df,
+    apply_ratio=[1, 1, 1, 1],
+    style_elements=STYLE_CONTROL_ELEMENTS_V1,
+    add_one=True,
+):
+    matchups, outcomes, models = preprocess_for_elo(
+        df
+    )  # this can use the same preprocessing as Elo
+
+    n = matchups.shape[0]
+    k = int(len(style_elements) / 2)
+
+    def extract_style_feature(x, feature):
+        val = x[feature]
+        if isinstance(val, int):
+            return val
+        else:
+            return sum(val.values())
+
+    style_vector = np.zeros(shape=(2 * k, n), dtype=np.int32)
+    for idx, element in enumerate(style_elements):
+        style_vector[idx, :] = df.conv_metadata.map(
+            partial(extract_style_feature, feature=element)
+        ).values
+    style_vector = np.ascontiguousarray(style_vector)
+
+    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
+    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)
+
+    if add_one:
+        style_sum = style_sum + np.ones(style_diff.shape)
+
+    apply_ratio = np.flatnonzero(apply_ratio)
+
+    # Apply ratio where necessary (length, etc)
+    style_diff[apply_ratio] /= style_sum[apply_ratio]
+
+    style_mean = np.mean(style_diff, axis=1)
+    style_std = np.std(style_diff, axis=1)
+    features = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T
+
+    return matchups, features, outcomes, models
+
+
+def fit_vectorized_elo(
+    matchups,
+    outcomes,
+    sample_indices,
+    num_models,
+    k=4.0,
+    base=10.0,
+    init_rating=1000.0,
+    scale=400.0,
+):
+    """fit multiple sets of Elo ratings on different samples of the data at the same time"""
+    alpha = math.log(base) / scale
+    num_samples = sample_indices.shape[1]
+    ratings = np.zeros(shape=(num_samples, num_models), dtype=np.float64)
+    # iterate over the rows of sample_indices; each row holds one matchup index per bootstrap sample
+    sample_range = np.arange(num_samples)
+    for matchup_indices in sample_indices:
+        model_a_indices = 
matchups[matchup_indices, 0] + model_b_indices = matchups[matchup_indices, 1] + model_a_ratings = ratings[sample_range, model_a_indices] + model_b_ratings = ratings[sample_range, model_b_indices] + sample_outcomes = outcomes[matchup_indices] + probs = expit(alpha * (model_a_ratings - model_b_ratings)) + updates = k * (sample_outcomes - probs) + ratings[sample_range, model_a_indices] += updates + ratings[sample_range, model_b_indices] -= updates + return ratings + init_rating + + +def compute_elo(df, k=4.0, base=10.0, init_rating=1000.0, scale=400.0): + matchups, outcomes, models = preprocess_for_elo(df) + alpha = math.log(base) / scale + ratings = np.full(shape=(len(models),), fill_value=init_rating) + for (model_a_idx, model_b_idx), outcome in zip(matchups, outcomes): + prob = 1.0 / ( + 1.0 + math.exp(alpha * (ratings[model_b_idx] - ratings[model_a_idx])) + ) + update = k * (outcome - prob) + ratings[model_a_idx] += update + ratings[model_b_idx] -= update + return {model: ratings[idx] for idx, model in enumerate(models)} + + +def compute_bootstrap_elo( + df, num_round=100, k=4.0, base=10.0, init_rating=1000.0, scale=400.0 +): + matchups, outcomes, models = preprocess_for_elo(df) + sample_indices = np.random.randint(low=0, high=len(df), size=(len(df), num_round)) + ratings = fit_vectorized_elo( + matchups, outcomes, sample_indices, len(models), k, base, init_rating, scale + ) + df = pd.DataFrame(data=ratings, columns=models) + return df[df.median().sort_values(ascending=False).index] + + +def bt_loss_and_grad(ratings, matchups, outcomes, weights, alpha=1.0): + matchup_ratings = ratings[matchups] + logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1]) + probs = expit(logits) + # this form naturally counts a draw as half a win and half a loss + loss = -( + (np.log(probs) * outcomes + np.log(1.0 - probs) * (1.0 - outcomes)) * weights + ).sum() + matchups_grads = -alpha * (outcomes - probs) * weights + model_grad = np.zeros_like(ratings) + # aggregate gradients at the model level using the indices in matchups + np.add.at( + model_grad, + matchups[:, [0, 1]], + matchups_grads[:, None] * np.array([1.0, -1.0], dtype=np.float64), + ) + return loss, model_grad + + +def fit_bt(matchups, outcomes, weights, n_models, alpha, tol=1e-6): + initial_ratings = np.zeros(n_models, dtype=np.float64) + result = minimize( + fun=bt_loss_and_grad, + x0=initial_ratings, + args=(matchups, outcomes, weights, alpha), + jac=True, + method="L-BFGS-B", + options={"disp": False, "maxiter": 100, "gtol": tol}, + ) + return result["x"] + + +def scale_and_offset( + ratings, + models, + scale=400, + init_rating=1000, + baseline_model="mixtral-8x7b-instruct-v0.1", + baseline_rating=1114, +): + """convert ratings from the natural scale to the Elo rating scale with an anchored baseline""" + scaled_ratings = (ratings * scale) + init_rating + if baseline_model in models: + baseline_idx = models.index(baseline_model) + scaled_ratings += baseline_rating - scaled_ratings[..., [baseline_idx]] + return scaled_ratings + + +def compute_bt(df, base=10.0, scale=400.0, init_rating=1000, tol=1e-6): + matchups, outcomes, models, weights = preprocess_for_bt(df) + ratings = fit_bt(matchups, outcomes, weights, len(models), math.log(base), tol) + scaled_ratings = scale_and_offset(ratings, models, scale, init_rating=init_rating) + return pd.Series(scaled_ratings, index=models).sort_values(ascending=False) + + +def compute_bootstrap_bt( + battles, + num_round, + base=10.0, + scale=400.0, + init_rating=1000.0, + tol=1e-6, + num_cpu=None, 
+):
+    matchups, outcomes, models, weights = preprocess_for_bt(battles)
+    # bootstrap sample the unique outcomes and their counts directly using the multinomial distribution
+    rng = np.random.default_rng(seed=0)
+    idxs = rng.multinomial(
+        n=len(battles), pvals=weights / weights.sum(), size=(num_round)
+    )
+    # only the distribution over their occurrence counts changes between samples (and a count can be 0)
+    boot_weights = idxs.astype(np.float64) / len(battles)
+
+    # the only thing different across samples is the distribution of weights
+    bt_fn = partial(
+        fit_bt, matchups, outcomes, n_models=len(models), alpha=np.log(base), tol=tol
+    )
+    with mp.Pool(num_cpu if num_cpu else os.cpu_count()) as pool:
+        results = list(tqdm(pool.imap_unordered(bt_fn, boot_weights), total=num_round))
+
+    ratings = np.array(results)
+    scaled_ratings = scale_and_offset(ratings, models, scale, init_rating)
+    df = pd.DataFrame(scaled_ratings, columns=models)
+    return df[df.median().sort_values(ascending=False).index]
+
+
+DIFF_MASK = np.array(
+    [1.0, -1.0], dtype=np.float64
+)  # created globally to avoid the instantiation cost on each call
+
+
+def contextual_bt_loss_and_grad(
+    params,
+    n_competitors,
+    matchups,
+    features,
+    outcomes,
+    alpha=1.0,
+    reg=1.0,
+    half_reg=0.5,
+):
+    reg_loss = half_reg * np.inner(params, params)
+
+    # Split params into ratings and feature parameters
+    ratings = params[:n_competitors]
+    feature_params = params[n_competitors:]
+
+    matchup_ratings = ratings[matchups]
+    bt_logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
+    context_logits = np.dot(features, feature_params)
+    probs = expit(bt_logits + context_logits)
+    loss = (
+        -((np.log(probs) * outcomes + np.log(1.0 - probs) * (1.0 - outcomes))).sum()
+        + reg_loss
+    )
+
+    error = outcomes - probs
+    grad = reg * params  # initialize the grad as the regularization grad
+    matchups_grads = -alpha * error
+    np.add.at(
+        grad[:n_competitors], matchups[:, [0, 1]], matchups_grads[:, None] * DIFF_MASK
+    )
+    grad[n_competitors:] -= np.dot(features.T, error)
+    return loss, grad
+
+
+# note on regularization:
+# the default reg is 0.5 rather than the LogisticRegression default of 1.0 because
+# the original implementation duplicated every matchup, which doubled the ratio of
+# log loss to reg loss; halving reg in this non-duplicated version keeps the fits equivalent
+def fit_contextual_bt(
+    matchups,
+    features,
+    outcomes,
+    models,
+    idxs=None,
+    alpha=math.log(10.0),
+    reg=0.5,
+    tol=1e-6,
+):
+    n_features = features.shape[1]
+    n_models = len(models)
+    initial_params = np.zeros(n_models + n_features, dtype=np.float64)
+    half_reg = reg / 2.0
+
+    # the optional sample idxs allow fitting on a bootstrap sample of the dataset
+    if idxs is not None:
+        matchups, features, outcomes = matchups[idxs], features[idxs], outcomes[idxs]
+
+    result = minimize(
+        fun=contextual_bt_loss_and_grad,
+        x0=initial_params,
+        args=(n_models, matchups, features, outcomes, alpha, reg, half_reg),
+        jac=True,
+        method="L-BFGS-B",
+        options={"disp": False, "maxiter": 100, "gtol": tol},
+    )
+    return result["x"]
+
+
+def compute_style_control(
+    df, alpha=math.log(10.0), reg=0.5, init_rating=1000.0, scale=400.0, tol=1e-6
+):
+    matchups, features, outcomes, models = preprocess_for_style(df)
+    ratings_params = fit_contextual_bt(
+        matchups,
+        features,
+        outcomes,
+        models=models,
+        alpha=alpha,
+        reg=reg,
+        tol=tol,
+    )
+    ratings = ratings_params[: len(models)]
+    params = ratings_params[len(models) :]
+    scaled_ratings = 
scale_and_offset(ratings, models, scale, init_rating) + scaled_ratings = pd.Series(scaled_ratings, index=models).sort_values( + ascending=False + ) + return scaled_ratings, params + + +def compute_bootstrap_style_control( + df, + num_round, + alpha=math.log(10.0), + reg=0.5, + init_rating=1000.0, + scale=400.0, + tol=1e-6, + num_cpu=None, +): + matchups, features, outcomes, models = preprocess_for_style(df) + + contextual_bt_fn = partial( + fit_contextual_bt, + matchups, + features, + outcomes, + models, + alpha=alpha, + reg=reg, + tol=tol, + ) + + boot_idxs = np.random.randint( + low=0, high=matchups.shape[0], size=(num_round, matchups.shape[0]) + ) + + with mp.Pool(num_cpu if num_cpu else os.cpu_count()) as pool: + results = list( + tqdm(pool.imap_unordered(contextual_bt_fn, boot_idxs), total=num_round) + ) + + ratings_params = np.array(results) + ratings = ratings_params[:, : len(models)] + params = ratings_params[:, len(models) :] + scaled_ratings = scale_and_offset(ratings, models, scale, init_rating) + df = pd.DataFrame(scaled_ratings, columns=models) + return df[df.median().sort_values(ascending=False).index], params
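
Usage sketch (illustrative only, not part of the patch above): the new module is driven entirely by a battles DataFrame with model_a, model_b, and winner columns (plus conv_metadata for the style-control variants). The model names and battle records below are made up for the example; the bootstrap call starts a multiprocessing pool, so on spawn-based platforms it should run under a __main__ guard.

import pandas as pd

from fastchat.serve.monitor.rating_systems import compute_bt, compute_bootstrap_bt

if __name__ == "__main__":
    # toy battle log: the win pattern is cyclic so the unregularized BT fit stays bounded
    battles = pd.DataFrame(
        {
            "model_a": ["model-x", "model-y", "model-z", "model-x"],
            "model_b": ["model-y", "model-z", "model-x", "model-y"],
            "winner": ["model_a", "model_a", "model_a", "tie"],
        }
    )

    ratings = compute_bt(battles)  # pd.Series of Elo-scaled ratings, sorted descending
    boot = compute_bootstrap_bt(battles, num_round=100, num_cpu=2)  # one row per bootstrap sample
    print(ratings)
    print(boot.quantile([0.025, 0.5, 0.975]))  # rough per-model confidence intervals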
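
A quick way to sanity-check the analytic gradient that the L-BFGS-B fit relies on is a central finite-difference comparison against bt_loss_and_grad; the tiny schedule, weights, and ratings below are made up for the illustration.

import math

import numpy as np

from fastchat.serve.monitor.rating_systems import bt_loss_and_grad

matchups = np.array([[0, 1], [1, 2], [2, 0]], dtype=np.int32)  # model id pairs
outcomes = np.array([1.0, 0.5, 0.0])  # win, tie, loss for the first model in each pair
weights = np.array([3.0, 1.0, 2.0])  # how often each unique result occurred
ratings = np.array([0.10, -0.20, 0.05])
alpha = math.log(10.0)

loss, grad = bt_loss_and_grad(ratings, matchups, outcomes, weights, alpha)

eps = 1e-6
for i in range(len(ratings)):
    bump = np.zeros_like(ratings)
    bump[i] = eps
    lo, _ = bt_loss_and_grad(ratings - bump, matchups, outcomes, weights, alpha)
    hi, _ = bt_loss_and_grad(ratings + bump, matchups, outcomes, weights, alpha)
    assert abs((hi - lo) / (2 * eps) - grad[i]) < 1e-4  # analytic and numeric gradients agree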