Accelerate Bradley Terry MLE model fitting (lm-sys#3523)
cthorrez authored Sep 23, 2024
1 parent 3773213 commit e208d56
Showing 2 changed files with 404 additions and 242 deletions.
261 changes: 19 additions & 242 deletions fastchat/serve/monitor/elo_analysis.py
@@ -17,120 +17,18 @@
from fastchat.model.model_registry import get_model_info
from fastchat.serve.monitor.basic_stats import get_log_files
from fastchat.serve.monitor.clean_battle_data import clean_battle_data
from fastchat.serve.monitor.rating_systems import (
compute_elo,
compute_bt,
compute_style_control,
compute_bootstrap_elo,
compute_bootstrap_bt,
compute_bootstrap_style_control,
)

pd.options.display.float_format = "{:.2f}".format


STYLE_CONTROL_ELEMENTS_V1 = [
"sum_assistant_a_tokens",
"header_count_a",
"list_count_a",
"bold_count_a",
"sum_assistant_b_tokens",
"header_count_b",
"list_count_b",
"bold_count_b",
]


def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000):
rating = defaultdict(lambda: INIT_RATING)

for rd, model_a, model_b, winner in battles[
["model_a", "model_b", "winner"]
].itertuples():
ra = rating[model_a]
rb = rating[model_b]
ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
if winner == "model_a":
sa = 1
elif winner == "model_b":
sa = 0
elif winner == "tie" or winner == "tie (bothbad)":
sa = 0.5
else:
raise Exception(f"unexpected vote {winner}")
rating[model_a] += K * (sa - ea)
rating[model_b] += K * (1 - sa - eb)

return dict(rating)
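The loop above is the standard online Elo update: the expected score for model A is 1 / (1 + BASE ** ((rb - ra) / SCALE)), and each battle moves both ratings by K times the gap between actual and expected score. A minimal sketch of calling it on a toy battles frame (the model names are purely illustrative):

import pandas as pd

toy_battles = pd.DataFrame(
    {
        "model_a": ["model-x", "model-x", "model-y"],
        "model_b": ["model-y", "model-z", "model-z"],
        "winner": ["model_a", "tie", "model_b"],
    }
)
ratings = compute_elo(toy_battles)
# -> dict of ratings centered near INIT_RATING; exact values depend on K and battle order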


def get_bootstrap_result(battles, func_compute_elo, num_round=1000):
rows = []
for i in tqdm(range(num_round), desc="bootstrap"):
tmp_battles = battles.sample(frac=1.0, replace=True)
rows.append(func_compute_elo(tmp_battles))
df = pd.DataFrame(rows)
return df[df.median().sort_values(ascending=False).index]
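Each bootstrap round refits the rating function on battles resampled with replacement, so the returned frame has one row per round and one column per model, sorted by median rating. A sketch of turning that frame into rating intervals; the 2.5%/97.5% quantiles are an assumption for illustration, not something this function fixes:

bootstrap_df = get_bootstrap_result(battles, compute_elo, num_round=100)
intervals = pd.DataFrame(
    {
        "lower": bootstrap_df.quantile(0.025),
        "median": bootstrap_df.quantile(0.5),
        "upper": bootstrap_df.quantile(0.975),
    }
)
print(intervals)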


def compute_elo_mle_with_tie(
df, SCALE=400, BASE=10, INIT_RATING=1000, sample_weight=None
):
from sklearn.linear_model import LogisticRegression

ptbl_a_win = pd.pivot_table(
df[df["winner"] == "model_a"],
index="model_a",
columns="model_b",
aggfunc="size",
fill_value=0,
)
ptbl_tie = pd.pivot_table(
df[df["winner"].isin(["tie", "tie (bothbad)"])],
index="model_a",
columns="model_b",
aggfunc="size",
fill_value=0,
)
ptbl_tie = ptbl_tie + ptbl_tie.T
ptbl_b_win = pd.pivot_table(
df[df["winner"] == "model_b"],
index="model_a",
columns="model_b",
aggfunc="size",
fill_value=0,
)
ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)

p = len(models)
X = np.zeros([p * (p - 1) * 2, p])
Y = np.zeros(p * (p - 1) * 2)

cur_row = 0
sample_weights = []
for m_a in ptbl_win.index:
for m_b in ptbl_win.columns:
if m_a == m_b:
continue
# skip pairs with missing (NaN) counts
if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
continue
X[cur_row, models[m_a]] = +math.log(BASE)
X[cur_row, models[m_b]] = -math.log(BASE)
Y[cur_row] = 1.0
sample_weights.append(ptbl_win.loc[m_a, m_b])

X[cur_row + 1, models[m_a]] = math.log(BASE)
X[cur_row + 1, models[m_b]] = -math.log(BASE)
Y[cur_row + 1] = 0.0
sample_weights.append(ptbl_win.loc[m_b, m_a])
cur_row += 2
X = X[:cur_row]
Y = Y[:cur_row]

lr = LogisticRegression(fit_intercept=False, penalty=None)
lr.fit(X, Y, sample_weight=sample_weights)
elo_scores = SCALE * lr.coef_[0] + INIT_RATING
if "mixtral-8x7b-instruct-v0.1" in models.index:
elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
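This is the Bradley-Terry MLE written as a logistic regression: every ordered pair of models contributes two design rows with +log(BASE) and -log(BASE) in the two model columns, labels 1 and 0, and the counts in ptbl_win as sample weights (wins are doubled and ties are added to both directions, so a tie counts as half a win each way). The fitted coefficients are rescaled by SCALE and INIT_RATING and then shifted so mixtral-8x7b-instruct-v0.1 sits at 1114 when present. A small sanity-check sketch of how a coefficient gap maps back to the same win probability compute_elo uses (the coefficients here are made up):

SCALE, BASE, INIT_RATING = 400, 10, 1000
beta_a, beta_b = 0.5, 0.0          # hypothetical fitted coefficients for two models
rating_gap = SCALE * (beta_a - beta_b)             # 200 Elo points
p_a_wins = 1 / (1 + BASE ** (-(beta_a - beta_b)))  # ~0.76
assert abs(p_a_wins - 1 / (1 + BASE ** (-rating_gap / SCALE))) < 1e-12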


def get_median_elo_from_bootstrap(bootstrap_df):
median = dict(bootstrap_df.quantile(0.5))
median = {k: int(v + 0.5) for k, v in median.items()}
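    # int(v + 0.5) rounds the bootstrap medians half-up to whole Elo points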
@@ -411,129 +309,6 @@ def outlier_detect(
return battles


def fit_mle_elo(X, Y, models, indices=None, SCALE=400, INIT_RATING=1000):
from sklearn.linear_model import LogisticRegression

p = len(models.index)

lr = LogisticRegression(fit_intercept=False)
if indices:
lr.fit(X[indices], Y[indices])
else:
lr.fit(X, Y)

elo_scores = SCALE * lr.coef_[0] + INIT_RATING
# anchor mixtral-8x7b-instruct-v0.1 to 1114 if applicable
if "mixtral-8x7b-instruct-v0.1" in models.index:
elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
return (
pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
lr.coef_[0][p:],
)
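fit_mle_elo returns two pieces: the first p coefficients become the Elo-scaled model ratings, and the remaining ones are the weights for the style columns appended by construct_style_matrices below. A sketch of consuming both return values (the variable names are illustrative):

ratings, style_coefs = fit_mle_elo(X, Y, models)
print(ratings.head())   # pd.Series of ratings, highest first
print(style_coefs)      # one weight per style feature (token length, headers, lists, bold)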


def construct_style_matrices(
df,
BASE=10,
apply_ratio=[1, 1, 1, 1],
style_elements=STYLE_CONTROL_ELEMENTS_V1,
add_one=True,
):
models = pd.concat([df["model_a"], df["model_b"]]).unique()
models = pd.Series(np.arange(len(models)), index=models)

# duplicate battles
df = pd.concat([df, df], ignore_index=True)
p = len(models.index)
n = df.shape[0]
assert len(style_elements) % 2 == 0
k = int(len(style_elements) / 2)

X = np.zeros([n, p + k])
X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

# turn each of the specified "conv_metadata" columns into a per-battle vector
style_vector = np.array(
[
df.conv_metadata.map(
lambda x: x[element]
if type(x[element]) is int
else sum(x[element].values())
).tolist()
for element in style_elements
]
)

style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
style_sum = (style_vector[:k] + style_vector[k:]).astype(float)

if add_one:
style_sum = style_sum + np.ones(style_diff.shape)

apply_ratio = np.flatnonzero(apply_ratio)

style_diff[apply_ratio] /= style_sum[
apply_ratio
] # Apply ratio where necessary (length, etc)

style_mean = np.mean(style_diff, axis=1)
style_std = np.std(style_diff, axis=1)

X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T

# one A win => two A win
Y = np.zeros(n)
Y[df["winner"] == "model_a"] = 1.0

# one tie => one A win + one B win
# find tie + tie (both bad) index
tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
tie_idx[len(tie_idx) // 2 :] = False
Y[tie_idx] = 1.0

return X, Y, models
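Each battle thus contributes k style features of the form (value_a - value_b) / (value_a + value_b + 1) for the elements where apply_ratio is set, z-scored across battles and appended to the model-indicator columns, so the Bradley-Terry fit controls for them. A one-battle sketch of the length feature, assuming conv_metadata carries the token counts named in STYLE_CONTROL_ELEMENTS_V1:

sum_assistant_a_tokens, sum_assistant_b_tokens = 512, 256   # hypothetical counts
length_feature = (sum_assistant_a_tokens - sum_assistant_b_tokens) / (
    sum_assistant_a_tokens + sum_assistant_b_tokens + 1      # add_one=True
)
# ~0.33: positive when A's answer is longer; standardized before entering X[:, -k:]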


def get_bootstrap_result_style_control(
X, Y, battles, models, func_compute_elo, num_round=1000
):
elos = []
coefs = []
assert X.shape[0] % 2 == 0 and X.shape[0] == Y.shape[0]
k = int(
X.shape[0] / 2
) # Since we duplicate the battles when constructing X and Y, we don't want to sample the duplicates

battles_tie_idx = (battles["winner"] == "tie") | (
battles["winner"] == "tie (bothbad)"
)
for _ in tqdm(range(num_round), desc="bootstrap"):
indices = np.random.choice(list(range(k)), size=(k), replace=True)

index2tie = np.zeros(k, dtype=bool)
index2tie[battles_tie_idx] = True

nontie_indices = indices[~index2tie[indices]]
tie_indices = np.concatenate(
[indices[index2tie[indices]], indices[index2tie[indices]] + k]
)
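        # a resampled tie keeps both of its duplicated rows (index and index + k),
        # while each resampled non-tie row is counted twice, so _X keeps X's shape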

_X = np.concatenate([X[nontie_indices], X[nontie_indices], X[tie_indices]])
_Y = np.concatenate([Y[nontie_indices], Y[nontie_indices], Y[tie_indices]])

assert _X.shape == X.shape and _Y.shape == Y.shape

states = ~_X[:, : len(models)].any(axis=0)

elo, coef = func_compute_elo(_X, _Y, models=models[~states])
elos.append(elo)
coefs.append(coef)

df = pd.DataFrame(elos)
return df[df.median().sort_values(ascending=False).index], coefs


def filter_long_conv(row):
threshold = 768
for conversation_type in ["conversation_a", "conversation_b"]:
@@ -557,6 +332,7 @@ def report_elo_analysis_results(
scale=1,
filter_func=lambda x: True,
style_control=False,
num_cpu=None,
):
battles = pd.DataFrame(battles_json)

@@ -598,19 +374,18 @@ def report_elo_analysis_results(

if rating_system == "bt":
if style_control:
X, Y, models = construct_style_matrices(battles)
bootstrap_df, boostrap_coef = get_bootstrap_result_style_control(
X, Y, battles, models, fit_mle_elo, num_round=num_bootstrap
bootstrap_df, boostrap_coef = compute_bootstrap_style_control(
battles, num_round=num_bootstrap
)
elo_rating_final, coef_final = fit_mle_elo(X, Y, models)
elo_rating_final, coef_final = compute_style_control(battles)
else:
bootstrap_df = get_bootstrap_result(
battles, compute_elo_mle_with_tie, num_round=num_bootstrap
bootstrap_df = compute_bootstrap_bt(
battles, num_round=num_bootstrap, num_cpu=num_cpu
)
elo_rating_final = compute_elo_mle_with_tie(battles)
elo_rating_final = compute_bt(battles)
elif rating_system == "elo":
bootstrap_df = get_bootstrap_result(
battles, compute_elo, num_round=num_bootstrap
bootstrap_df = compute_bootstrap_elo(
battles, num_round=num_bootstrap, num_cpu=num_cpu
)
elo_rating_median = get_median_elo_from_bootstrap(bootstrap_df)
elo_rating_final = elo_rating_median
@@ -715,6 +490,7 @@ def pretty_print_elo_rating(rating):
parser.add_argument("--category", nargs="+", default=["full"])
parser.add_argument("--scale", type=float, default=1)
parser.add_argument("--style-control", action="store_true")
parser.add_argument("--num-cpu", type=int, default=12)
args = parser.parse_args()

np.random.seed(42)
@@ -753,6 +529,7 @@ def pretty_print_elo_rating(rating):
scale=args.scale,
filter_func=filter_func,
style_control=args.style_control,
num_cpu=args.num_cpu,
)

for cat in args.category:
