Skip to content

Commit

Permalink
Support for closed markets benchmarks + a little cleaning up (#17)
Browse files Browse the repository at this point in the history
  • Loading branch information
kongzii authored Feb 20, 2024
1 parent e951b91 commit d226831
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 102 deletions.
101 changes: 65 additions & 36 deletions prediction_market_agent_tooling/benchmark/agents.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import random
import typing as t
from datetime import datetime

from prediction_market_agent_tooling.benchmark.utils import (
EvaluatedQuestion,
OutcomePrediction,
Prediction,
)
Expand All @@ -13,51 +13,84 @@ def __init__(self, agent_name: str, max_workers: t.Optional[int] = None):
self.agent_name = agent_name
self.max_workers = max_workers # Limit the number of workers that can run this worker in parallel threads

def evaluate(self, market_question: str) -> EvaluatedQuestion:
raise NotImplementedError
def is_predictable(self, market_question: str) -> bool:
"""
Override if the agent can decide to not predict the question, before doing the hard work.
"""
return True

def research(self, market_question: str) -> t.Optional[str]:
def predict(self, market_question: str) -> Prediction:
"""
Predict the outcome of the market question.
"""
raise NotImplementedError

def predict(
self, market_question: str, researched: str, evaluated: EvaluatedQuestion
def check_and_predict(self, market_question: str) -> Prediction:
is_predictable = self.is_predictable(market_question=market_question)
if not is_predictable:
return Prediction(is_predictable=is_predictable)
return self.predict(market_question=market_question)

def is_predictable_restricted(
self,
market_question: str,
time_restriction_up_to: datetime,
) -> bool:
"""
Override if the agent can decide to not predict the question, before doing the hard work.
Data used for the evaluation must be restricted to the time_restriction_up_to.
"""
return True

def predict_restricted(
self,
market_question: str,
time_restriction_up_to: datetime,
) -> Prediction:
"""
Predict the outcome of the market question.
Data used for the prediction must be restricted to the time_restriction_up_to.
"""
raise NotImplementedError

def evaluate_research_predict(self, market_question: str) -> Prediction:
eval = self.evaluate(market_question=market_question)
if not eval.is_predictable:
return Prediction(evaluation=eval)
researched = self.research(market_question=market_question)
if researched is None:
return Prediction(evaluation=eval)
return self.predict(
def check_and_predict_restricted(
self,
market_question: str,
time_restriction_up_to: datetime,
) -> Prediction:
"""
Data used must be restricted to the time_restriction_up_to.
"""
is_predictable = self.is_predictable_restricted(
market_question=market_question,
time_restriction_up_to=time_restriction_up_to,
)
if not is_predictable:
return Prediction(is_predictable=is_predictable)
return self.predict_restricted(
market_question=market_question,
researched=researched,
evaluated=eval,
time_restriction_up_to=time_restriction_up_to,
)


class RandomAgent(AbstractBenchmarkedAgent):
def evaluate(self, market_question: str) -> EvaluatedQuestion:
return EvaluatedQuestion(question=market_question, is_predictable=True)

def research(self, market_question: str) -> str:
return "" # No research for a random agent, but can't be None.

def predict(
self, market_question: str, researched: str, evaluated: EvaluatedQuestion
) -> Prediction:
def predict(self, market_question: str) -> Prediction:
p_yes, confidence = random.random(), random.random()
return Prediction(
evaluation=evaluated,
outcome_prediction=OutcomePrediction(
p_yes=p_yes,
confidence=confidence,
info_utility=None,
),
)

def predict_restricted(
self, market_question: str, time_restriction_up_to: datetime
) -> Prediction:
return self.predict(market_question)


class FixedAgent(AbstractBenchmarkedAgent):
def __init__(
Expand All @@ -66,21 +99,17 @@ def __init__(
super().__init__(agent_name, max_workers)
self.fixed_answer = fixed_answer

def evaluate(self, market_question: str) -> EvaluatedQuestion:
return EvaluatedQuestion(question=market_question, is_predictable=True)

def research(self, market_question: str) -> str:
return "" # No research for a fixed agent, but can't be None.

def predict(
self, market_question: str, researched: str, evaluated: EvaluatedQuestion
) -> Prediction:
def predict(self, market_question: str) -> Prediction:
p_yes, confidence = 1.0 if self.fixed_answer else 0.0, 1.0
return Prediction(
evaluation=evaluated,
outcome_prediction=OutcomePrediction(
p_yes=p_yes,
confidence=confidence,
info_utility=None,
),
)

def predict_restricted(
self, market_question: str, time_restriction_up_to: datetime
) -> Prediction:
return self.predict(market_question)
70 changes: 42 additions & 28 deletions prediction_market_agent_tooling/benchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from prediction_market_agent_tooling.benchmark.agents import AbstractBenchmarkedAgent
from prediction_market_agent_tooling.benchmark.utils import (
Market,
MarketResolution,
Prediction,
PredictionsCache,
get_llm_api_call_cost,
Expand Down Expand Up @@ -134,8 +135,13 @@ def run_agents(self, enable_timing: bool = True) -> None:
def get_prediction_result(market: Market) -> tuple[str, Prediction]:
with get_openai_callback() as cb:
start = time.time()
prediction = agent.evaluate_research_predict(
market_question=market.question
prediction = (
agent.check_and_predict(market_question=market.question)
if not market.is_resolved
else agent.check_and_predict_restricted(
market_question=market.question,
time_restriction_up_to=market.created_time, # TODO: Add support for resolved_at and any time in between.
)
)

prediction.time = time.time() - start if enable_timing else None
Expand Down Expand Up @@ -263,8 +269,9 @@ def _compute_correct_outcome_percentage(

correct_outcome_count = 0
for p, m in zip(predictions, markets):
if (check_not_none(p.outcome_prediction).p_yes > 0.5 and m.p_yes > 0.5) or (
check_not_none(p.outcome_prediction).p_yes < 0.5 and m.p_yes < 0.5
if (
check_not_none(p.outcome_prediction).probable_resolution
== m.probable_resolution
):
correct_outcome_count += 1

Expand All @@ -279,8 +286,18 @@ def _compute_precision_and_recall_percentages(
if not predictions:
return None, None

ground_truth = [m.p_yes > 0.5 for m in markets]
y_pred = [check_not_none(p.outcome_prediction).p_yes > 0.5 for p in predictions]
ground_truth = [
(1 if m.probable_resolution == MarketResolution.YES else 0) for m in markets
]
y_pred = [
(
1
if check_not_none(p.outcome_prediction).probable_resolution
== MarketResolution.YES
else 0
)
for p in predictions
]

precision = precision_score(
ground_truth, y_pred, pos_label=pos_label, zero_division=0.0
Expand Down Expand Up @@ -332,9 +349,7 @@ def _compute_mean_time(
def _compute_ratio_evaluated_as_answerable(
self, predictions: t.List[Prediction], markets: t.List[Market]
) -> float:
return sum(
1 for p in predictions if p.evaluation and p.evaluation.is_predictable
) / len(predictions)
return sum(1 for p in predictions if p.is_predictable) / len(predictions)

def _compute_ratio_answered(
self, predictions: t.List[Prediction], markets: t.List[Market]
Expand Down Expand Up @@ -374,27 +389,27 @@ def get_markets_summary(self) -> t.Dict[str, t.List[str | float]]:
]
markets_summary[f"{agent} p_yes"] = [
(
p.outcome_prediction.p_yes
if p.evaluation
and p.evaluation.is_predictable
f"{p.outcome_prediction.p_yes:.2f} [{p.outcome_prediction.probable_resolution.value}]"
if p.is_predictable
and p.outcome_prediction # Is answerable and answered
else "N/A"
if not p.evaluation
and not p.outcome_prediction # Not evaluated for some reason
else "S"
if p.evaluation
and not p.evaluation.is_predictable # Skipped (evaluated to be not predictable)
else "F"
if p.evaluation
and p.evaluation.is_predictable
and not p.outcome_prediction # Failed (no prediction)
else should_not_happen(
f"Unexpected case in get_markets_summary() for {p}."
else (
"S"
if not p.is_predictable # Skipped (evaluated to be not predictable)
else (
"F"
if p.is_predictable
and not p.outcome_prediction # Failed (no prediction)
else should_not_happen(
f"Unexpected case in get_markets_summary() for {p}."
)
)
)
)
for p in agent_predictions
]
markets_summary[f"reference p_yes"] = [m.p_yes for m in self.markets]
markets_summary[f"reference p_yes"] = [
f"{m.p_yes:.2f} [{m.probable_resolution}]" for m in self.markets
]
return markets_summary

def calculate_expected_returns(
Expand All @@ -409,21 +424,20 @@ def calculate_expected_returns(
# TODO: Add support for different bet sizes -- if we bet a low amount (such as <10 units), the real shares will be very close to that we calculate below (bet_units / share_price),
# but if one bets a lot, it will change the share price along the way, and so he/she receives less than `bet_units / share_price`, but it's more complicated to calculate.
bet_units = 10 # Assuming the agent always bet 10 units per market.
buy_yes_threshold = 0.5 # If the agent's prediction is > 50% it should buy "yes", otherwise "no".

assert prediction.outcome_prediction is not None
# Assume that market starts at 50/50 and so the price is 0.5 at the time we are buying it,
# we can't use {yes,no}_outcome_price atm, because it would just cancel out to EV = 0.0,
# as it's the same as the probability.
yes_shares = (
bet_units / 0.5 # market.yes_outcome_price
if prediction.outcome_prediction.p_yes > buy_yes_threshold
if prediction.outcome_prediction.probable_resolution == MarketResolution.YES
and market.yes_outcome_price > 0
else 0
)
no_shares = (
bet_units / 0.5 # market.no_outcome_price
if prediction.outcome_prediction.p_yes <= buy_yes_threshold
if prediction.outcome_prediction.probable_resolution == MarketResolution.NO
and market.no_outcome_price > 0
else 0
)
Expand Down
Loading

0 comments on commit d226831

Please sign in to comment.