diff --git a/prediction_market_agent_tooling/benchmark/agents.py b/prediction_market_agent_tooling/benchmark/agents.py
index 2ea68015..92563e3c 100644
--- a/prediction_market_agent_tooling/benchmark/agents.py
+++ b/prediction_market_agent_tooling/benchmark/agents.py
@@ -1,8 +1,8 @@
 import random
 import typing as t
+from datetime import datetime
 
 from prediction_market_agent_tooling.benchmark.utils import (
-    EvaluatedQuestion,
     OutcomePrediction,
     Prediction,
 )
@@ -13,44 +13,72 @@ def __init__(self, agent_name: str, max_workers: t.Optional[int] = None):
         self.agent_name = agent_name
         self.max_workers = max_workers  # Limit the number of workers that can run this worker in parallel threads
 
-    def evaluate(self, market_question: str) -> EvaluatedQuestion:
-        raise NotImplementedError
+    def is_predictable(self, market_question: str) -> bool:
+        """
+        Override if the agent can decide to not predict the question, before doing the hard work.
+        """
+        return True
 
-    def research(self, market_question: str) -> t.Optional[str]:
+    def predict(self, market_question: str) -> Prediction:
+        """
+        Predict the outcome of the market question.
+        """
         raise NotImplementedError
 
-    def predict(
-        self, market_question: str, researched: str, evaluated: EvaluatedQuestion
+    def check_and_predict(self, market_question: str) -> Prediction:
+        is_predictable = self.is_predictable(market_question=market_question)
+        if not is_predictable:
+            return Prediction(is_predictable=is_predictable)
+        return self.predict(market_question=market_question)
+
+    def is_predictable_restricted(
+        self,
+        market_question: str,
+        time_restriction_up_to: datetime,
+    ) -> bool:
+        """
+        Override if the agent can decide to not predict the question, before doing the hard work.
+
+        Data used for the evaluation must be restricted to the time_restriction_up_to.
+        """
+        return True
+
+    def predict_restricted(
+        self,
+        market_question: str,
+        time_restriction_up_to: datetime,
     ) -> Prediction:
+        """
+        Predict the outcome of the market question.
+
+        Data used for the prediction must be restricted to the time_restriction_up_to.
+        """
         raise NotImplementedError
 
-    def evaluate_research_predict(self, market_question: str) -> Prediction:
-        eval = self.evaluate(market_question=market_question)
-        if not eval.is_predictable:
-            return Prediction(evaluation=eval)
-        researched = self.research(market_question=market_question)
-        if researched is None:
-            return Prediction(evaluation=eval)
-        return self.predict(
+    def check_and_predict_restricted(
+        self,
+        market_question: str,
+        time_restriction_up_to: datetime,
+    ) -> Prediction:
+        """
+        Data used must be restricted to the time_restriction_up_to.
+        """
+        is_predictable = self.is_predictable_restricted(
+            market_question=market_question,
+            time_restriction_up_to=time_restriction_up_to,
+        )
+        if not is_predictable:
+            return Prediction(is_predictable=is_predictable)
+        return self.predict_restricted(
             market_question=market_question,
-            researched=researched,
-            evaluated=eval,
+            time_restriction_up_to=time_restriction_up_to,
         )
 
 
 class RandomAgent(AbstractBenchmarkedAgent):
-    def evaluate(self, market_question: str) -> EvaluatedQuestion:
-        return EvaluatedQuestion(question=market_question, is_predictable=True)
-
-    def research(self, market_question: str) -> str:
-        return ""  # No research for a random agent, but can't be None.
-
-    def predict(
-        self, market_question: str, researched: str, evaluated: EvaluatedQuestion
-    ) -> Prediction:
+    def predict(self, market_question: str) -> Prediction:
         p_yes, confidence = random.random(), random.random()
         return Prediction(
-            evaluation=evaluated,
             outcome_prediction=OutcomePrediction(
                 p_yes=p_yes,
                 confidence=confidence,
@@ -58,6 +86,11 @@ def predict(
             ),
         )
 
+    def predict_restricted(
+        self, market_question: str, time_restriction_up_to: datetime
+    ) -> Prediction:
+        return self.predict(market_question)
+
 
 class FixedAgent(AbstractBenchmarkedAgent):
     def __init__(
@@ -66,21 +99,17 @@ def __init__(
         super().__init__(agent_name, max_workers)
         self.fixed_answer = fixed_answer
 
-    def evaluate(self, market_question: str) -> EvaluatedQuestion:
-        return EvaluatedQuestion(question=market_question, is_predictable=True)
-
-    def research(self, market_question: str) -> str:
-        return ""  # No research for a fixed agent, but can't be None.
-
-    def predict(
-        self, market_question: str, researched: str, evaluated: EvaluatedQuestion
-    ) -> Prediction:
+    def predict(self, market_question: str) -> Prediction:
         p_yes, confidence = 1.0 if self.fixed_answer else 0.0, 1.0
         return Prediction(
-            evaluation=evaluated,
             outcome_prediction=OutcomePrediction(
                 p_yes=p_yes,
                 confidence=confidence,
                 info_utility=None,
             ),
         )
+
+    def predict_restricted(
+        self, market_question: str, time_restriction_up_to: datetime
+    ) -> Prediction:
+        return self.predict(market_question)
diff --git a/prediction_market_agent_tooling/benchmark/benchmark.py b/prediction_market_agent_tooling/benchmark/benchmark.py
index 2e060c68..c349fc4a 100644
--- a/prediction_market_agent_tooling/benchmark/benchmark.py
+++ b/prediction_market_agent_tooling/benchmark/benchmark.py
@@ -13,6 +13,7 @@
 from prediction_market_agent_tooling.benchmark.agents import AbstractBenchmarkedAgent
 from prediction_market_agent_tooling.benchmark.utils import (
     Market,
+    MarketResolution,
     Prediction,
     PredictionsCache,
     get_llm_api_call_cost,
@@ -134,8 +135,13 @@ def run_agents(self, enable_timing: bool = True) -> None:
         def get_prediction_result(market: Market) -> tuple[str, Prediction]:
             with get_openai_callback() as cb:
                 start = time.time()
-                prediction = agent.evaluate_research_predict(
-                    market_question=market.question
+                prediction = (
+                    agent.check_and_predict(market_question=market.question)
+                    if not market.is_resolved
+                    else agent.check_and_predict_restricted(
+                        market_question=market.question,
+                        time_restriction_up_to=market.created_time,  # TODO: Add support for resolved_at and any time in between.
+                    )
                 )
 
                 prediction.time = time.time() - start if enable_timing else None
@@ -263,8 +269,9 @@ def _compute_correct_outcome_percentage(
         correct_outcome_count = 0
 
         for p, m in zip(predictions, markets):
-            if (check_not_none(p.outcome_prediction).p_yes > 0.5 and m.p_yes > 0.5) or (
-                check_not_none(p.outcome_prediction).p_yes < 0.5 and m.p_yes < 0.5
+            if (
+                check_not_none(p.outcome_prediction).probable_resolution
+                == m.probable_resolution
             ):
                 correct_outcome_count += 1
 
@@ -279,8 +286,18 @@ def _compute_precision_and_recall_percentages(
         if not predictions:
             return None, None
 
-        ground_truth = [m.p_yes > 0.5 for m in markets]
-        y_pred = [check_not_none(p.outcome_prediction).p_yes > 0.5 for p in predictions]
+        ground_truth = [
+            (1 if m.probable_resolution == MarketResolution.YES else 0) for m in markets
+        ]
+        y_pred = [
+            (
+                1
+                if check_not_none(p.outcome_prediction).probable_resolution
+                == MarketResolution.YES
+                else 0
+            )
+            for p in predictions
+        ]
 
         precision = precision_score(
             ground_truth, y_pred, pos_label=pos_label, zero_division=0.0
         )
@@ -332,9 +349,7 @@ def _compute_mean_time(
     def _compute_ratio_evaluated_as_answerable(
         self, predictions: t.List[Prediction], markets: t.List[Market]
     ) -> float:
-        return sum(
-            1 for p in predictions if p.evaluation and p.evaluation.is_predictable
-        ) / len(predictions)
+        return sum(1 for p in predictions if p.is_predictable) / len(predictions)
 
     def _compute_ratio_answered(
         self, predictions: t.List[Prediction], markets: t.List[Market]
@@ -374,27 +389,27 @@ def get_markets_summary(self) -> t.Dict[str, t.List[str | float]]:
             ]
             markets_summary[f"{agent} p_yes"] = [
                 (
-                    p.outcome_prediction.p_yes
-                    if p.evaluation
-                    and p.evaluation.is_predictable
+                    f"{p.outcome_prediction.p_yes:.2f} [{p.outcome_prediction.probable_resolution.value}]"
+                    if p.is_predictable
                     and p.outcome_prediction  # Is answerable and answered
-                    else "N/A"
-                    if not p.evaluation
-                    and not p.outcome_prediction  # Not evaluated for some reason
-                    else "S"
-                    if p.evaluation
-                    and not p.evaluation.is_predictable  # Skipped (evaluated to be not predictable)
-                    else "F"
-                    if p.evaluation
-                    and p.evaluation.is_predictable
-                    and not p.outcome_prediction  # Failed (no prediction)
-                    else should_not_happen(
-                        f"Unexpected case in get_markets_summary() for {p}."
+                    else (
+                        "S"
+                        if not p.is_predictable  # Skipped (evaluated to be not predictable)
+                        else (
+                            "F"
+                            if p.is_predictable
+                            and not p.outcome_prediction  # Failed (no prediction)
+                            else should_not_happen(
+                                f"Unexpected case in get_markets_summary() for {p}."
+                            )
+                        )
                     )
                 )
                 for p in agent_predictions
             ]
-        markets_summary[f"reference p_yes"] = [m.p_yes for m in self.markets]
+        markets_summary[f"reference p_yes"] = [
+            f"{m.p_yes:.2f} [{m.probable_resolution}]" for m in self.markets
+        ]
         return markets_summary
 
@@ -409,7 +424,6 @@ def calculate_expected_returns(
         # TODO: Add support for different bet sizes -- if we bet a low amount (such as <10 units), the real shares will be very close to that we calculate below (bet_units / share_price),
         # but if one bets a lot, it will change the share price along the way, and so he/she receives less than `bet_units / share_price`, but it's more complicated to calculate.
         bet_units = 10  # Assuming the agent always bet 10 units per market.
-        buy_yes_threshold = 0.5  # If the agent's prediction is > 50% it should buy "yes", otherwise "no".
 
         assert prediction.outcome_prediction is not None
         # Assume that market starts at 50/50 and so the price is 0.5 at the time we are buying it,
@@ -417,13 +431,13 @@
         # as it's the same as the probability.
         yes_shares = (
             bet_units / 0.5  # market.yes_outcome_price
-            if prediction.outcome_prediction.p_yes > buy_yes_threshold
+            if prediction.outcome_prediction.probable_resolution == MarketResolution.YES
             and market.yes_outcome_price > 0
             else 0
         )
         no_shares = (
             bet_units / 0.5  # market.no_outcome_price
-            if prediction.outcome_prediction.p_yes <= buy_yes_threshold
+            if prediction.outcome_prediction.probable_resolution == MarketResolution.NO
             and market.no_outcome_price > 0
             else 0
         )
diff --git a/prediction_market_agent_tooling/benchmark/utils.py b/prediction_market_agent_tooling/benchmark/utils.py
index 8563da4a..e74998d6 100644
--- a/prediction_market_agent_tooling/benchmark/utils.py
+++ b/prediction_market_agent_tooling/benchmark/utils.py
@@ -7,12 +7,9 @@
 import requests
 from pydantic import BaseModel, validator
 
-MANIFOLD_API_LIMIT = 1000  # Manifold will only return up to 1000 markets
-
+from prediction_market_agent_tooling.tools.utils import should_not_happen
 
-class EvaluatedQuestion(BaseModel):
-    question: str
-    is_predictable: bool
+MANIFOLD_API_LIMIT = 1000  # Manifold will only return up to 1000 markets
 
 
 class MarketSource(str, Enum):
@@ -20,15 +17,24 @@ class MarketSource(str, Enum):
     POLYMARKET = "polymarket"
 
 
+class MarketFilter(str, Enum):
+    open = "open"
+    resolved = "resolved"
+
+
+class MarketResolution(str, Enum):
+    YES = "yes"
+    NO = "no"
+
+
 class Market(BaseModel):
     source: MarketSource
     question: str
     url: str
     p_yes: float
     volume: float
-    is_resolved: bool
     created_time: datetime
-    resolution: str | None = None
+    resolution: MarketResolution | None = None
     outcomePrices: list[float] | None = None
 
     @validator("outcomePrices", pre=True)
@@ -45,6 +51,10 @@ def _validate_created_time(cls, value: datetime) -> datetime:
             value = value.replace(tzinfo=pytz.UTC)
         return value
 
+    @property
+    def is_resolved(self) -> bool:
+        return self.resolution is not None
+
     @property
     def p_no(self) -> float:
         return 1 - self.p_yes
@@ -59,6 +69,16 @@ def no_outcome_price(self) -> float:
         # Use the outcome price if available, otherwise assume it's p_yes.
         return self.outcomePrices[1] if self.outcomePrices else 1 - self.p_yes
 
+    @property
+    def probable_resolution(self) -> MarketResolution:
+        return (
+            self.resolution
+            if self.resolution is not None
+            else MarketResolution.YES
+            if self.p_yes > 0.5
+            else MarketResolution.NO
+        )
+
 
 class OutcomePrediction(BaseModel):
     p_yes: float
@@ -66,12 +86,12 @@
     confidence: float
     info_utility: t.Optional[float]
 
     @property
-    def binary_answer(self) -> bool:
-        return self.p_yes > 0.5
+    def probable_resolution(self) -> MarketResolution:
+        return MarketResolution.YES if self.p_yes > 0.5 else MarketResolution.NO
 
 
 class Prediction(BaseModel):
-    evaluation: t.Optional[EvaluatedQuestion] = None
+    is_predictable: bool = True
     outcome_prediction: t.Optional[OutcomePrediction] = None
     time: t.Optional[float] = None
@@ -142,16 +162,24 @@ def get_manifold_markets(
         m["source"] = MarketSource.MANIFOLD
 
     # Map JSON fields to Market fields
-    fields_map = {
-        "probability": "p_yes",
-        "isResolved": "is_resolved",
-        "createdTime": "created_time",
+    fields_map = {"probability": "p_yes", "createdTime": "created_time"}
+    process_values = {
+        "resolution": lambda v: v.lower() if v else None,
     }
 
-    def _map_fields(old: dict[str, str], mapping: dict[str, str]) -> dict[str, str]:
-        return {mapping.get(k, k): v for k, v in old.items()}
+    def _map_fields(
+        old: dict[str, str],
+        mapping: dict[str, str],
+        processing: dict[str, t.Callable[[t.Any], t.Any]],
+    ) -> dict[str, str]:
+        return {
+            mapping.get(k, k): processing.get(k, lambda x: x)(v) for k, v in old.items()
+        }
 
-    markets = [Market.model_validate(_map_fields(m, fields_map)) for m in markets_json]
+    markets = [
+        Market.model_validate(_map_fields(m, fields_map, process_values))
+        for m in markets_json
+    ]
 
     return markets
@@ -241,6 +269,21 @@ def get_polymarket_markets(
         if excluded_questions and m_json["question"] in excluded_questions:
             continue
 
+        resolution = (
+            MarketResolution.YES
+            if closed and m_json["outcomePrices"][0] == "1.0"
+            else (
+                MarketResolution.NO
+                if closed and m_json["outcomePrices"][1] == "1.0"
+                else (
+                    should_not_happen()
+                    if closed
+                    and m_json["outcomePrices"] not in (["1.0", "0.0"], ["0.0", "1.0"])
+                    else None
+                )
+            )
+        )
+
         markets.append(
             Market(
                 question=m_json["question"],
@@ -251,7 +294,7 @@
                 created_time=m_json["created_at"],
                 outcomePrices=m_json["outcomePrices"],
                 volume=m_json["volume"],
-                is_resolved=False,
+                resolution=resolution,
                 source=MarketSource.POLYMARKET,
             )
         )
@@ -261,15 +304,26 @@
 def get_markets(
     number: int,
     source: MarketSource,
+    filter_: MarketFilter = MarketFilter.open,
     excluded_questions: set[str] | None = None,
 ) -> t.List[Market]:
     if source == MarketSource.MANIFOLD:
         return get_manifold_markets_paged(
-            number=number, excluded_questions=excluded_questions
+            number=number, excluded_questions=excluded_questions, filter_=filter_.value
         )
     elif source == MarketSource.POLYMARKET:
         return get_polymarket_markets(
-            limit=number, excluded_questions=excluded_questions
+            limit=number,
+            excluded_questions=excluded_questions,
+            closed=(
+                True
+                if filter_ == MarketFilter.resolved
+                else (
+                    False
+                    if filter_ == MarketFilter.open
+                    else should_not_happen(f"Unknown filter {filter_} for polymarket.")
+                )
+            ),
         )
     else:
         raise ValueError(f"Unknown market source: {source}")
@@ -304,7 +358,3 @@ def get_llm_api_call_cost(
     model_cost += model_costs[model]["completion_tokens"] * completion_tokens
     model_cost /= 1000
     return model_cost
-
-
-def should_not_happen(message: str, E: t.Type[Exception] = RuntimeError) -> t.NoReturn:
-    raise E(message)
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 094df17d..9d28c418 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -4,7 +4,6 @@
 import prediction_market_agent_tooling.benchmark.benchmark as bm
 from prediction_market_agent_tooling.benchmark.utils import (
-    EvaluatedQuestion,
     MarketSource,
     OutcomePrediction,
     get_markets,
@@ -15,12 +14,9 @@ class DummyAgent(bm.AbstractBenchmarkedAgent):
     def __init__(self) -> None:
         super().__init__(agent_name="dummy")
 
-    def evaluate_research_predict(self, market_question: str) -> bm.Prediction:
+    def check_and_predict(self, market_question: str) -> bm.Prediction:
         return bm.Prediction(
-            evaluation=EvaluatedQuestion(
-                question=market_question,
-                is_predictable=True,
-            ),
+            is_predictable=True,
             outcome_prediction=OutcomePrediction(
                 p_yes=0.6,
                 confidence=0.8,
@@ -38,12 +34,9 @@ class DummyAgentNoPrediction(bm.AbstractBenchmarkedAgent):
     def __init__(self) -> None:
         super().__init__(agent_name="dummy_no_prediction")
 
-    def evaluate_research_predict(self, market_question: str) -> bm.Prediction:
+    def check_and_predict(self, market_question: str) -> bm.Prediction:
         return bm.Prediction(
-            evaluation=EvaluatedQuestion(
-                question=market_question,
-                is_predictable=False,
-            ),
+            is_predictable=False,
             outcome_prediction=None,
         )
 
@@ -54,9 +47,7 @@ def dummy_agent_no_prediction() -> DummyAgentNoPrediction:
 
 
 def test_agent_prediction(dummy_agent: DummyAgent) -> None:
-    prediction = dummy_agent.evaluate_research_predict(
-        market_question="Will GNO go up?"
-    )
+    prediction = dummy_agent.check_and_predict(market_question="Will GNO go up?")
     assert prediction.outcome_prediction is not None
     assert prediction.outcome_prediction.p_yes == 0.6
     assert prediction.outcome_prediction.confidence == 0.8
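
Illustrative note, not part of the patch: a minimal sketch of how the reworked agent interface above could be exercised, assuming only names introduced in this diff (AbstractBenchmarkedAgent, Prediction, OutcomePrediction, check_and_predict, check_and_predict_restricted). The AlwaysYesAgent class and the example question are hypothetical.

# Illustrative only -- assumes the module layout from this diff.
from datetime import datetime

from prediction_market_agent_tooling.benchmark.agents import AbstractBenchmarkedAgent
from prediction_market_agent_tooling.benchmark.utils import OutcomePrediction, Prediction


class AlwaysYesAgent(AbstractBenchmarkedAgent):
    """Hypothetical agent: only predict() and predict_restricted() need overriding now."""

    def predict(self, market_question: str) -> Prediction:
        # is_predictable defaults to True on Prediction, so only the outcome is set here.
        return Prediction(
            outcome_prediction=OutcomePrediction(p_yes=0.9, confidence=1.0, info_utility=None)
        )

    def predict_restricted(
        self, market_question: str, time_restriction_up_to: datetime
    ) -> Prediction:
        # A real agent would only use data available before time_restriction_up_to.
        return self.predict(market_question)


agent = AlwaysYesAgent(agent_name="always-yes")
# Open market: the unrestricted path.
print(agent.check_and_predict(market_question="Will GNO go up?"))
# Resolved market: benchmark.py routes through the time-restricted path instead.
print(
    agent.check_and_predict_restricted(
        market_question="Will GNO go up?", time_restriction_up_to=datetime(2024, 1, 1)
    )
)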