diff --git a/prediction_market_agent_tooling/benchmark/agents.py b/prediction_market_agent_tooling/benchmark/agents.py
index 2ea68015..92563e3c 100644
--- a/prediction_market_agent_tooling/benchmark/agents.py
+++ b/prediction_market_agent_tooling/benchmark/agents.py
@@ -1,8 +1,8 @@
 import random
 import typing as t
+from datetime import datetime
 
 from prediction_market_agent_tooling.benchmark.utils import (
-    EvaluatedQuestion,
     OutcomePrediction,
     Prediction,
 )
@@ -13,44 +13,72 @@ def __init__(self, agent_name: str, max_workers: t.Optional[int] = None):
         self.agent_name = agent_name
         self.max_workers = max_workers  # Limit the number of workers that can run this worker in parallel threads
 
-    def evaluate(self, market_question: str) -> EvaluatedQuestion:
-        raise NotImplementedError
+    def is_predictable(self, market_question: str) -> bool:
+        """
+        Override if the agent can decide to not predict the question, before doing the hard work.
+        """
+        return True
 
-    def research(self, market_question: str) -> t.Optional[str]:
+    def predict(self, market_question: str) -> Prediction:
+        """
+        Predict the outcome of the market question.
+        """
         raise NotImplementedError
 
-    def predict(
-        self, market_question: str, researched: str, evaluated: EvaluatedQuestion
+    def check_and_predict(self, market_question: str) -> Prediction:
+        is_predictable = self.is_predictable(market_question=market_question)
+        if not is_predictable:
+            return Prediction(is_predictable=is_predictable)
+        return self.predict(market_question=market_question)
+
+    def is_predictable_restricted(
+        self,
+        market_question: str,
+        time_restriction_up_to: datetime,
+    ) -> bool:
+        """
+        Override if the agent can decide to not predict the question, before doing the hard work.
+
+        Data used for the evaluation must be restricted to the time_restriction_up_to.
+        """
+        return True
+
+    def predict_restricted(
+        self,
+        market_question: str,
+        time_restriction_up_to: datetime,
     ) -> Prediction:
+        """
+        Predict the outcome of the market question.
+
+        Data used for the prediction must be restricted to the time_restriction_up_to.
+        """
         raise NotImplementedError
 
-    def evaluate_research_predict(self, market_question: str) -> Prediction:
-        eval = self.evaluate(market_question=market_question)
-        if not eval.is_predictable:
-            return Prediction(evaluation=eval)
-        researched = self.research(market_question=market_question)
-        if researched is None:
-            return Prediction(evaluation=eval)
-        return self.predict(
+    def check_and_predict_restricted(
+        self,
+        market_question: str,
+        time_restriction_up_to: datetime,
+    ) -> Prediction:
+        """
+        Data used must be restricted to the time_restriction_up_to.
+        """
+        is_predictable = self.is_predictable_restricted(
+            market_question=market_question,
+            time_restriction_up_to=time_restriction_up_to,
+        )
+        if not is_predictable:
+            return Prediction(is_predictable=is_predictable)
+        return self.predict_restricted(
             market_question=market_question,
-            researched=researched,
-            evaluated=eval,
+            time_restriction_up_to=time_restriction_up_to,
         )
 
 
 class RandomAgent(AbstractBenchmarkedAgent):
-    def evaluate(self, market_question: str) -> EvaluatedQuestion:
-        return EvaluatedQuestion(question=market_question, is_predictable=True)
-
-    def research(self, market_question: str) -> str:
-        return ""  # No research for a random agent, but can't be None.
-
-    def predict(
-        self, market_question: str, researched: str, evaluated: EvaluatedQuestion
-    ) -> Prediction:
+    def predict(self, market_question: str) -> Prediction:
         p_yes, confidence = random.random(), random.random()
         return Prediction(
-            evaluation=evaluated,
             outcome_prediction=OutcomePrediction(
                 p_yes=p_yes,
                 confidence=confidence,
@@ -58,6 +86,11 @@ def predict(
             ),
         )
 
+    def predict_restricted(
+        self, market_question: str, time_restriction_up_to: datetime
+    ) -> Prediction:
+        return self.predict(market_question)
+
 
 class FixedAgent(AbstractBenchmarkedAgent):
     def __init__(
@@ -66,21 +99,17 @@ def __init__(
         super().__init__(agent_name, max_workers)
         self.fixed_answer = fixed_answer
 
-    def evaluate(self, market_question: str) -> EvaluatedQuestion:
-        return EvaluatedQuestion(question=market_question, is_predictable=True)
-
-    def research(self, market_question: str) -> str:
-        return ""  # No research for a fixed agent, but can't be None.
-
-    def predict(
-        self, market_question: str, researched: str, evaluated: EvaluatedQuestion
-    ) -> Prediction:
+    def predict(self, market_question: str) -> Prediction:
         p_yes, confidence = 1.0 if self.fixed_answer else 0.0, 1.0
         return Prediction(
-            evaluation=evaluated,
             outcome_prediction=OutcomePrediction(
                 p_yes=p_yes,
                 confidence=confidence,
                 info_utility=None,
             ),
         )
+
+    def predict_restricted(
+        self, market_question: str, time_restriction_up_to: datetime
+    ) -> Prediction:
+        return self.predict(market_question)
diff --git a/prediction_market_agent_tooling/benchmark/benchmark.py b/prediction_market_agent_tooling/benchmark/benchmark.py
index 2e060c68..c349fc4a 100644
--- a/prediction_market_agent_tooling/benchmark/benchmark.py
+++ b/prediction_market_agent_tooling/benchmark/benchmark.py
@@ -13,6 +13,7 @@
 from prediction_market_agent_tooling.benchmark.agents import AbstractBenchmarkedAgent
 from prediction_market_agent_tooling.benchmark.utils import (
     Market,
+    MarketResolution,
     Prediction,
     PredictionsCache,
     get_llm_api_call_cost,
@@ -134,8 +135,13 @@ def run_agents(self, enable_timing: bool = True) -> None:
         def get_prediction_result(market: Market) -> tuple[str, Prediction]:
             with get_openai_callback() as cb:
                 start = time.time()
-                prediction = agent.evaluate_research_predict(
-                    market_question=market.question
+                prediction = (
+                    agent.check_and_predict(market_question=market.question)
+                    if not market.is_resolved
+                    else agent.check_and_predict_restricted(
+                        market_question=market.question,
+                        time_restriction_up_to=market.created_time,  # TODO: Add support for resolved_at and any time in between.
+                    )
                 )
 
                 prediction.time = time.time() - start if enable_timing else None
@@ -263,8 +269,9 @@ def _compute_correct_outcome_percentage(
         correct_outcome_count = 0
 
         for p, m in zip(predictions, markets):
-            if (check_not_none(p.outcome_prediction).p_yes > 0.5 and m.p_yes > 0.5) or (
-                check_not_none(p.outcome_prediction).p_yes < 0.5 and m.p_yes < 0.5
+            if (
+                check_not_none(p.outcome_prediction).probable_resolution
+                == m.probable_resolution
             ):
                 correct_outcome_count += 1
 
@@ -279,8 +286,18 @@ def _compute_precision_and_recall_percentages(
         if not predictions:
             return None, None
 
-        ground_truth = [m.p_yes > 0.5 for m in markets]
-        y_pred = [check_not_none(p.outcome_prediction).p_yes > 0.5 for p in predictions]
+        ground_truth = [
+            (1 if m.probable_resolution == MarketResolution.YES else 0) for m in markets
+        ]
+        y_pred = [
+            (
+                1
+                if check_not_none(p.outcome_prediction).probable_resolution
+                == MarketResolution.YES
+                else 0
+            )
+            for p in predictions
+        ]
 
         precision = precision_score(
             ground_truth, y_pred, pos_label=pos_label, zero_division=0.0
         )
@@ -332,9 +349,7 @@ def _compute_mean_time(
     def _compute_ratio_evaluated_as_answerable(
         self, predictions: t.List[Prediction], markets: t.List[Market]
     ) -> float:
-        return sum(
-            1 for p in predictions if p.evaluation and p.evaluation.is_predictable
-        ) / len(predictions)
+        return sum(1 for p in predictions if p.is_predictable) / len(predictions)
 
     def _compute_ratio_answered(
         self, predictions: t.List[Prediction], markets: t.List[Market]
@@ -374,27 +389,27 @@ def get_markets_summary(self) -> t.Dict[str, t.List[str | float]]:
             ]
             markets_summary[f"{agent} p_yes"] = [
                 (
-                    p.outcome_prediction.p_yes
-                    if p.evaluation
-                    and p.evaluation.is_predictable
+                    f"{p.outcome_prediction.p_yes:.2f} [{p.outcome_prediction.probable_resolution.value}]"
+                    if p.is_predictable
                     and p.outcome_prediction  # Is answerable and answered
-                    else "N/A"
-                    if not p.evaluation
-                    and not p.outcome_prediction  # Not evaluated for some reason
-                    else "S"
-                    if p.evaluation
-                    and not p.evaluation.is_predictable  # Skipped (evaluated to be not predictable)
-                    else "F"
-                    if p.evaluation
-                    and p.evaluation.is_predictable
-                    and not p.outcome_prediction  # Failed (no prediction)
-                    else should_not_happen(
-                        f"Unexpected case in get_markets_summary() for {p}."
+                    else (
+                        "S"
+                        if not p.is_predictable  # Skipped (evaluated to be not predictable)
+                        else (
+                            "F"
+                            if p.is_predictable
+                            and not p.outcome_prediction  # Failed (no prediction)
+                            else should_not_happen(
+                                f"Unexpected case in get_markets_summary() for {p}."
+                            )
+                        )
                     )
                 )
                 for p in agent_predictions
             ]
-        markets_summary[f"reference p_yes"] = [m.p_yes for m in self.markets]
+        markets_summary[f"reference p_yes"] = [
+            f"{m.p_yes:.2f} [{m.probable_resolution}]" for m in self.markets
+        ]
         return markets_summary
 
@@ -409,7 +424,6 @@ def calculate_expected_returns(
         # TODO: Add support for different bet sizes -- if we bet a low amount (such as <10 units), the real shares will be very close to that we calculate below (bet_units / share_price),
         # but if one bets a lot, it will change the share price along the way, and so he/she receives less than `bet_units / share_price`, but it's more complicated to calculate.
         bet_units = 10  # Assuming the agent always bet 10 units per market.
-        buy_yes_threshold = 0.5  # If the agent's prediction is > 50% it should buy "yes", otherwise "no".
 
         assert prediction.outcome_prediction is not None
         # Assume that market starts at 50/50 and so the price is 0.5 at the time we are buying it,
@@ -417,13 +431,13 @@
         # as it's the same as the probability.
         yes_shares = (
             bet_units / 0.5  # market.yes_outcome_price
-            if prediction.outcome_prediction.p_yes > buy_yes_threshold
+            if prediction.outcome_prediction.probable_resolution == MarketResolution.YES
             and market.yes_outcome_price > 0
             else 0
         )
         no_shares = (
             bet_units / 0.5  # market.no_outcome_price
-            if prediction.outcome_prediction.p_yes <= buy_yes_threshold
+            if prediction.outcome_prediction.probable_resolution == MarketResolution.NO
             and market.no_outcome_price > 0
             else 0
         )
diff --git a/prediction_market_agent_tooling/benchmark/utils.py b/prediction_market_agent_tooling/benchmark/utils.py
index 8563da4a..e74998d6 100644
--- a/prediction_market_agent_tooling/benchmark/utils.py
+++ b/prediction_market_agent_tooling/benchmark/utils.py
@@ -7,12 +7,9 @@
 import requests
 from pydantic import BaseModel, validator
 
-MANIFOLD_API_LIMIT = 1000  # Manifold will only return up to 1000 markets
-
+from prediction_market_agent_tooling.tools.utils import should_not_happen
 
-class EvaluatedQuestion(BaseModel):
-    question: str
-    is_predictable: bool
+MANIFOLD_API_LIMIT = 1000  # Manifold will only return up to 1000 markets
 
 
 class MarketSource(str, Enum):
@@ -20,15 +17,24 @@ class MarketSource(str, Enum):
     POLYMARKET = "polymarket"
 
 
+class MarketFilter(str, Enum):
+    open = "open"
+    resolved = "resolved"
+
+
+class MarketResolution(str, Enum):
+    YES = "yes"
+    NO = "no"
+
+
 class Market(BaseModel):
     source: MarketSource
     question: str
     url: str
     p_yes: float
     volume: float
-    is_resolved: bool
     created_time: datetime
-    resolution: str | None = None
+    resolution: MarketResolution | None = None
     outcomePrices: list[float] | None = None
 
     @validator("outcomePrices", pre=True)
@@ -45,6 +51,10 @@ def _validate_created_time(cls, value: datetime) -> datetime:
             value = value.replace(tzinfo=pytz.UTC)
         return value
 
+    @property
+    def is_resolved(self) -> bool:
+        return self.resolution is not None
+
     @property
     def p_no(self) -> float:
         return 1 - self.p_yes
@@ -59,6 +69,16 @@ def no_outcome_price(self) -> float:
         # Use the outcome price if available, otherwise assume it's p_yes.
         return self.outcomePrices[1] if self.outcomePrices else 1 - self.p_yes
 
+    @property
+    def probable_resolution(self) -> MarketResolution:
+        return (
+            self.resolution
+            if self.resolution is not None
+            else MarketResolution.YES
+            if self.p_yes > 0.5
+            else MarketResolution.NO
+        )
+
 
 class OutcomePrediction(BaseModel):
     p_yes: float
@@ -66,12 +86,12 @@
     confidence: float
     info_utility: t.Optional[float]
 
     @property
-    def binary_answer(self) -> bool:
-        return self.p_yes > 0.5
+    def probable_resolution(self) -> MarketResolution:
+        return MarketResolution.YES if self.p_yes > 0.5 else MarketResolution.NO
 
 
 class Prediction(BaseModel):
-    evaluation: t.Optional[EvaluatedQuestion] = None
+    is_predictable: bool = True
     outcome_prediction: t.Optional[OutcomePrediction] = None
     time: t.Optional[float] = None
@@ -142,16 +162,24 @@ def get_manifold_markets(
         m["source"] = MarketSource.MANIFOLD
 
     # Map JSON fields to Market fields
-    fields_map = {
-        "probability": "p_yes",
-        "isResolved": "is_resolved",
-        "createdTime": "created_time",
+    fields_map = {"probability": "p_yes", "createdTime": "created_time"}
+    process_values = {
+        "resolution": lambda v: v.lower() if v else None,
     }
 
-    def _map_fields(old: dict[str, str], mapping: dict[str, str]) -> dict[str, str]:
-        return {mapping.get(k, k): v for k, v in old.items()}
+    def _map_fields(
+        old: dict[str, str],
+        mapping: dict[str, str],
+        processing: dict[str, t.Callable[[t.Any], t.Any]],
+    ) -> dict[str, str]:
+        return {
+            mapping.get(k, k): processing.get(k, lambda x: x)(v) for k, v in old.items()
+        }
 
-    markets = [Market.model_validate(_map_fields(m, fields_map)) for m in markets_json]
+    markets = [
+        Market.model_validate(_map_fields(m, fields_map, process_values))
+        for m in markets_json
+    ]
 
     return markets
@@ -241,6 +269,21 @@ def get_polymarket_markets(
         if excluded_questions and m_json["question"] in excluded_questions:
             continue
 
+        resolution = (
+            MarketResolution.YES
+            if closed and m_json["outcomePrices"][0] == "1.0"
+            else (
+                MarketResolution.NO
+                if closed and m_json["outcomePrices"][1] == "1.0"
+                else (
+                    should_not_happen()
+                    if closed
+                    and m_json["outcomePrices"] not in (["1.0", "0.0"], ["0.0", "1.0"])
+                    else None
+                )
+            )
+        )
+
         markets.append(
             Market(
                 question=m_json["question"],
@@ -251,7 +294,7 @@
                 created_time=m_json["created_at"],
                 outcomePrices=m_json["outcomePrices"],
                 volume=m_json["volume"],
-                is_resolved=False,
+                resolution=resolution,
                 source=MarketSource.POLYMARKET,
             )
         )
@@ -261,15 +304,26 @@
 def get_markets(
     number: int,
     source: MarketSource,
+    filter_: MarketFilter = MarketFilter.open,
     excluded_questions: set[str] | None = None,
 ) -> t.List[Market]:
     if source == MarketSource.MANIFOLD:
         return get_manifold_markets_paged(
-            number=number, excluded_questions=excluded_questions
+            number=number, excluded_questions=excluded_questions, filter_=filter_.value
         )
     elif source == MarketSource.POLYMARKET:
         return get_polymarket_markets(
-            limit=number, excluded_questions=excluded_questions
+            limit=number,
+            excluded_questions=excluded_questions,
+            closed=(
+                True
+                if filter_ == MarketFilter.resolved
+                else (
+                    False
+                    if filter_ == MarketFilter.open
+                    else should_not_happen(f"Unknown filter {filter_} for polymarket.")
+                )
+            ),
         )
     else:
         raise ValueError(f"Unknown market source: {source}")
@@ -304,7 +358,3 @@ def get_llm_api_call_cost(
     model_cost += model_costs[model]["completion_tokens"] * completion_tokens
     model_cost /= 1000
     return model_cost
-
-
-def should_not_happen(message: str, E: t.Type[Exception] = RuntimeError) -> t.NoReturn:
-    raise E(message)
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 094df17d..9d28c418 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -4,7 +4,6 @@
 import prediction_market_agent_tooling.benchmark.benchmark as bm
 from prediction_market_agent_tooling.benchmark.utils import (
-    EvaluatedQuestion,
     MarketSource,
     OutcomePrediction,
     get_markets,
@@ -15,12 +14,9 @@ class DummyAgent(bm.AbstractBenchmarkedAgent):
     def __init__(self) -> None:
         super().__init__(agent_name="dummy")
 
-    def evaluate_research_predict(self, market_question: str) -> bm.Prediction:
+    def check_and_predict(self, market_question: str) -> bm.Prediction:
         return bm.Prediction(
-            evaluation=EvaluatedQuestion(
-                question=market_question,
-                is_predictable=True,
-            ),
+            is_predictable=True,
             outcome_prediction=OutcomePrediction(
                 p_yes=0.6,
                 confidence=0.8,
@@ -38,12 +34,9 @@ class DummyAgentNoPrediction(bm.AbstractBenchmarkedAgent):
     def __init__(self) -> None:
         super().__init__(agent_name="dummy_no_prediction")
 
-    def evaluate_research_predict(self, market_question: str) -> bm.Prediction:
+    def check_and_predict(self, market_question: str) -> bm.Prediction:
         return bm.Prediction(
-            evaluation=EvaluatedQuestion(
-                question=market_question,
-                is_predictable=False,
-            ),
+            is_predictable=False,
             outcome_prediction=None,
         )
 
@@ -54,9 +47,7 @@ def dummy_agent_no_prediction() -> DummyAgentNoPrediction:
 
 
 def test_agent_prediction(dummy_agent: DummyAgent) -> None:
-    prediction = dummy_agent.evaluate_research_predict(
-        market_question="Will GNO go up?"
-    )
+    prediction = dummy_agent.check_and_predict(market_question="Will GNO go up?")
     assert prediction.outcome_prediction is not None
     assert prediction.outcome_prediction.p_yes == 0.6
     assert prediction.outcome_prediction.confidence == 0.8
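
Illustrative note, not part of the patch: a minimal sketch of how the reworked agent interface above could be exercised, assuming only names introduced in this diff (AbstractBenchmarkedAgent, Prediction, OutcomePrediction, check_and_predict, check_and_predict_restricted). The AlwaysYesAgent class and the example question are hypothetical.

# Illustrative only -- assumes the module layout from this diff.
from datetime import datetime

from prediction_market_agent_tooling.benchmark.agents import AbstractBenchmarkedAgent
from prediction_market_agent_tooling.benchmark.utils import OutcomePrediction, Prediction


class AlwaysYesAgent(AbstractBenchmarkedAgent):
    """Hypothetical agent: only predict() and predict_restricted() need overriding now."""

    def predict(self, market_question: str) -> Prediction:
        # is_predictable defaults to True on Prediction, so only the outcome is set here.
        return Prediction(
            outcome_prediction=OutcomePrediction(p_yes=0.9, confidence=1.0, info_utility=None)
        )

    def predict_restricted(
        self, market_question: str, time_restriction_up_to: datetime
    ) -> Prediction:
        # A real agent would only use data available before time_restriction_up_to.
        return self.predict(market_question)


agent = AlwaysYesAgent(agent_name="always-yes")
# Open market: the unrestricted path.
print(agent.check_and_predict(market_question="Will GNO go up?"))
# Resolved market: benchmark.py routes through the time-restricted path instead.
print(
    agent.check_and_predict_restricted(
        market_question="Will GNO go up?", time_restriction_up_to=datetime(2024, 1, 1)
    )
)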