diff --git a/prediction_market_agent_tooling/deploy/agent.py b/prediction_market_agent_tooling/deploy/agent.py
index c23c94d5..531891a9 100644
--- a/prediction_market_agent_tooling/deploy/agent.py
+++ b/prediction_market_agent_tooling/deploy/agent.py
@@ -66,6 +66,7 @@
 )
 from prediction_market_agent_tooling.tools.hexbytes_custom import HexBytes
 from prediction_market_agent_tooling.tools.ipfs.ipfs_handler import IPFSHandler
+from prediction_market_agent_tooling.tools.is_invalid import is_invalid
 from prediction_market_agent_tooling.tools.is_predictable import is_predictable_binary
 from prediction_market_agent_tooling.tools.langfuse_ import langfuse_context, observe
 from prediction_market_agent_tooling.tools.utils import DatetimeUTC, utcnow
@@ -295,6 +296,7 @@ class DeployableTraderAgent(DeployableAgent):
     bet_on_n_markets_per_run: int = 1
     min_required_balance_to_operate: xDai | None = xdai_type(1)
     min_balance_to_keep_in_native_currency: xDai | None = xdai_type(0.1)
+    allow_invalid_questions: bool = False

     def __init__(
         self,
@@ -403,6 +405,9 @@ def verify_market(self, market_type: MarketType, market: AgentMarket) -> bool:
         if not is_predictable_binary(market.question):
             return False

+        if not self.allow_invalid_questions and is_invalid(market.question):
+            return False
+
         return True

     def answer_binary_market(self, market: AgentMarket) -> ProbabilisticAnswer | None:
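Since the new flag is a plain class attribute, a downstream agent can opt back in to invalid questions with a one-line override. A minimal sketch (the `PermissiveTraderAgent` name is made up for illustration):

```python
from prediction_market_agent_tooling.deploy.agent import DeployableTraderAgent


class PermissiveTraderAgent(DeployableTraderAgent):
    # Hypothetical subclass for illustration: with this override, the
    # `not self.allow_invalid_questions` condition short-circuits, so
    # verify_market() never calls is_invalid() and such markets are kept.
    allow_invalid_questions: bool = True
```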
diff --git a/prediction_market_agent_tooling/tools/is_invalid.py b/prediction_market_agent_tooling/tools/is_invalid.py
new file mode 100644
index 00000000..0532294f
--- /dev/null
+++ b/prediction_market_agent_tooling/tools/is_invalid.py
@@ -0,0 +1,92 @@
+import tenacity
+
+from prediction_market_agent_tooling.config import APIKeys
+from prediction_market_agent_tooling.loggers import logger
+from prediction_market_agent_tooling.tools.cache import persistent_inmemory_cache
+from prediction_market_agent_tooling.tools.is_predictable import (
+    parse_decision_yes_no_completion,
+)
+from prediction_market_agent_tooling.tools.langfuse_ import (
+    get_langfuse_langchain_config,
+    observe,
+)
+from prediction_market_agent_tooling.tools.utils import (
+    LLM_SEED,
+    LLM_SUPER_LOW_TEMPERATURE,
+)
+
+# I tried to make it return JSON, but that didn't work well in combination with asking it to do chain of thought.
+# Rules are almost copy-pasted from https://cdn.kleros.link/ipfs/QmZM12kkguXFk2C94ykrKpambt4iUVKsVsxGxDEdLS68ws/omen-rules.pdf,
+# with some small prompting modifications. I removed the point about "The outcome of the market must be known by its Resolution Date.", because that can not be verified beforehand,
+# and also the points about "in which none of the answers are valid will resolve as invalid" and "in which multiple answers are valid will resolve as invalid", because beforehand we can not know whether one of the outcomes happened or not.
+QUESTION_IS_INVALID_PROMPT = """Main signs of an invalid question (sometimes referred to as a "market"):
+- The market's question is about immoral violence, death or assassination.
+- The violent event can be caused by a single conscious being.
+- The violent event is done illegally.
+- The market should not directly incentivize immoral violent actions (such as murder, rape or unjust imprisonment) which could likely be performed by any participant.
+- Invalid: Will Donald Trump be alive on the 01/12/2021? (Anyone could bet on “No” and kill him for a guaranteed profit. Anyone could bet on “Yes” to effectively put a bounty on his head).
+- Invalid: Will Hera be a victim of swatting in 2020? (Anyone could falsely call the emergency services on him in order to win the bet).
+- This does not prevent markets:
+  - Whose topics are violent events not caused by conscious beings.
+    - Valid: How many people will die from COVID19 in 2020? (Viruses don’t use prediction markets).
+  - Whose main source of uncertainty is not related to a potential violent action.
+    - Valid: Will Trump win the 2020 US presidential election? (The main source of uncertainty is the vote of US citizens, not a potential murder of a presidential candidate).
+  - Which could give an incentive only to specific participants to commit an immoral violent action, but are in practice unlikely.
+    - Valid: Will the US be engaged in a military conflict with a UN member state in 2021? (It’s unlikely for the US to declare war in order to win a bet on this market).
+    - Valid: Will Derek Chauvin go to jail for the murder of George Floyd? (It’s unlikely that the jurors would collude to make a wrong verdict in order to win this market).
+- Questions with relative dates will resolve as invalid. Dates must be stated in absolute terms, not relative to the current time.
+- Invalid: Who will be the president of the United States in 6 months? (“in 6 months” depends on the current time).
+- Questions about moral values and not facts will be resolved as invalid.
+- Invalid: “Is it ethical to eat meat?”.
+
+Follow a chain of thought to evaluate whether the question is invalid:
+
+First, write down the parts of the following question:
+
+"{question}"
+
+Then, write down what the future event of the question is, what it refers to, and when that event will happen, if the question contains it.
+
+Then, explain why you think it is or isn't invalid.
+
+Finally, write your final decision: write `decision: ` followed by either "yes it is invalid" or "no it isn't invalid". Don't write anything else after that. You must include "yes" or "no".
+"""
+
+
+@persistent_inmemory_cache
+@tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_fixed(1))
+@observe()
+def is_invalid(
+    question: str,
+    engine: str = "gpt-4o",
+    temperature: float = LLM_SUPER_LOW_TEMPERATURE,
+    seed: int = LLM_SEED,
+    prompt_template: str = QUESTION_IS_INVALID_PROMPT,
+    max_tokens: int = 1024,
+) -> bool:
+    """
+    Evaluate whether the question is invalid according to the rules in the prompt above
+    (e.g. it incentivizes violence, uses relative dates, or asks about moral values rather than facts).
+    """
+    try:
+        from langchain.prompts import ChatPromptTemplate
+        from langchain_openai import ChatOpenAI
+    except ImportError:
+        logger.error("langchain not installed, skipping is_invalid")
+        return True
+
+    llm = ChatOpenAI(
+        model=engine,
+        temperature=temperature,
+        seed=seed,
+        api_key=APIKeys().openai_api_key_secretstr_v1,
+    )
+
+    prompt = ChatPromptTemplate.from_template(template=prompt_template)
+    messages = prompt.format_messages(question=question)
+    completion = str(
+        llm.invoke(
+            messages, max_tokens=max_tokens, config=get_langfuse_langchain_config()
+        ).content
+    )
+
+    return parse_decision_yes_no_completion(question, completion)
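`is_invalid` reuses `parse_decision_yes_no_completion` from `is_predictable.py`, whose implementation is not part of this diff. Based on the `decision: ` convention the prompt enforces, it presumably behaves roughly like this hypothetical sketch (a simplification for readers, not the actual helper):

```python
def parse_decision_yes_no_completion_sketch(question: str, completion: str) -> bool:
    # Hypothetical stand-in for parse_decision_yes_no_completion (not shown here).
    # Take everything after the last `decision:` marker the prompt asks the model
    # to emit (falling back to the whole completion if the marker is missing)...
    decision = completion.lower().rsplit("decision:", maxsplit=1)[-1]
    # ...and map the required "yes"/"no" keyword to a boolean.
    if "yes" in decision:
        return True
    if "no" in decision:
        return False
    raise ValueError(f"Could not parse the decision for question: {question}")
```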
+ """ + try: + from langchain.prompts import ChatPromptTemplate + from langchain_openai import ChatOpenAI + except ImportError: + logger.error("langchain not installed, skipping is_invalid") + return True + + llm = ChatOpenAI( + model=engine, + temperature=temperature, + seed=seed, + api_key=APIKeys().openai_api_key_secretstr_v1, + ) + + prompt = ChatPromptTemplate.from_template(template=prompt_template) + messages = prompt.format_messages(question=question) + completion = str( + llm.invoke( + messages, max_tokens=max_tokens, config=get_langfuse_langchain_config() + ).content + ) + + return parse_decision_yes_no_completion(question, completion) diff --git a/prediction_market_agent_tooling/tools/utils.py b/prediction_market_agent_tooling/tools/utils.py index d0c9876e..42ac1a03 100644 --- a/prediction_market_agent_tooling/tools/utils.py +++ b/prediction_market_agent_tooling/tools/utils.py @@ -22,8 +22,10 @@ T = TypeVar("T") # t=0 is mathematically impossible and it's not clear how OpenAI (and others) handle it, as a result, even with t=0, gpt-4-turbo produces very different outputs, -# it seems that using a very low temperature is the best way to have as consistent outputs as possible: https://community.openai.com/t/why-the-api-output-is-inconsistent-even-after-the-temperature-is-set-to-0/329541/12 +# see this experiment to figure out if you should use LLM_SUPER_LOW_TEMPERATURE or just 0: https://github.com/gnosis/prediction-market-agent/pull/438. LLM_SUPER_LOW_TEMPERATURE = 0.00000001 +# For consistent results, also include seed for models that supports it. +LLM_SEED = 0 def check_not_none( diff --git a/pyproject.toml b/pyproject.toml index 1e2b11c0..98f6d552 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "prediction-market-agent-tooling" -version = "0.51.0" +version = "0.51.1" description = "Tools to benchmark, deploy and monitor prediction market agents." authors = ["Gnosis"] readme = "README.md" diff --git a/tests/tools/test_is_invalid.py b/tests/tools/test_is_invalid.py new file mode 100644 index 00000000..a34f69e9 --- /dev/null +++ b/tests/tools/test_is_invalid.py @@ -0,0 +1,81 @@ +import pytest + +from prediction_market_agent_tooling.tools.is_invalid import is_invalid +from tests.utils import RUN_PAID_TESTS + + +@pytest.mark.skipif(not RUN_PAID_TESTS, reason="This test costs money to run.") +@pytest.mark.parametrize( + "question, invalid", + [ + ( + "Russian nuke in space?", + False, + ), + ( + "Russian nuke in space by March 31?", + False, + ), + ( + "Russian nuke in space in 2024?", + False, + ), + ( + "Russian nuke in space in 24 days?", + True, # Invalid because the timeframe is relative. + ), + ( + "Will there be an AI language model that surpasses ChatGPT and other OpenAI models before the end of 2024?", + False, + ), + ("Will Vladimir Putin be the President of Russia at the end of 2024?", False), + ( + "This market resolves YES when an artificial agent is appointed to the board of directors of a S&P500 company, meanwhile every day I will bet M25 in NO.", + False, + ), + ( + "Will there be a >0 value liquidity event for me, a former Consensys Software Inc. employee, on my shares of the company?", + False, + ), + ( + "Will this market have an odd number of traders by the end of 2024?", + False, # Not invalid, but it should be filtered out by `is_predictable_binary` function. 
+ ), + ( + "Did COVID-19 come from a laboratory?", + False, + ), + ( + "What percentile did the median superforecaster get in the 2023 ACX prediction contest?", + False, # Not invalid, but should be filtered out by `is_predictable_binary` function as it isn't a Yes/No market. + ), + ( + "Will Hezbollah leader Hassan Nasrallah be confirmed dead by 6 October 2024?", + True, # Invalid, because this market could motivate people to do it. + ), + ( + "Will Netanyahu die before the end of October 7th 2024", + True, # Invalid, because this market could motivate people to do it. + ), + ( + "Will Travis James Mullis be executed in Texas on 1 October 2024?", + False, # Considered okay, becaues execution is done legally by the state, and so this market can not motivate people to do it. + ), + ( + "Will the second nitrogen gas execution in Alabama be carried out by 5 October 2024?", + False, # Considered okay, becaues execution is done legally by the state, and so this market can not motivate people to do it. + ), + ( + "Will someone beat me?", + True, # Invalid because it's violent. + ), + ( + "Is it okay to hunt?", + True, # Invalid because it's about ethicalness. + ), + ], +) +def test_is_invalid(question: str, invalid: bool) -> None: + assert ( + is_invalid(question=question) == invalid + ), f"Question is not evaluated correctly."