From 2c68a13bfe10b86f40e3eefc3fcfacb32c00b02a Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Sat, 5 Oct 2024 20:29:21 -0700 Subject: [PATCH] Code sync (#3546) --- README.md | 2 +- fastchat/constants.py | 13 +- fastchat/conversation.py | 242 ++++++++++++-- fastchat/model/model_adapter.py | 91 +++++- fastchat/model/model_registry.py | 208 +++++++++--- fastchat/serve/api_provider.py | 302 +++++++++++++----- fastchat/serve/call_monitor.py | 16 +- fastchat/serve/gradio_block_arena_anony.py | 44 ++- fastchat/serve/gradio_block_arena_named.py | 11 +- fastchat/serve/gradio_block_arena_vision.py | 195 +++++++---- .../serve/gradio_block_arena_vision_anony.py | 143 +++++---- .../serve/gradio_block_arena_vision_named.py | 175 ++++++++-- fastchat/serve/gradio_global_state.py | 12 + fastchat/serve/gradio_web_server.py | 53 ++- fastchat/serve/gradio_web_server_multi.py | 145 ++++++--- fastchat/serve/monitor/monitor.py | 116 +++++-- fastchat/serve/monitor/monitor_md.py | 17 +- fastchat/utils.py | 1 + playground/__init__.py | 0 .../benchmark/benchmark_api_provider.py | 135 ++++++++ 20 files changed, 1483 insertions(+), 438 deletions(-) create mode 100644 fastchat/serve/gradio_global_state.py create mode 100644 playground/__init__.py create mode 100644 playground/benchmark/benchmark_api_provider.py diff --git a/README.md b/README.md index f894276c67..e2465f46cd 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # FastChat -| [**Demo**](https://lmarena.ai/) | [**Discord**](https://discord.gg/HSWAKCrnFx) | [**X**](https://x.com/lmsysorg) | +| [**Demo**](https://lmarena.ai/) | [**Discord**](https://discord.gg/6GXcFg3TH8) | [**X**](https://x.com/lmsysorg) | FastChat is an open platform for training, serving, and evaluating large language model based chatbots. - FastChat powers Chatbot Arena ([lmarena.ai](https://lmarena.ai)), serving over 10 million chat requests for 70+ LLMs. diff --git a/fastchat/constants.py b/fastchat/constants.py index e0b9223fc4..e5c557cdf5 100644 --- a/fastchat/constants.py +++ b/fastchat/constants.py @@ -7,12 +7,13 @@ REPO_PATH = os.path.dirname(os.path.dirname(__file__)) -# Survey Link URL (to be removed) -SURVEY_LINK = """
-
- We would love your feedback! Fill out this short survey to tell us what you like about the arena, what you don't like, and what you want to see in the future. +# Survey Link URL (to be removed) #00729c +SURVEY_LINK = """
+
+ New Launch! Jailbreak models at RedTeam Arena.
""" +# SURVEY_LINK = "" ##### For the gradio web server SERVER_ERROR_MSG = ( @@ -27,7 +28,9 @@ MODERATION_MSG = "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES." CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION." INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE." -SLOW_MODEL_MSG = "⚠️ Both models will show the responses all at once. Please stay patient as it may take over 30 seconds." +SLOW_MODEL_MSG = ( + "⚠️ Models are thinking. Please stay patient as it may take over a minute." +) RATE_LIMIT_MSG = "**RATE LIMIT OF THIS MODEL IS REACHED. PLEASE COME BACK LATER OR USE [BATTLE MODE](https://lmarena.ai) (the 1st tab).**" # Maximum input length INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 12000)) diff --git a/fastchat/conversation.py b/fastchat/conversation.py index 792577e885..4a46103ec2 100644 --- a/fastchat/conversation.py +++ b/fastchat/conversation.py @@ -80,6 +80,9 @@ def get_prompt(self) -> str: ret = system_prompt + self.sep for role, message in self.messages: if message: + if type(message) is tuple: + message, images = message + message = IMAGE_PLACEHOLDER_STR * len(images) + message ret += role + ": " + message + self.sep else: ret += role + ":" @@ -377,7 +380,7 @@ def to_gradio_chatbot(self): ret[-1][-1] = msg return ret - def to_openai_vision_api_messages(self): + def to_openai_vision_api_messages(self, is_mistral=False): """Convert the conversation to OpenAI vision api completion format""" if self.system_message == "": ret = [] @@ -385,7 +388,7 @@ def to_openai_vision_api_messages(self): ret = [ { "role": "system", - "content": [{"type": "text", "text": self.system_message}], + "content": self.system_message, } ] @@ -396,21 +399,25 @@ def to_openai_vision_api_messages(self): image_urls = msg[1] for image in image_urls: image_url = image.to_openai_image_format() - content_list.append( - {"type": "image_url", "image_url": {"url": image_url}} - ) + content = {} + if is_mistral: + content = {"type": "image_url", "image_url": image_url} + else: + content = { + "type": "image_url", + "image_url": {"url": image_url}, + } + content_list.append(content) ret.append({"role": "user", "content": content_list}) else: - ret.append( - {"role": "user", "content": [{"type": "text", "text": msg}]} - ) + ret.append({"role": "user", "content": msg}) else: if msg is not None: ret.append( { "role": "assistant", - "content": [{"type": "text", "text": msg}], + "content": msg, } ) return ret @@ -524,6 +531,7 @@ def to_anthropic_vision_api_messages(self): def to_reka_api_messages(self): from fastchat.serve.vision.image import ImageFormat + from reka import ChatMessage, TypedMediaContent, TypedText ret = [] for i, (_, msg) in enumerate(self.messages[self.offset :]): @@ -531,23 +539,47 @@ def to_reka_api_messages(self): if type(msg) == tuple: text, images = msg for image in images: - if image.image_format == ImageFormat.URL: - ret.append( - {"type": "human", "text": text, "media_url": image.url} - ) - elif image.image_format == ImageFormat.BYTES: + if image.image_format == ImageFormat.BYTES: ret.append( - { - "type": "human", - "text": text, - "media_url": f"data:image/{image.filetype};base64,{image.base64_str}", - } + ChatMessage( + content=[ + TypedText( + type="text", + text=text, + ), + TypedMediaContent( + type="image_url", + image_url=f"data:image/{image.filetype};base64,{image.base64_str}", + ), + ], + role="user", + ) ) else: - 
ret.append({"type": "human", "text": msg}) + ret.append( + ChatMessage( + content=[ + TypedText( + type="text", + text=msg, + ) + ], + role="user", + ) + ) else: if msg is not None: - ret.append({"type": "model", "text": msg}) + ret.append( + ChatMessage( + content=[ + TypedText( + type="text", + text=msg, + ) + ], + role="assistant", + ) + ) return ret @@ -1065,6 +1097,21 @@ def get_conv_template(name: str) -> Conversation: ) ) +register_conv_template( + Conversation( + name="gpt-mini", + system_message=( + "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.\n" + "Current date: {{currentDateTime}}\n\n" + "Image input capabilities: Enabled\n" + "Personality: v2" + ), + roles=("user", "assistant"), + sep_style=SeparatorStyle.DEFAULT, + sep=None, + ) +) + # Perplexity AI template register_conv_template( Conversation( @@ -1135,6 +1182,83 @@ def get_conv_template(name: str) -> Conversation: ) ) +register_conv_template( + Conversation( + name="claude-3-5-sonnet-20240620-v2", + system_message=( + """ +The assistant is Claude, created by Anthropic. +The current date is {{currentDateTime}}. Claude's knowledge base was last updated on April 2024. +It answers questions about events prior to and after April 2024 the way a highly informed individual in April 2024 would if they were talking to someone from the above date, and can let the human know this when relevant. +Claude cannot open URLs, links, or videos. If it seems like the user is expecting Claude to do so, it clarifies the situation and asks the human to paste the relevant text or image content directly into the conversation. +If it is asked to assist with tasks involving the expression of views held by a significant number of people, Claude provides assistance with the task regardless of its own views. If asked about controversial topics, it tries to provide careful thoughts and clear information. +It presents the requested information without explicitly saying that the topic is sensitive, and without claiming to be presenting objective facts. +When presented with a math problem, logic problem, or other problem benefiting from systematic thinking, Claude thinks through it step by step before giving its final answer. +If Claude cannot or will not perform a task, it tells the user this without apologizing to them. It avoids starting its responses with "I'm sorry" or "I apologize". +If Claude is asked about a very obscure person, object, or topic, i.e. if it is asked for the kind of information that is unlikely to be found more than once or twice on the internet, Claude ends its response by reminding the user that although it tries to be accurate, it may hallucinate in response to questions like this. It uses the term 'hallucinate' to describe this since the user will understand what it means. +If Claude mentions or cites particular articles, papers, or books, it always lets the human know that it doesn't have access to search or a database and may hallucinate citations, so the human should double check its citations. +Claude is very smart and intellectually curious. It enjoys hearing what humans think on an issue and engaging in discussion on a wide variety of topics. +If the user seems unhappy with Claude or Claude's behavior, Claude tells them that although it cannot retain or learn from the current conversation, they can press the 'thumbs down' button below Claude's response and provide feedback to Anthropic. 
+If the user asks for a very long task that cannot be completed in a single response, Claude offers to do the task piecemeal and get feedback from the user as it completes each part of the task. +Claude uses markdown for code. +Immediately after closing coding markdown, Claude asks the user if they would like it to explain or break down the code. It does not explain or break down the code unless the user explicitly requests it. + + + +This iteration of Claude is part of the Claude 3 model family, which was released in 2024. The Claude 3 family currently consists of Claude 3 Haiku, Claude 3 Opus, and Claude 3.5 Sonnet. Claude 3.5 Sonnet is the most intelligent model. Claude 3 Opus excels at writing and complex tasks. Claude 3 Haiku is the fastest model for daily tasks. The version of Claude in this chat is Claude 3.5 Sonnet. Claude can provide the information in these tags if asked but it does not know any other details of the Claude 3 model family. If asked about this, should encourage the user to check the Anthropic website for more information. + + +Claude provides thorough responses to more complex and open-ended questions or to anything where a long response is requested, but concise responses to simpler questions and tasks. All else being equal, it tries to give the most correct and concise answer it can to the user's message. Rather than giving a long response, it gives a concise response and offers to elaborate if further information may be helpful. + +Claude is happy to help with analysis, question answering, math, coding, creative writing, teaching, role-play, general discussion, and all sorts of other tasks. + +Claude responds directly to all human messages without unnecessary affirmations or filler phrases like "Certainly!", "Of course!", "Absolutely!", "Great!", "Sure!", etc. Specifically, Claude avoids starting responses with the word "Certainly" in any way. + +Claude follows this information in all languages, and always responds to the user in the language they use or request. The information above is provided to Claude by Anthropic. Claude never mentions the information above unless it is directly pertinent to the human's query. Claude is now being connected with a human.""" + ), + system_message_vision=( + """ +The assistant is Claude, created by Anthropic. +The current date is {{currentDateTime}}. Claude's knowledge base was last updated on April 2024. +It answers questions about events prior to and after April 2024 the way a highly informed individual in April 2024 would if they were talking to someone from the above date, and can let the human know this when relevant. +Claude cannot open URLs, links, or videos. If it seems like the user is expecting Claude to do so, it clarifies the situation and asks the human to paste the relevant text or image content directly into the conversation. +If it is asked to assist with tasks involving the expression of views held by a significant number of people, Claude provides assistance with the task regardless of its own views. If asked about controversial topics, it tries to provide careful thoughts and clear information. +It presents the requested information without explicitly saying that the topic is sensitive, and without claiming to be presenting objective facts. +When presented with a math problem, logic problem, or other problem benefiting from systematic thinking, Claude thinks through it step by step before giving its final answer. 
+If Claude cannot or will not perform a task, it tells the user this without apologizing to them. It avoids starting its responses with "I'm sorry" or "I apologize". +If Claude is asked about a very obscure person, object, or topic, i.e. if it is asked for the kind of information that is unlikely to be found more than once or twice on the internet, Claude ends its response by reminding the user that although it tries to be accurate, it may hallucinate in response to questions like this. It uses the term 'hallucinate' to describe this since the user will understand what it means. +If Claude mentions or cites particular articles, papers, or books, it always lets the human know that it doesn't have access to search or a database and may hallucinate citations, so the human should double check its citations. +Claude is very smart and intellectually curious. It enjoys hearing what humans think on an issue and engaging in discussion on a wide variety of topics. +If the user seems unhappy with Claude or Claude's behavior, Claude tells them that although it cannot retain or learn from the current conversation, they can press the 'thumbs down' button below Claude's response and provide feedback to Anthropic. +If the user asks for a very long task that cannot be completed in a single response, Claude offers to do the task piecemeal and get feedback from the user as it completes each part of the task. +Claude uses markdown for code. +Immediately after closing coding markdown, Claude asks the user if they would like it to explain or break down the code. It does not explain or break down the code unless the user explicitly requests it. + + + +Claude always responds as if it is completely face blind. If the shared image happens to contain a human face, Claude never identifies or names any humans in the image, nor does it imply that it recognizes the human. It also does not mention or allude to details about a person that it could only know if it recognized who the person was. Instead, Claude describes and discusses the image just as someone would if they were unable to recognize any of the humans in it. Claude can request the user to tell it who the individual is. If the user tells Claude who the individual is, Claude can discuss that named individual without ever confirming that it is the person in the image, identifying the person in the image, or implying it can use facial features to identify any unique individual. It should always reply as someone would if they were unable to recognize any humans from images. +Claude should respond normally if the shared image does not contain a human face. Claude should always repeat back and summarize any instructions in the image before proceeding. + + + +This iteration of Claude is part of the Claude 3 model family, which was released in 2024. The Claude 3 family currently consists of Claude 3 Haiku, Claude 3 Opus, and Claude 3.5 Sonnet. Claude 3.5 Sonnet is the most intelligent model. Claude 3 Opus excels at writing and complex tasks. Claude 3 Haiku is the fastest model for daily tasks. The version of Claude in this chat is Claude 3.5 Sonnet. Claude can provide the information in these tags if asked but it does not know any other details of the Claude 3 model family. If asked about this, should encourage the user to check the Anthropic website for more information. + + +Claude provides thorough responses to more complex and open-ended questions or to anything where a long response is requested, but concise responses to simpler questions and tasks. 
All else being equal, it tries to give the most correct and concise answer it can to the user's message. Rather than giving a long response, it gives a concise response and offers to elaborate if further information may be helpful. + +Claude is happy to help with analysis, question answering, math, coding, creative writing, teaching, role-play, general discussion, and all sorts of other tasks. + +Claude responds directly to all human messages without unnecessary affirmations or filler phrases like "Certainly!", "Of course!", "Absolutely!", "Great!", "Sure!", etc. Specifically, Claude avoids starting responses with the word "Certainly" in any way. + +Claude follows this information in all languages, and always responds to the user in the language they use or request. The information above is provided to Claude by Anthropic. Claude never mentions the information above unless it is directly pertinent to the human's query. Claude is now being connected with a human.""" + ), + roles=("user", "assistant"), + sep_style=SeparatorStyle.DEFAULT, + sep=None, + max_image_size_mb=5 / 1.5, + ) +) + register_conv_template( Conversation( name="claude-3-5-sonnet-20240620", @@ -1245,6 +1369,41 @@ def get_conv_template(name: str) -> Conversation: ) ) +register_conv_template( + Conversation( + name="meta-llama-3.1", + system_message=( + """Cutting Knowledge Date: December 2023 +Today Date: {{currentDateTimev2}}""" + ), + roles=("user", "assistant"), + sep_style=SeparatorStyle.DEFAULT, + sep=None, + ) +) + +register_conv_template( + Conversation( + name="meta-llama-3.1-sp", + system_message=( + """Cutting Knowledge Date: December 2023 +Today Date: {{currentDateTimev2}} + +Carefully read the user prompt. Your responses are comprehensive and easy to understand. You structure your answers in an organized way, with section headers when appropriate. You use consistent formatting in your responses. You follow user instructions. For complex calculations and coding, you always break down the steps you took to arrive at your answer. + +Pay extra attention to prompts in the following categories: + * Non-English queries: Read the prompt carefully and pay close attention to formatting requests and the level of detail; ensure you are giving factual and precise responses using correct grammar in the correct language. + * Coding queries: You prioritize code organization and documentation. Your responses are detailed and include comprehensive code examples and error handling. Include comments to explain the code's purpose and behavior. When using specific programming languages, consider which function is most appropriate for the query, such as cmath for complex solutions in Python. Check for errors. + * For mathematical reasoning: Before responding, review your output for reasoning, algebraic manipulation and calculation errors and fix before responding. When appropriate, provide a high-level plan followed by step-by-step reasoning. 
+ +Remember your instructions.""" + ), + roles=("user", "assistant"), + sep_style=SeparatorStyle.DEFAULT, + sep=None, + ) +) + # MetaMath default template # reference: https://github.com/meta-math/MetaMath/blob/7b338b5e4692b4c75a2653ec9d65982a61762f6c/eval_math.py#L58 register_conv_template( @@ -1358,6 +1517,20 @@ def get_conv_template(name: str) -> Conversation: ) ) +register_conv_template( + Conversation( + name="gemini-1.5-pro-002-test-sp", + roles=("user", "model"), + sep_style=SeparatorStyle.DEFAULT, + sep=None, + system_message=( + "All questions should be answered comprehensively with details, " + "unless the user requests a concise response specifically. " + "Respond in the same language as the query." + ), + ) +) + # BiLLa default template register_conv_template( Conversation( @@ -2099,6 +2272,33 @@ def get_conv_template(name: str) -> Conversation: ) ) +register_conv_template( + Conversation( + name="grok-2", + system_message=( + "You are Grok-2, a smart and helpful AI assistant created by xAI. " + "Please think step by step, provide detailed and professional response." + ), + roles=("user", "assistant"), + sep_style=SeparatorStyle.DEFAULT, + sep=None, + ) +) + +register_conv_template( + Conversation( + name="grok-2-mini", + system_message=( + "You are Grok-2 mini, a smart and helpful AI assistant created by xAI. " + "Please think step by step, provide detailed and professional response." + ), + roles=("user", "assistant"), + sep_style=SeparatorStyle.DEFAULT, + sep=None, + ) +) + + if __name__ == "__main__": from fastchat.conversation import get_conv_template diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py index 92e19dbb78..9625df6dbf 100644 --- a/fastchat/model/model_adapter.py +++ b/fastchat/model/model_adapter.py @@ -84,7 +84,13 @@ "gpt2-chatbot", "im-also-a-good-gpt2-chatbot", "im-a-good-gpt2-chatbot", + "gpt-4o-mini-2024-07-18", "gpt-4o-2024-05-13", + "gpt-4o-2024-08-06", + "chatgpt-4o-latest-20240903", + "chatgpt-4o-latest", + "o1-preview", + "o1-mini", ) @@ -1118,8 +1124,20 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("gpt-4-turbo-2024-04-09") if "gpt2-chatbot" in model_path: return get_conv_template("gpt-4-turbo-2024-04-09") - if "gpt-4o" in model_path: + if "gpt-4o-2024-05-13" in model_path: return get_conv_template("gpt-4-turbo-2024-04-09") + if "gpt-4o-2024-08-06" in model_path: + return get_conv_template("gpt-mini") + if "anonymous-chatbot" in model_path: + return get_conv_template("gpt-4-turbo-2024-04-09") + if "chatgpt-4o-latest" in model_path: + return get_conv_template("gpt-4-turbo-2024-04-09") + if "gpt-mini" in model_path: + return get_conv_template("gpt-mini") + if "gpt-4o-mini-2024-07-18" in model_path: + return get_conv_template("gpt-mini") + if "o1" in model_path: + return get_conv_template("api_based_default") return get_conv_template("chatgpt") @@ -1167,7 +1185,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation: if "claude-3-sonnet" in model_path: return get_conv_template("claude-3-sonnet-20240229") if "claude-3-5-sonnet" in model_path: - return get_conv_template("claude-3-5-sonnet-20240620") + return get_conv_template("claude-3-5-sonnet-20240620-v2") if "claude-3-opus" in model_path: return get_conv_template("claude-3-opus-20240229") return get_conv_template("claude") @@ -1212,19 +1230,6 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("gemini") -class GeminiDevAdapter(BaseModelAdapter): - """The model 
adapter for Gemini 1.5 Pro""" - - def match(self, model_path: str): - return "gemini-1.5-pro" in model_path.lower() - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - raise NotImplementedError() - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("gemini-dev") - - class BiLLaAdapter(BaseModelAdapter): """The model adapter for Neutralzz/BiLLa-7B-SFT""" @@ -1575,7 +1580,7 @@ class Llama3Adapter(BaseModelAdapter): """The model adapter for Llama-3 (e.g., meta-llama/Meta-Llama-3-8B-Instruct)""" def match(self, model_path: str): - return "llama-3" in model_path.lower() + return "llama-3-" in model_path.lower() def load_model(self, model_path: str, from_pretrained_kwargs: dict): model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) @@ -1587,6 +1592,43 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("llama-3") +class Llama31Adapter(BaseModelAdapter): + """The model adapter for Llama-3 (e.g., meta-llama/Meta-Llama-3-8B-Instruct)""" + + def match(self, model_path: str): + keywords = [ + "llama-3.1", + ] + for keyword in keywords: + if keyword in model_path.lower(): + return True + + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) + model.config.eos_token_id = tokenizer.eos_token_id + model.config.pad_token_id = tokenizer.pad_token_id + return model, tokenizer + + def get_default_conv_template(self, model_path: str) -> Conversation: + if model_path.lower() in [ + "llama-3.1-8b-instruct", + "llama-3.1-70b-instruct", + "the-real-chatbot-v2", + ]: + return get_conv_template("meta-llama-3.1-sp") + return get_conv_template("meta-llama-3.1") + + +class GrokAdapter(BaseModelAdapter): + def match(self, model_path: str): + return "grok" in model_path.lower() + + def get_default_conv_template(self, model_path: str) -> Conversation: + if "mini" in model_path.lower(): + return get_conv_template("grok-2-mini") + return get_conv_template("grok-2") + + class CuteGPTAdapter(BaseModelAdapter): """The model adapter for CuteGPT""" @@ -2445,6 +2487,19 @@ def get_default_conv_template(self, model_path: str) -> Conversation: return get_conv_template("api_based_default") +class NoSystemAdapter(BaseModelAdapter): + def match(self, model_path: str): + keyword_list = ["athene-70b"] + + for keyword in keyword_list: + if keyword == model_path.lower(): + return True + return False + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("api_based_default") + + # Note: the registration order matters. # The one registered earlier has a higher matching priority. register_model_adapter(PeftModelAdapter) @@ -2470,7 +2525,6 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(BardAdapter) register_model_adapter(PaLM2Adapter) register_model_adapter(GeminiAdapter) -register_model_adapter(GeminiDevAdapter) register_model_adapter(GemmaAdapter) register_model_adapter(ChatGPTAdapter) register_model_adapter(AzureOpenAIAdapter) @@ -2545,6 +2599,9 @@ def get_default_conv_template(self, model_path: str) -> Conversation: register_model_adapter(RekaAdapter) register_model_adapter(SmaugChatAdapter) register_model_adapter(Llama3Adapter) +register_model_adapter(Llama31Adapter) +register_model_adapter(GrokAdapter) +register_model_adapter(NoSystemAdapter) # After all adapters, try the default base adapter. 
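# (Illustrative note, not part of the patch: per the "registration order matters" comment
#  above, adapters registered earlier take matching priority. That is presumably why this
#  patch narrows Llama3Adapter.match() from "llama-3" to "llama-3-": a path such as
#  "llama-3.1-70b-instruct" no longer matches Llama3Adapter and instead falls through to
#  the new Llama31Adapter, which selects the "meta-llama-3.1-sp" or "meta-llama-3.1"
#  conversation template.)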
register_model_adapter(BaseModelAdapter) diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index 984e6beb4e..2eed9649e1 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -40,36 +40,67 @@ def get_model_info(name: str) -> ModelInfo: "Yuan2.0 is a new generation Fundamental Large Language Model developed by IEIT System.", ) +register_model_info( + [ + "chatgpt-4o-latest", + "chatgpt-4o-latest-20240903", + "gpt-4o-mini-2024-07-18", + "gpt-4o-2024-08-06", + "gpt-4o-2024-05-13", + ], + "GPT-4o", + "https://openai.com/index/hello-gpt-4o/", + "The flagship model across audio, vision, and text by OpenAI", +) + +register_model_info( + [ + "grok-2-2024-08-13", + "grok-2-mini-2024-08-13", + ], + "Grok-2", + "https://x.ai/blog/grok-2", + "Grok-2 by xAI", +) + register_model_info( [ "claude-3-5-sonnet-20240620", - "claude-3-haiku-20240307", - "claude-3-sonnet-20240229", - "claude-3-opus-20240229", - "claude-2.1", - "claude-2.0", - "claude-1", ], - "Claude", - "https://www.anthropic.com/news/claude-3-family", + "Claude 3.5", + "https://www.anthropic.com/news/claude-3-5-sonnet", "Claude by Anthropic", ) register_model_info( - ["gemma-2-27b-it", "gemma-2-9b-it"], - "Gemma 2", - "https://ai.google.dev/gemma", - "Gemma 2 by Google", + [ + "llama-3.2-vision-90b-instruct", + "llama-3.2-vision-11b-instruct", + "llama-3.2-3b-instruct", + "llama-3.2-1b-instruct", + "llama-3.1-405b-instruct-bf16", + "llama-3.1-405b-instruct-fp8", + "llama-3.1-405b-instruct", + "llama-3.1-70b-instruct", + "llama-3.1-8b-instruct", + ], + "Llama 3.1", + "https://llama.meta.com/", + "Open foundation and chat models by Meta", ) register_model_info( [ + "gemini-1.5-pro-exp-0827", + "gemini-1.5-pro-exp-0801", + "gemini-1.5-flash-exp-0827", + "gemini-1.5-flash-8b-exp-0827", "gemini-1.5-pro-api-0409-preview", "gemini-1.5-pro-tuned", "gemini-1.5-pro-api-preview", "gemini-1.5-flash-api-preview", - "gemini-1.5-flash-api-0514", "gemini-1.5-pro-api-0514", + "gemini-1.5-flash-api-0514", "gemini-advanced-0514", ], "Gemini", @@ -78,7 +109,65 @@ def get_model_info(name: str) -> ModelInfo: ) register_model_info( - ["deepseek-coder-v2"], + [ + "mistral-large-2407", + ], + "Mistral", + "https://mistral.ai/news/mistral-large-2407/", + "Mistral Large 2", +) + +register_model_info( + [ + "gpt-4-turbo", + "gpt-4-turbo-2024-04-09", + "gpt-4-1106-preview", + "gpt-4-0125-preview", + "gpt2-chatbot", + "im-also-a-good-gpt2-chatbot", + "im-a-good-gpt2-chatbot", + ], + "GPT-4-Turbo", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "GPT-4-Turbo by OpenAI", +) + +register_model_info( + ["jamba-1.5-large", "jamba-1.5-mini"], + "Jamba 1.5", + "https://www.ai21.com/jamba", + "Jamba by AI21 Labs", +) + +register_model_info( + [ + "gemma-2-27b-it", + "gemma-2-9b-it", + "gemma-2-2b-it", + "eureka-chatbot", + "gemma-2-9b-it-simpo", + ], + "Gemma 2", + "https://ai.google.dev/gemma", + "Gemma 2 by Google", +) + +register_model_info( + [ + "claude-3-haiku-20240307", + "claude-3-sonnet-20240229", + "claude-3-opus-20240229", + "claude-2.1", + "claude-2.0", + "claude-1", + ], + "Claude", + "https://www.anthropic.com/news/claude-3-family", + "Claude by Anthropic", +) + +register_model_info( + ["deepseek-coder-v2", "deepseek-v2-api-0628", "deepseek-v2.5"], "DeepSeek Coder v2", "https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct", "An advanced code model by DeepSeek", @@ -98,42 +187,29 @@ def get_model_info(name: str) -> ModelInfo: "Open foundation and chat models by Meta", ) 
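# (Illustrative sketch, not part of the patch: the registry these hunks extend is assumed
#  to work roughly as below -- each listed full name maps to one shared ModelInfo record,
#  and get_model_info() falls back to a placeholder for unregistered names. Simplified:
#
#      def register_model_info(full_names, simple_name, link, description):
#          info = ModelInfo(simple_name, link, description)
#          for full_name in full_names:
#              model_info[full_name] = info
#
#  so the edits in this file only add, move, or rename display entries.)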
+register_model_info( + ["athene-70b", "athene-70b-0725"], + "Athene-70B", + "https://nexusflow.ai/blogs/athene", + "A large language model by NexusFlow", +) + register_model_info( [ + "qwen2.5-72b-instruct", "qwen2-72b-instruct", "qwen-max-0403", "qwen-max-0428", + "qwen-max-0919", + "qwen-plus-0828", + "qwen2-vl-7b-instruct", + "qwen-vl-max-0809", ], "Qwen Max", "https://help.aliyun.com/zh/dashscope/developer-reference/model-introduction", "The Frontier Qwen Model by Alibaba", ) -register_model_info( - [ - "gpt-4o-2024-05-13", - ], - "GPT-4o", - "https://openai.com/index/hello-gpt-4o/", - "The flagship model across audio, vision, and text by OpenAI", -) - -register_model_info( - [ - "gpt-4-turbo", - "gpt-4o-2024-05-13", - "gpt-4-turbo-2024-04-09", - "gpt-4-1106-preview", - "gpt-4-0125-preview", - "gpt2-chatbot", - "im-also-a-good-gpt2-chatbot", - "im-a-good-gpt2-chatbot", - ], - "GPT-4-Turbo", - "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", - "GPT-4-Turbo by OpenAI", -) - register_model_info( [ "gpt-3.5-turbo", @@ -148,7 +224,7 @@ def get_model_info(name: str) -> ModelInfo: ) register_model_info( - ["yi-large-preview", "yi-large"], + ["yi-lightning", "yi-lightning-lite", "yi-large-preview", "yi-large", "yi-vision"], "Yi-Large", "https://x.com/01AI_Yi/status/1789894091620458667", "State-of-the-art model by 01 AI", @@ -166,7 +242,12 @@ def get_model_info(name: str) -> ModelInfo: "phi-3-medium-4k-instruct", "phi-3-small-8k-instruct", "phi-3-mini-4k-instruct", + "phi-3-mini-4k-instruct-june-2024", "phi-3-mini-128k-instruct", + "phi-3-vision-128k-instruct", + "phi-3.5-vision-instruct", + "llava-onevision-qwen2-72b-ov", + "llava-onevision-qwen2-72b-ov-chat", ], "Phi-3", "https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/", @@ -174,7 +255,16 @@ def get_model_info(name: str) -> ModelInfo: ) register_model_info( - ["reka-core-20240501"], + [ + "minicpm-v-2_6", + ], + "MiniCPM-V 2.6", + "https://huggingface.co/openbmb/MiniCPM-V-2_6", + "MiniCPM-V 2.6 by OpenBMB", +) + +register_model_info( + ["reka-core-20240904", "reka-core-20240722", "reka-core-20240501"], "Reka Core", "https://www.reka.ai/news/reka-core-our-frontier-class-multimodal-language-model", "Frontier Multimodal Language Model by Reka", @@ -188,15 +278,21 @@ def get_model_info(name: str) -> ModelInfo: ) register_model_info( - ["reka-flash-preview-20240611", "reka-flash", "reka-flash-online"], + [ + "reka-flash-20240904", + "reka-flash-20240722", + "reka-flash-preview-20240611", + "reka-flash", + "reka-flash-online", + ], "Reka Flash", "https://www.reka.ai/news/reka-flash-efficient-and-capable-multimodal-language-models", "Multimodal model by Reka", ) register_model_info( - ["command-r-plus", "command-r-plus-04-2024"], - "Command R+", + ["command-r-plus", "command-r-plus-04-2024", "command-r-plus-08-2024"], + "Command-R-Plus", "https://txt.cohere.com/command-r-plus-microsoft-azure/", "Command R+ by Cohere", ) @@ -229,10 +325,12 @@ def get_model_info(name: str) -> ModelInfo: [ "mixtral-8x7b-instruct-v0.1", "mistral-large-2402", + "mistral-large-2407", "mistral-medium", "mistral-next", "mistral-7b-instruct-v0.2", "mistral-7b-instruct", + "pixtral-12b-2409", ], "Mixtral of experts", "https://mistral.ai/news/mixtral-of-experts/", @@ -265,9 +363,9 @@ def get_model_info(name: str) -> ModelInfo: ) register_model_info( - ["glm-4-0520", "glm-4-0116"], + ["glm-4-plus", "glm-4-0520", "glm-4-0116"], "GLM-4", - "https://zhipuai.cn/devday", + "https://bigmodel.cn/dev/howuse/model", 
"Next-Gen Foundation Model by Zhipu AI", ) @@ -719,12 +817,19 @@ def get_model_info(name: str) -> ModelInfo: ) register_model_info( - ["internlm-chat-7b", "internlm-chat-7b-8k"], + ["internlm-chat-7b", "internlm-chat-7b-8k", "internlm2_5-20b-chat"], "InternLM", "https://huggingface.co/internlm/internlm-chat-7b", "A multi-language large-scale language model (LLM), developed by SHLAB.", ) +register_model_info( + ["internvl2-26b", "internvl2-4b"], + "InternVL 2", + "https://internvl.github.io/blog/2024-07-02-InternVL-2.0/", + "Multimodal Model developed by OpenGVLab", +) + register_model_info( ["Qwen-7B-Chat"], "Qwen", @@ -868,6 +973,15 @@ def get_model_info(name: str) -> ModelInfo: "an open large language and vision assistant", ) +register_model_info( + [ + "cogvlm2-llama3-chat-19b", + ], + "CogVLM2", + "https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B", + "Open VLM by Tsinghua/Zhipu AI", +) + register_model_info( ["gemma-7b-it", "gemma-2b-it"], "Gemma", diff --git a/fastchat/serve/api_provider.py b/fastchat/serve/api_provider.py index 9b0ff129a4..dd3f897824 100644 --- a/fastchat/serve/api_provider.py +++ b/fastchat/serve/api_provider.py @@ -50,6 +50,18 @@ def get_api_provider_stream_iter( api_key=model_api_dict["api_key"], stream=False, ) + elif model_api_dict["api_type"] == "openai_o1": + prompt = conv.to_openai_api_messages() + stream_iter = openai_api_stream_iter( + model_api_dict["model_name"], + prompt, + temperature, + top_p, + max_new_tokens, + api_base=model_api_dict["api_base"], + api_key=model_api_dict["api_key"], + is_o1=True, + ) elif model_api_dict["api_type"] == "openai_assistant": last_prompt = conv.messages[-2][1] stream_iter = openai_assistant_api_stream_iter( @@ -118,7 +130,10 @@ def get_api_provider_stream_iter( api_key=model_api_dict["api_key"], ) elif model_api_dict["api_type"] == "mistral": - prompt = conv.to_openai_api_messages() + if model_api_dict.get("vision-arena", False): + prompt = conv.to_openai_vision_api_messages(is_mistral=True) + else: + prompt = conv.to_openai_api_messages() stream_iter = mistral_api_stream_iter( model_api_dict["model_name"], prompt, @@ -203,6 +218,20 @@ def get_api_provider_stream_iter( api_base=model_api_dict["api_base"], api_key=model_api_dict["api_key"], ) + elif model_api_dict["api_type"] == "column": + if model_api_dict.get("vision-arena", False): + prompt = conv.to_openai_vision_api_messages() + else: + prompt = conv.to_openai_api_messages() + stream_iter = column_api_stream_iter( + model_name=model_api_dict["model_name"], + messages=prompt, + temperature=temperature, + top_p=top_p, + max_new_tokens=max_new_tokens, + api_base=model_api_dict["api_base"], + api_key=model_api_dict["api_key"], + ) elif model_api_dict["api_type"] == "metagen": prompt = conv.to_metagen_api_messages() stream_iter = metagen_api_stream_iter( @@ -229,6 +258,7 @@ def openai_api_stream_iter( api_base=None, api_key=None, stream=True, + is_o1=False, ): import openai @@ -269,7 +299,7 @@ def openai_api_stream_iter( } logger.info(f"==== request ====\n{gen_params}") - if stream: + if stream and not is_o1: res = client.chat.completions.create( model=model_name, messages=messages, @@ -287,13 +317,21 @@ def openai_api_stream_iter( } yield data else: - res = client.chat.completions.create( - model=model_name, - messages=messages, - temperature=temperature, - max_tokens=max_new_tokens, - stream=False, - ) + if is_o1: + res = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=1.0, + stream=False, + ) + else: + res = 
client.chat.completions.create( + model=model_name, + messages=messages, + temperature=temperature, + max_tokens=max_new_tokens, + stream=False, + ) text = res.choices[0].message.content pos = 0 while pos < len(text): @@ -307,6 +345,70 @@ def openai_api_stream_iter( yield data +def column_api_stream_iter( + model_name, + messages, + temperature, + top_p, + max_new_tokens, + api_base=None, + api_key=None, +): + try: + messages_no_img = [] + for msg in messages: + msg_no_img = msg.copy() + msg_no_img.pop("attachment", None) + messages_no_img.append(msg_no_img) + + gen_params = { + "model": model_name, + "messages": messages_no_img, + "temperature": temperature, + "top_p": top_p, + "max_new_tokens": max_new_tokens, + "seed": 42, + } + logger.info(f"==== request ====\n{gen_params}") + + gen_params["messages"] = messages + gen_params["stream"] = True + + # payload.pop("model") + + # try 3 times + for i in range(3): + try: + response = requests.post( + api_base, json=gen_params, stream=True, timeout=30 + ) + break + except Exception as e: + logger.error(f"==== error ====\n{e}") + if i == 2: + yield { + "text": f"**API REQUEST ERROR** Reason: API timeout. please try again later.", + "error_code": 1, + } + return + + text = "" + for line in response.iter_lines(): + if line: + data = line.decode("utf-8") + if data.startswith("data:"): + data = json.loads(data[6:])["message"] + text += data + yield {"text": text, "error_code": 0} + + except Exception as e: + logger.error(f"==== error ====\n{e}") + yield { + "text": f"**API REQUEST ERROR** Reason: Unknown.", + "error_code": 1, + } + + def upload_openai_file_to_gcs(file_id): import openai from google.cloud import storage @@ -642,7 +744,7 @@ def gemini_api_stream_iter( pos = 0 while pos < len(text): # simulate token streaming - pos += 3 + pos += 5 time.sleep(0.001) data = { "text": text[:pos], @@ -717,7 +819,7 @@ def bard_api_stream_iter(model_name, conv, temperature, top_p, api_key=None): pos = 0 while pos < len(response): # simulate token streaming - pos += 1 + pos += 5 time.sleep(0.001) data = { "text": response[:pos], @@ -802,41 +904,55 @@ def ai2_api_stream_iter( def mistral_api_stream_iter( model_name, messages, temperature, top_p, max_new_tokens, api_key=None ): - from mistralai.client import MistralClient - from mistralai.models.chat_completion import ChatMessage + # from mistralai.client import MistralClient + # from mistralai.models.chat_completion import ChatMessage + from mistralai import Mistral if api_key is None: api_key = os.environ["MISTRAL_API_KEY"] - client = MistralClient(api_key=api_key, timeout=5) + client = Mistral(api_key=api_key) + + # Make requests for logging + text_messages = [] + for message in messages: + if type(message["content"]) == str: # text-only model + text_messages.append(message) + else: # vision model + filtered_content_list = [ + content for content in message["content"] if content["type"] == "text" + ] + text_messages.append( + {"role": message["role"], "content": filtered_content_list} + ) # Make requests gen_params = { "model": model_name, - "prompt": messages, + "prompt": text_messages, "temperature": temperature, "top_p": top_p, "max_new_tokens": max_new_tokens, } logger.info(f"==== request ====\n{gen_params}") - new_messages = [ - ChatMessage(role=message["role"], content=message["content"]) - for message in messages - ] + # new_messages = [ + # ChatMessage(role=message["role"], content=message["content"]) + # for message in messages + # ] - res = client.chat_stream( + res = client.chat.stream( 
model=model_name, temperature=temperature, - messages=new_messages, + messages=messages, max_tokens=max_new_tokens, top_p=top_p, ) text = "" for chunk in res: - if chunk.choices[0].delta.content is not None: - text += chunk.choices[0].delta.content + if chunk.data.choices[0].delta.content is not None: + text += chunk.data.choices[0].delta.content data = { "text": text, "error_code": 0, @@ -847,7 +963,9 @@ def mistral_api_stream_iter( def nvidia_api_stream_iter( model_name, messages, temp, top_p, max_tokens, api_base, api_key=None ): - model_2_api = {} + model_2_api = { + "nemotron-4-340b": "/b0fcd392-e905-4ab4-8eb9-aeae95c30b37", + } api_base += model_2_api[model_name] api_key = api_key or os.environ["NVIDIA_API_KEY"] @@ -1087,8 +1205,13 @@ def reka_api_stream_iter( api_key: Optional[str] = None, # default is env var CO_API_KEY api_base: Optional[str] = None, ): + from reka.client import Reka + from reka import TypedText + api_key = api_key or os.environ["REKA_API_KEY"] + client = Reka(api_key=api_key) + use_search_engine = False if "-online" in model_name: model_name = model_name.replace("-online", "") @@ -1105,37 +1228,30 @@ def reka_api_stream_iter( # Make requests for logging text_messages = [] - for message in messages: - text_messages.append({"type": message["type"], "text": message["text"]}) + for turn in messages: + for message in turn.content: + if isinstance(message, TypedText): + text_messages.append({"type": message.type, "text": message.text}) logged_request = dict(request) logged_request["conversation_history"] = text_messages logger.info(f"==== request ====\n{logged_request}") - response = requests.post( - api_base, - stream=True, - json=request, - headers={ - "X-Api-Key": api_key, - }, + response = client.chat.create_stream( + messages=messages, + max_tokens=max_new_tokens, + top_p=top_p, + model=model_name, ) - if response.status_code != 200: - error_message = response.text - logger.error(f"==== error from reka api: {error_message} ====") - yield { - "text": f"**API REQUEST ERROR** Reason: {error_message}", - "error_code": 1, - } - return - - for line in response.iter_lines(): - line = line.decode("utf8") - if not line.startswith("data: "): - continue - gen = json.loads(line[6:]) - yield {"text": gen["text"], "error_code": 0} + for chunk in response: + try: + yield {"text": chunk.responses[0].chunk.content, "error_code": 0} + except: + yield { + "text": f"**API REQUEST ERROR** ", + "error_code": 1, + } def metagen_api_stream_iter( @@ -1147,36 +1263,68 @@ def metagen_api_stream_iter( api_key, api_base, ): - res = requests.post( - f"{api_base}/chat_stream_completions?access_token={api_key}", - stream=True, - headers={"Content-Type": "application/json"}, - json={ + try: + text_messages = [] + for message in messages: + if type(message["content"]) == str: # text-only model + text_messages.append(message) + else: # vision model + filtered_content_list = [ + content + for content in message["content"] + if content["type"] == "text" + ] + text_messages.append( + {"role": message["role"], "content": filtered_content_list} + ) + gen_params = { "model": model_name, - "chunks_delimited": True, - "messages": messages, - "options": { - "max_tokens": max_new_tokens, - "generation_algorithm": "top_p", - "top_p": top_p, - "temperature": temperature, - }, - }, - timeout=40, - ) + "prompt": text_messages, + "temperature": temperature, + "top_p": top_p, + "max_new_tokens": max_new_tokens, + } + logger.info(f"==== request ====\n{gen_params}") - if res.status_code != 200: - 
logger.error(f"Unexpected response ({res.status_code}): {res.text}") - raise ValueError("Unexpected response: ", res.json()) + res = requests.post( + f"{api_base}/chat_stream_completions?access_token={api_key}", + stream=True, + headers={"Content-Type": "application/json"}, + json={ + "model": model_name, + "chunks_delimited": True, + "messages": messages, + "options": { + "max_tokens": max_new_tokens, + "generation_algorithm": "top_p", + "top_p": top_p, + "temperature": temperature, + }, + }, + timeout=30, + ) - text = "" - for line in res.iter_lines(): - if line: - part = json.loads(line.decode("utf-8")) - if "text" in part: - text += part["text"] - data = { - "text": text, - "error_code": 0, + if res.status_code != 200: + logger.error(f"Unexpected response ({res.status_code}): {res.text}") + yield { + "text": f"**API REQUEST ERROR** Reason: Unknown.", + "error_code": 1, } - yield data + + text = "" + for line in res.iter_lines(): + if line: + part = json.loads(line.decode("utf-8")) + if "text" in part: + text += part["text"] + data = { + "text": text, + "error_code": 0, + } + yield data + except Exception as e: + logger.error(f"==== error ====\n{e}") + yield { + "text": f"**API REQUEST ERROR** Reason: Unknown.", + "error_code": 1, + } diff --git a/fastchat/serve/call_monitor.py b/fastchat/serve/call_monitor.py index c38067576b..bc456f107b 100644 --- a/fastchat/serve/call_monitor.py +++ b/fastchat/serve/call_monitor.py @@ -19,14 +19,8 @@ def __init__(self, log_dir_list: list): self.log_dir_list = log_dir_list self.model_call = {} self.user_call = {} - self.model_call_limit_global = { - "gpt-4-1106-preview": 100, - "gpt-4-0125-preview": 100, - } - self.model_call_day_limit_per_user = { - "gpt-4-1106-preview": 5, - "gpt-4-0125-preview": 5, - } + self.model_call_limit_global = {} + self.model_call_day_limit_per_user = {} async def update_stats(self, num_file=1) -> None: while True: @@ -40,7 +34,11 @@ async def update_stats(self, num_file=1) -> None: user_call = {} for json_file in json_files: for line in open(json_file, "r", encoding="utf-8"): - obj = json.loads(line) + try: + obj = json.loads(line) + except json.JSONDecodeError: + print(f"Error decoding json: {json_file} {line}") + continue if obj["type"] != "chat": continue if obj["model"] not in model_call: diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index dc9b89a0c7..4433ce328d 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -5,6 +5,7 @@ import json import time +import re import gradio as gr import numpy as np @@ -179,6 +180,8 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re # target model sampling weights will be boosted. 
BATTLE_TARGETS = {} +BATTLE_STRICT_TARGETS = {} + ANON_MODELS = [] SAMPLING_BOOST_MODELS = [] @@ -196,6 +199,16 @@ def get_sample_weight(model, outage_models, sampling_weights, sampling_boost_mod return weight +def is_model_match_pattern(model, patterns): + flag = False + for pattern in patterns: + pattern = pattern.replace("*", ".*") + if re.match(pattern, model) is not None: + flag = True + break + return flag + + def get_battle_pair( models, battle_targets, outage_models, sampling_weights, sampling_boost_models ): @@ -210,6 +223,8 @@ def get_battle_pair( model_weights.append(weight) total_weight = np.sum(model_weights) model_weights = model_weights / total_weight + # print(models) + # print(model_weights) chosen_idx = np.random.choice(len(models), p=model_weights) chosen_model = models[chosen_idx] # for p, w in zip(models, model_weights): @@ -222,6 +237,12 @@ def get_battle_pair( continue if model in ANON_MODELS and chosen_model in ANON_MODELS: continue + if chosen_model in BATTLE_STRICT_TARGETS: + if not is_model_match_pattern(model, BATTLE_STRICT_TARGETS[chosen_model]): + continue + if model in BATTLE_STRICT_TARGETS: + if not is_model_match_pattern(chosen_model, BATTLE_STRICT_TARGETS[model]): + continue weight = get_sample_weight(model, outage_models, sampling_weights) if ( weight != 0 @@ -383,12 +404,17 @@ def bot_response_multi( token_per_yield = 30 elif states[i].model_name in [ "qwen-max-0428", + "qwen-vl-max-0809", "qwen1.5-110b-chat", "llava-v1.6-34b", ]: token_per_yield = 7 elif states[i].model_name in [ + "qwen2.5-72b-instruct", "qwen2-72b-instruct", + "qwen-plus-0828", + "qwen-max-0919", + "llama-3.1-405b-instruct-bf16", ]: token_per_yield = 4 model_tpy.append(token_per_yield) @@ -414,21 +440,21 @@ def bot_response_multi( def build_side_by_side_ui_anony(models): notice_markdown = f""" -# ⚔️ LMSYS Chatbot Arena: Benchmarking LLMs in the Wild -[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) +# ⚔️ Chatbot Arena (formerly LMSYS): Free AI Chat to Compare & Test Best AI Chatbots +[Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) {SURVEY_LINK} ## 📣 News - Chatbot Arena now supports images in beta. Check it out [here](https://lmarena.ai/?vision). -## 📜 Rules -- Ask any question to two anonymous models (e.g., ChatGPT, Gemini, Claude, Llama) and vote for the better one! -- You can chat for multiple turns until you identify a winner. -- Votes won't be counted if model identities are revealed during the conversation. +## 📜 How It Works +- **Blind Test**: Ask any question to two anonymous AI chatbots (ChatGPT, Gemini, Claude, Llama, and more). +- **Vote for the Best**: Choose the best response. You can keep chatting until you find a winner. +- **Play Fair**: If AI identity reveals, your vote won't count. 
-## 🏆 Chatbot Arena [Leaderboard](https://lmarena.ai/?leaderboard) -- We've collected **1,000,000+** human votes to compute an LLM leaderboard for 100+ models. Find out who is the 🥇LLM Champion [here](https://lmarena.ai/?leaderboard)! +## 🏆 Chatbot Arena LLM [Leaderboard](https://lmarena.ai/leaderboard) +- Backed by over **1,000,000+** community votes, our platform ranks the best LLM and AI chatbots. Explore the top AI models on our LLM [leaderboard](https://lmarena.ai/leaderboard)! ## 👇 Chat now! """ @@ -510,7 +536,7 @@ def build_side_by_side_ui_anony(models): max_output_tokens = gr.Slider( minimum=16, maximum=2048, - value=1600, + value=2000, step=64, interactive=True, label="Max output tokens", diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index 7ee19b0413..09a0769cf5 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -268,11 +268,16 @@ def bot_response_multi( token_per_yield = 30 elif states[i].model_name in [ "qwen-max-0428", + "qwen-vl-max-0809", "qwen1.5-110b-chat", ]: token_per_yield = 7 elif states[i].model_name in [ + "qwen2.5-72b-instruct", "qwen2-72b-instruct", + "qwen-plus-0828", + "qwen-max-0919", + "llama-3.1-405b-instruct-bf16", ]: token_per_yield = 4 model_tpy.append(token_per_yield) @@ -308,12 +313,12 @@ def flash_buttons(): def build_side_by_side_ui_named(models): notice_markdown = f""" -# ⚔️ LMSYS Chatbot Arena: Benchmarking LLMs in the Wild -[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) +# ⚔️ Chatbot Arena (formerly LMSYS): Free AI Chat to Compare & Test Best AI Chatbots +[Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) {SURVEY_LINK} -## 📜 Rules +## 📜 How It Works - Ask any question to two chosen models (e.g., ChatGPT, Gemini, Claude, Llama) and vote for the better one! - You can chat for multiple turns until you identify a winner. diff --git a/fastchat/serve/gradio_block_arena_vision.py b/fastchat/serve/gradio_block_arena_vision.py index 25ff78c087..07f2d3a5b6 100644 --- a/fastchat/serve/gradio_block_arena_vision.py +++ b/fastchat/serve/gradio_block_arena_vision.py @@ -10,6 +10,7 @@ import json import os import time +from typing import List, Union import gradio as gr from gradio.data_classes import FileData @@ -27,6 +28,7 @@ from fastchat.model.model_adapter import ( get_conversation_template, ) +from fastchat.serve.gradio_global_state import Context from fastchat.serve.gradio_web_server import ( get_model_description_md, acknowledgment_md, @@ -145,14 +147,18 @@ def clear_history(request: gr.Request): ip = get_ip(request) logger.info(f"clear_history. 
ip: {ip}") state = None - return (state, [], None) + (disable_btn,) * 5 + return (state, [], enable_multimodal, invisible_text, invisible_btn) + ( + disable_btn, + ) * 5 def clear_history_example(request: gr.Request): ip = get_ip(request) logger.info(f"clear_history_example. ip: {ip}") state = None - return (state, [], enable_multimodal) + (disable_btn,) * 5 + return (state, [], enable_multimodal, invisible_text, invisible_btn) + ( + disable_btn, + ) * 5 # TODO(Chris): At some point, we would like this to be a live-reporting feature. @@ -210,17 +216,40 @@ def moderate_input(state, text, all_conv_text, model_list, images, ip): return text, image_flagged, csam_flagged -def add_text(state, model_selector, chat_input, request: gr.Request): - text, images = chat_input["text"], chat_input["files"] +def add_text( + state, + model_selector, + chat_input: Union[str, dict], + context: Context, + request: gr.Request, +): + if isinstance(chat_input, dict): + text, images = chat_input["text"], chat_input["files"] + else: + text, images = chat_input, [] + + if ( + len(images) > 0 + and model_selector in context.text_models + and model_selector not in context.vision_models + ): + gr.Warning(f"{model_selector} is a text-only model. Image is ignored.") + images = [] + ip = get_ip(request) logger.info(f"add_text. ip: {ip}. len: {len(text)}") if state is None: - state = State(model_selector, is_vision=True) + if len(images) == 0: + state = State(model_selector, is_vision=False) + else: + state = State(model_selector, is_vision=True) if len(text) <= 0: state.skip_next = True - return (state, state.to_gradio_chatbot(), None) + (no_change_btn,) * 5 + return (state, state.to_gradio_chatbot(), None, "", no_change_btn) + ( + no_change_btn, + ) * 5 all_conv_text = state.conv.get_prompt() all_conv_text = all_conv_text[-2000:] + "\nuser: " + text @@ -234,30 +263,44 @@ def add_text(state, model_selector, chat_input, request: gr.Request): if image_flagged: logger.info(f"image flagged. ip: {ip}. text: {text}") state.skip_next = True - return (state, state.to_gradio_chatbot(), {"text": IMAGE_MODERATION_MSG}) + ( + return ( + state, + state.to_gradio_chatbot(), + {"text": IMAGE_MODERATION_MSG}, + "", no_change_btn, - ) * 5 + ) + (no_change_btn,) * 5 if (len(state.conv.messages) - state.conv.offset) // 2 >= CONVERSATION_TURN_LIMIT: logger.info(f"conversation turn limit. ip: {ip}. 
text: {text}") state.skip_next = True - return (state, state.to_gradio_chatbot(), {"text": CONVERSATION_LIMIT_MSG}) + ( + return ( + state, + state.to_gradio_chatbot(), + {"text": CONVERSATION_LIMIT_MSG}, + "", no_change_btn, - ) * 5 + ) + (no_change_btn,) * 5 text = text[:INPUT_CHAR_LEN_LIMIT] # Hard cut-off text = _prepare_text_with_image(state, text, images, csam_flag=csam_flag) state.conv.append_message(state.conv.roles[0], text) state.conv.append_message(state.conv.roles[1], None) - return (state, state.to_gradio_chatbot(), None) + (disable_btn,) * 5 + return ( + state, + state.to_gradio_chatbot(), + disable_multimodal, + visible_text, + enable_btn, + ) + (disable_btn,) * 5 def build_single_vision_language_model_ui( - models, add_promotion_links=False, random_questions=None + context: Context, add_promotion_links=False, random_questions=None ): promotion = ( f""" -- [GitHub](https://github.com/lm-sys/FastChat) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) +[Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) {SURVEY_LINK} @@ -269,39 +312,38 @@ def build_single_vision_language_model_ui( ) notice_markdown = f""" -# 🏔️ Chat with Large Vision-Language Models +# 🏔️ Chatbot Arena (formerly LMSYS): Free AI Chat to Compare & Test Best AI Chatbots {promotion} """ state = gr.State() gr.Markdown(notice_markdown, elem_id="notice_markdown") + vision_not_in_text_models = [ + model for model in context.vision_models if model not in context.text_models + ] + text_and_vision_models = context.text_models + vision_not_in_text_models + context_state = gr.State(context) with gr.Group(): with gr.Row(elem_id="model_selector_row"): model_selector = gr.Dropdown( - choices=models, - value=models[0] if len(models) > 0 else "", + choices=text_and_vision_models, + value=text_and_vision_models[0] + if len(text_and_vision_models) > 0 + else "", interactive=True, show_label=False, container=False, ) with gr.Accordion( - f"🔍 Expand to see the descriptions of {len(models)} models", open=False + f"🔍 Expand to see the descriptions of {len(text_and_vision_models)} models", + open=False, ): - model_description_md = get_model_description_md(models) + model_description_md = get_model_description_md(text_and_vision_models) gr.Markdown(model_description_md, elem_id="model_description_markdown") with gr.Row(): - textbox = gr.MultimodalTextbox( - file_types=["image"], - show_label=False, - placeholder="Enter your prompt or add image here", - container=True, - render=False, - elem_id="input_box", - ) - with gr.Column(scale=2, visible=False) as image_column: imagebox = gr.Image( type="pil", @@ -310,13 +352,31 @@ def build_single_vision_language_model_ui( ) with gr.Column(scale=8): chatbot = gr.Chatbot( - elem_id="chatbot", label="Scroll down and start chatting", height=650 + elem_id="chatbot", + label="Scroll down and start chatting", + height=650, + show_copy_button=True, ) with gr.Row(): - textbox.render() - # with gr.Column(scale=1, min_width=50): - # send_btn = gr.Button(value="Send", variant="primary") + textbox = gr.Textbox( + show_label=False, + placeholder="👉 Enter your 
prompt and press ENTER", + elem_id="input_box", + visible=False, + ) + + send_btn = gr.Button( + value="Send", variant="primary", scale=0, visible=False, interactive=False + ) + + multimodal_textbox = gr.MultimodalTextbox( + file_types=["image"], + show_label=False, + placeholder="Enter your prompt or add image here", + container=True, + elem_id="input_box", + ) with gr.Row(elem_id="buttons"): if random_questions: @@ -330,27 +390,11 @@ def build_single_vision_language_model_ui( regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) clear_btn = gr.Button(value="🗑️ Clear", interactive=False) - cur_dir = os.path.dirname(os.path.abspath(__file__)) - - examples = gr.Examples( - examples=[ - { - "text": "How can I prepare a delicious meal using these ingredients?", - "files": [f"{cur_dir}/example_images/fridge.jpg"], - }, - { - "text": "What might the woman on the right be thinking about?", - "files": [f"{cur_dir}/example_images/distracted.jpg"], - }, - ], - inputs=[textbox], - ) - with gr.Accordion("Parameters", open=False) as parameter_row: temperature = gr.Slider( minimum=0.0, maximum=1.0, - value=0.2, + value=0.7, step=0.1, interactive=True, label="Temperature", @@ -397,23 +441,50 @@ def build_single_vision_language_model_ui( [state, temperature, top_p, max_output_tokens], [state, chatbot] + btn_list, ) - clear_btn.click(clear_history, None, [state, chatbot, textbox] + btn_list) + clear_btn.click( + clear_history, + None, + [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list, + ) model_selector.change( - clear_history, None, [state, chatbot, textbox] + btn_list - ).then(set_visible_image, [textbox], [image_column]) - examples.dataset.click( - clear_history_example, None, [state, chatbot, textbox] + btn_list + clear_history, + None, + [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list, + ).then(set_visible_image, [multimodal_textbox], [image_column]) + + multimodal_textbox.input(add_image, [multimodal_textbox], [imagebox]).then( + set_visible_image, [multimodal_textbox], [image_column] + ).then( + clear_history_example, + None, + [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list, ) - textbox.input(add_image, [textbox], [imagebox]).then( - set_visible_image, [textbox], [image_column] - ).then(clear_history_example, None, [state, chatbot, textbox] + btn_list) + multimodal_textbox.submit( + add_text, + [state, model_selector, multimodal_textbox, context_state], + [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list, + ).then(set_invisible_image, [], [image_column]).then( + bot_response, + [state, temperature, top_p, max_output_tokens], + [state, chatbot] + btn_list, + ) textbox.submit( add_text, - [state, model_selector, textbox], - [state, chatbot, textbox] + btn_list, + [state, model_selector, textbox, context_state], + [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list, + ).then(set_invisible_image, [], [image_column]).then( + bot_response, + [state, temperature, top_p, max_output_tokens], + [state, chatbot] + btn_list, + ) + + send_btn.click( + add_text, + [state, model_selector, textbox, context_state], + [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list, ).then(set_invisible_image, [], [image_column]).then( bot_response, [state, temperature, top_p, max_output_tokens], @@ -424,9 +495,11 @@ def build_single_vision_language_model_ui( random_btn.click( get_vqa_sample, # First, get the VQA sample [], # Pass the path to the VQA samples - [textbox, imagebox], # Outputs are textbox and 
imagebox - ).then(set_visible_image, [textbox], [image_column]).then( - clear_history_example, None, [state, chatbot, textbox] + btn_list + [multimodal_textbox, imagebox], # Outputs are textbox and imagebox + ).then(set_visible_image, [multimodal_textbox], [image_column]).then( + clear_history_example, + None, + [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list, ) return [state, model_selector] diff --git a/fastchat/serve/gradio_block_arena_vision_anony.py b/fastchat/serve/gradio_block_arena_vision_anony.py index 76bc47329f..2dade176ce 100644 --- a/fastchat/serve/gradio_block_arena_vision_anony.py +++ b/fastchat/serve/gradio_block_arena_vision_anony.py @@ -8,6 +8,7 @@ import gradio as gr import numpy as np +from typing import Union from fastchat.constants import ( TEXT_MODERATION_MSG, @@ -45,7 +46,6 @@ regenerate, clear_history, share_click, - add_text, bot_response_multi, set_global_vars_anony, load_demo_side_by_side_anony, @@ -68,6 +68,7 @@ visible_text, disable_multimodal, ) +from fastchat.serve.gradio_global_state import Context from fastchat.serve.remote_logger import get_remote_logger from fastchat.utils import ( build_logger, @@ -84,19 +85,7 @@ vl_models = [] # TODO(chris): fix sampling weights -VISION_SAMPLING_WEIGHTS = { - "gpt-4o-2024-05-13": 4, - "gpt-4-turbo-2024-04-09": 4, - "claude-3-haiku-20240307": 4, - "claude-3-sonnet-20240229": 4, - "claude-3-5-sonnet-20240620": 4, - "claude-3-opus-20240229": 4, - "gemini-1.5-flash-api-0514": 4, - "gemini-1.5-pro-api-0514": 4, - "llava-v1.6-34b": 4, - "reka-core-20240501": 4, - "reka-flash-preview-20240611": 4, -} +VISION_SAMPLING_WEIGHTS = {} # TODO(chris): Find battle targets that make sense VISION_BATTLE_TARGETS = {} @@ -115,16 +104,12 @@ def get_vqa_sample(): return (res, path) -def load_demo_side_by_side_vision_anony(all_text_models, all_vl_models, url_params): - global text_models, vl_models - text_models = all_text_models - vl_models = all_vl_models - - states = (None,) * num_sides - selector_updates = ( +def load_demo_side_by_side_vision_anony(): + states = [None] * num_sides + selector_updates = [ gr.Markdown(visible=True), gr.Markdown(visible=True), - ) + ] return states + selector_updates @@ -135,7 +120,7 @@ def clear_history_example(request: gr.Request): [None] * num_sides + [None] * num_sides + anony_names - + [enable_multimodal, invisible_text] + + [enable_multimodal, invisible_text, invisible_btn] + [invisible_btn] * 4 + [disable_btn] * 2 + [enable_btn] @@ -159,18 +144,28 @@ def vote_last_response(states, vote_type, model_selectors, request: gr.Request): gr.Info( "🎉 Thanks for voting! Your vote shapes the leaderboard, please vote RESPONSIBLY." 
) + + model_name_1 = states[0].model_name + model_name_2 = states[1].model_name + model_name_map = {} + + if model_name_1 in model_name_map: + model_name_1 = model_name_map[model_name_1] + if model_name_2 in model_name_map: + model_name_2 = model_name_map[model_name_2] + if ":" not in model_selectors[0]: for i in range(5): names = ( - "### Model A: " + states[0].model_name, - "### Model B: " + states[1].model_name, + "### Model A: " + model_name_1, + "### Model B: " + model_name_2, ) yield names + (disable_text,) + (disable_btn,) * 4 time.sleep(0.1) else: names = ( - "### Model A: " + states[0].model_name, - "### Model B: " + states[1].model_name, + "### Model A: " + model_name_1, + "### Model B: " + model_name_2, ) yield names + (disable_text,) + (disable_btn,) * 4 @@ -240,7 +235,7 @@ def clear_history(request: gr.Request): [None] * num_sides + [None] * num_sides + anony_names - + [enable_multimodal, invisible_text] + + [enable_multimodal, invisible_text, invisible_btn] + [invisible_btn] * 4 + [disable_btn] * 2 + [enable_btn] @@ -249,7 +244,13 @@ def clear_history(request: gr.Request): def add_text( - state0, state1, model_selector0, model_selector1, chat_input, request: gr.Request + state0, + state1, + model_selector0, + model_selector1, + chat_input: Union[str, dict], + context: Context, + request: gr.Request, ): if isinstance(chat_input, dict): text, images = chat_input["text"], chat_input["files"] @@ -268,7 +269,7 @@ def add_text( if len(images) > 0: model_left, model_right = get_battle_pair( - vl_models, + context.all_vision_models, VISION_BATTLE_TARGETS, VISION_OUTAGE_MODELS, VISION_SAMPLING_WEIGHTS, @@ -280,7 +281,7 @@ def add_text( ] else: model_left, model_right = get_battle_pair( - text_models, + context.all_text_models, BATTLE_TARGETS, OUTAGE_MODELS, SAMPLING_WEIGHTS, @@ -298,7 +299,7 @@ def add_text( return ( states + [x.to_gradio_chatbot() for x in states] - + [None, ""] + + [None, "", no_change_btn] + [ no_change_btn, ] @@ -322,7 +323,7 @@ def add_text( return ( states + [x.to_gradio_chatbot() for x in states] - + [{"text": CONVERSATION_LIMIT_MSG}, ""] + + [{"text": CONVERSATION_LIMIT_MSG}, "", no_change_btn] + [ no_change_btn, ] @@ -343,6 +344,7 @@ def add_text( + " PLEASE CLICK 🎲 NEW ROUND TO START A NEW CONVERSATION." 
}, "", + no_change_btn, ] + [no_change_btn] * 7 + [""] @@ -364,7 +366,7 @@ def add_text( return ( states + [x.to_gradio_chatbot() for x in states] - + [disable_multimodal, visible_text] + + [disable_multimodal, visible_text, enable_btn] + [ disable_btn, ] @@ -373,21 +375,22 @@ def add_text( ) -def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions=None): +def build_side_by_side_vision_ui_anony(context: Context, random_questions=None): notice_markdown = f""" -# ⚔️ LMSYS Chatbot Arena (Multimodal): Benchmarking LLMs and VLMs in the Wild -[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) +# ⚔️ Chatbot Arena (formerly LMSYS): Free AI Chat to Compare & Test Best AI Chatbots +[Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) {SURVEY_LINK} -## 📜 Rules -- Ask any question to two anonymous models (e.g., ChatGPT, Gemini, Claude, Llama) and vote for the better one! -- You can continue chatting until you identify a winner. -- Vote won't be counted if model identity is revealed during conversation. -- **NEW** Image Support: Upload an image on your first turn to unlock the multimodal arena! Images should be less than 15MB. +## 📜 How It Works +- **Blind Test**: Ask any question to two anonymous AI chatbots (ChatGPT, Gemini, Claude, Llama, and more). +- **Vote for the Best**: Choose the best response. You can keep chatting until you find a winner. +- **Play Fair**: If AI identity reveals, your vote won't count. -## 🏆 Chatbot Arena [Leaderboard](https://lmarena.ai/?leaderboard) -- We've collected **1,000,000+** human votes to compute an LLM Elo leaderboard for 100+ models. Find out who is the 🥇LLM Champion [here](https://lmarena.ai/?leaderboard)! +**NEW** Image Support: Upload an image to unlock the multimodal arena! + +## 🏆 Chatbot Arena LLM [Leaderboard](https://lmarena.ai/leaderboard) +- Backed by over **1,000,000+** community votes, our platform ranks the best LLM and AI chatbots. Explore the top AI models on our LLM [leaderboard](https://lmarena.ai/leaderboard)! ## 👇 Chat now! 
""" @@ -395,8 +398,9 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions= states = [gr.State() for _ in range(num_sides)] model_selectors = [None] * num_sides chatbots = [None] * num_sides - + context_state = gr.State(context) gr.Markdown(notice_markdown, elem_id="notice_markdown") + text_and_vision_models = context.models with gr.Row(): with gr.Column(scale=2, visible=False) as image_column: @@ -409,11 +413,11 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions= with gr.Column(scale=5): with gr.Group(elem_id="share-region-anony"): with gr.Accordion( - f"🔍 Expand to see the descriptions of {len(text_models) + len(vl_models)} models", + f"🔍 Expand to see the descriptions of {len(text_and_vision_models)} models", open=False, ): model_description_md = get_model_description_md( - text_models + vl_models + text_and_vision_models ) gr.Markdown( model_description_md, elem_id="model_description_markdown" @@ -457,6 +461,7 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions= placeholder="👉 Enter your prompt and press ENTER", elem_id="input_box", visible=False, + scale=3, ) multimodal_textbox = gr.MultimodalTextbox( @@ -465,8 +470,11 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions= container=True, placeholder="Enter your prompt or add image here", elem_id="input_box", + scale=3, + ) + send_btn = gr.Button( + value="Send", variant="primary", scale=1, visible=False, interactive=False ) - # send_btn = gr.Button(value="Send", variant="primary", scale=0) with gr.Row() as button_row: if random_questions: @@ -498,7 +506,7 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions= max_output_tokens = gr.Slider( minimum=16, maximum=2048, - value=1800, + value=2000, step=64, interactive=True, label="Max output tokens", @@ -550,7 +558,7 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions= states + chatbots + model_selectors - + [multimodal_textbox, textbox] + + [multimodal_textbox, textbox, send_btn] + btn_list + [random_btn] + [slow_warning], @@ -583,15 +591,19 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions= ).then( clear_history_example, None, - states + chatbots + model_selectors + [multimodal_textbox, textbox] + btn_list, + states + + chatbots + + model_selectors + + [multimodal_textbox, textbox, send_btn] + + btn_list, ) multimodal_textbox.submit( add_text, - states + model_selectors + [multimodal_textbox], + states + model_selectors + [multimodal_textbox, context_state], states + chatbots - + [multimodal_textbox, textbox] + + [multimodal_textbox, textbox, send_btn] + btn_list + [random_btn] + [slow_warning], @@ -607,10 +619,29 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions= textbox.submit( add_text, - states + model_selectors + [textbox], + states + model_selectors + [textbox, context_state], + states + + chatbots + + [multimodal_textbox, textbox, send_btn] + + btn_list + + [random_btn] + + [slow_warning], + ).then( + bot_response_multi, + states + [temperature, top_p, max_output_tokens], + states + chatbots + btn_list, + ).then( + flash_buttons, + [], + btn_list, + ) + + send_btn.click( + add_text, + states + model_selectors + [textbox, context_state], states + chatbots - + [multimodal_textbox, textbox] + + [multimodal_textbox, textbox, send_btn] + btn_list + [random_btn] + [slow_warning], @@ -635,7 +666,7 @@ def 
build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions= states + chatbots + model_selectors - + [multimodal_textbox, textbox] + + [multimodal_textbox, textbox, send_btn] + btn_list + [random_btn], ) diff --git a/fastchat/serve/gradio_block_arena_vision_named.py b/fastchat/serve/gradio_block_arena_vision_named.py index ecca169ca2..3048ac935c 100644 --- a/fastchat/serve/gradio_block_arena_vision_named.py +++ b/fastchat/serve/gradio_block_arena_vision_named.py @@ -6,6 +6,7 @@ import json import os import time +from typing import List, Union import gradio as gr import numpy as np @@ -35,7 +36,12 @@ _prepare_text_with_image, convert_images_to_conversation_format, enable_multimodal, + disable_multimodal, + invisible_text, + invisible_btn, + visible_text, ) +from fastchat.serve.gradio_global_state import Context from fastchat.serve.gradio_web_server import ( State, bot_response, @@ -63,12 +69,35 @@ enable_moderation = False +def load_demo_side_by_side_vision_named(context: Context): + states = [None] * num_sides + + # default to the text models + models = context.text_models + + model_left = models[0] if len(models) > 0 else "" + if len(models) > 1: + weights = ([1] * 128)[: len(models) - 1] + weights = weights / np.sum(weights) + model_right = np.random.choice(models[1:], p=weights) + else: + model_right = model_left + + all_models = context.models + selector_updates = [ + gr.Dropdown(choices=all_models, value=model_left, visible=True), + gr.Dropdown(choices=all_models, value=model_right, visible=True), + ] + + return states + selector_updates + + def clear_history_example(request: gr.Request): logger.info(f"clear_history_example (named). ip: {get_ip(request)}") return ( [None] * num_sides + [None] * num_sides - + [enable_multimodal] + + [enable_multimodal, invisible_text, invisible_btn] + [invisible_btn] * 4 + [disable_btn] * 2 ) @@ -152,16 +181,40 @@ def clear_history(request: gr.Request): return ( [None] * num_sides + [None] * num_sides - + [enable_multimodal] + + [enable_multimodal, invisible_text, invisible_btn] + [invisible_btn] * 4 + [disable_btn] * 2 ) def add_text( - state0, state1, model_selector0, model_selector1, chat_input, request: gr.Request + state0, + state1, + model_selector0, + model_selector1, + chat_input: Union[str, dict], + context: Context, + request: gr.Request, ): - text, images = chat_input["text"], chat_input["files"] + if isinstance(chat_input, dict): + text, images = chat_input["text"], chat_input["files"] + else: + text, images = chat_input, [] + + if len(images) > 0: + if ( + model_selector0 in context.text_models + and model_selector0 not in context.vision_models + ): + gr.Warning(f"{model_selector0} is a text-only model. Image is ignored.") + images = [] + if ( + model_selector1 in context.text_models + and model_selector1 not in context.vision_models + ): + gr.Warning(f"{model_selector1} is a text-only model. Image is ignored.") + images = [] + ip = get_ip(request) logger.info(f"add_text (named). ip: {ip}. 
len: {len(text)}") states = [state0, state1] @@ -169,7 +222,9 @@ def add_text( # Init states if necessary for i in range(num_sides): - if states[i] is None: + if states[i] is None and len(images) == 0: + states[i] = State(model_selectors[i], is_vision=False) + elif states[i] is None and len(images) > 0: states[i] = State(model_selectors[i], is_vision=True) if len(text) <= 0: @@ -178,7 +233,7 @@ def add_text( return ( states + [x.to_gradio_chatbot() for x in states] - + [None] + + [None, "", no_change_btn] + [ no_change_btn, ] @@ -206,7 +261,7 @@ def add_text( return ( states + [x.to_gradio_chatbot() for x in states] - + [{"text": CONVERSATION_LIMIT_MSG}] + + [{"text": CONVERSATION_LIMIT_MSG}, "", no_change_btn] + [ no_change_btn, ] @@ -220,7 +275,7 @@ def add_text( return ( states + [x.to_gradio_chatbot() for x in states] - + [{"text": IMAGE_MODERATION_MSG}] + + [{"text": IMAGE_MODERATION_MSG}, "", no_change_btn] + [ no_change_btn, ] @@ -239,7 +294,7 @@ def add_text( return ( states + [x.to_gradio_chatbot() for x in states] - + [None] + + [disable_multimodal, visible_text, enable_btn] + [ disable_btn, ] @@ -247,18 +302,18 @@ def add_text( ) -def build_side_by_side_vision_ui_named(models, random_questions=None): +def build_side_by_side_vision_ui_named(context: Context, random_questions=None): notice_markdown = f""" -# ⚔️ LMSYS Chatbot Arena (Multimodal): Benchmarking LLMs and VLMs in the Wild -[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) +# ⚔️ Chatbot Arena (formerly LMSYS): Free AI Chat to Compare & Test Best AI Chatbots +[Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) {SURVEY_LINK} -## 📜 Rules -- Chat with any two models side-by-side and vote! -- You can continue chatting for multiple rounds. -- Click "Clear history" to start a new round. -- You can only chat with one image per conversation. You can upload images less than 15MB. Click the "Random Example" button to chat with a random image. +## 📜 How It Works +- Ask any question to two chosen models (e.g., ChatGPT, Gemini, Claude, Llama) and vote for the better one! +- You can chat for multiple turns until you identify a winner. + +Note: You can only chat with one image per conversation. You can upload images less than 15MB. Click the "Random Example" button to chat with a random image. **❗️ For research purposes, we log user prompts and images, and may release this data to the public in the future. 
Please do not upload any confidential or personal information.** @@ -271,6 +326,9 @@ def build_side_by_side_vision_ui_named(models, random_questions=None): notice = gr.Markdown(notice_markdown, elem_id="notice_markdown") + text_and_vision_models = context.models + context_state = gr.State(context) + with gr.Row(): with gr.Column(scale=2, visible=False) as image_column: imagebox = gr.Image( @@ -282,10 +340,12 @@ def build_side_by_side_vision_ui_named(models, random_questions=None): with gr.Column(scale=5): with gr.Group(elem_id="share-region-anony"): with gr.Accordion( - f"🔍 Expand to see the descriptions of {len(models)} models", + f"🔍 Expand to see the descriptions of {len(text_and_vision_models)} models", open=False, ): - model_description_md = get_model_description_md(models) + model_description_md = get_model_description_md( + text_and_vision_models + ) gr.Markdown( model_description_md, elem_id="model_description_markdown" ) @@ -294,8 +354,10 @@ def build_side_by_side_vision_ui_named(models, random_questions=None): for i in range(num_sides): with gr.Column(): model_selectors[i] = gr.Dropdown( - choices=models, - value=models[i] if len(models) > i else "", + choices=text_and_vision_models, + value=text_and_vision_models[i] + if len(text_and_vision_models) > i + else "", interactive=True, show_label=False, container=False, @@ -325,7 +387,18 @@ def build_side_by_side_vision_ui_named(models, random_questions=None): ) with gr.Row(): - textbox = gr.MultimodalTextbox( + textbox = gr.Textbox( + show_label=False, + placeholder="👉 Enter your prompt and press ENTER", + elem_id="input_box", + visible=False, + ) + + send_btn = gr.Button( + value="Send", variant="primary", scale=0, visible=False, interactive=False + ) + + multimodal_textbox = gr.MultimodalTextbox( file_types=["image"], show_label=False, placeholder="Enter your prompt or add image here", @@ -409,7 +482,11 @@ def build_side_by_side_vision_ui_named(models, random_questions=None): ).then( flash_buttons, [], btn_list ) - clear_btn.click(clear_history, None, states + chatbots + [textbox] + btn_list) + clear_btn.click( + clear_history, + None, + states + chatbots + [multimodal_textbox, textbox, send_btn] + btn_list, + ) share_js = """ function (a, b, c, d) { @@ -435,17 +512,47 @@ def build_side_by_side_vision_ui_named(models, random_questions=None): for i in range(num_sides): model_selectors[i].change( - clear_history, None, states + chatbots + [textbox] + btn_list - ).then(set_visible_image, [textbox], [image_column]) + clear_history, + None, + states + chatbots + [multimodal_textbox, textbox, send_btn] + btn_list, + ).then(set_visible_image, [multimodal_textbox], [image_column]) - textbox.input(add_image, [textbox], [imagebox]).then( - set_visible_image, [textbox], [image_column] - ).then(clear_history_example, None, states + chatbots + [textbox] + btn_list) + multimodal_textbox.input(add_image, [multimodal_textbox], [imagebox]).then( + set_visible_image, [multimodal_textbox], [image_column] + ).then( + clear_history_example, + None, + states + chatbots + [multimodal_textbox, textbox, send_btn] + btn_list, + ) + + multimodal_textbox.submit( + add_text, + states + model_selectors + [multimodal_textbox, context_state], + states + chatbots + [multimodal_textbox, textbox, send_btn] + btn_list, + ).then(set_invisible_image, [], [image_column]).then( + bot_response_multi, + states + [temperature, top_p, max_output_tokens], + states + chatbots + btn_list, + ).then( + flash_buttons, [], btn_list + ) textbox.submit( add_text, - states + 
model_selectors + [textbox], - states + chatbots + [textbox] + btn_list, + states + model_selectors + [textbox, context_state], + states + chatbots + [multimodal_textbox, textbox, send_btn] + btn_list, + ).then(set_invisible_image, [], [image_column]).then( + bot_response_multi, + states + [temperature, top_p, max_output_tokens], + states + chatbots + btn_list, + ).then( + flash_buttons, [], btn_list + ) + + send_btn.click( + add_text, + states + model_selectors + [textbox, context_state], + states + chatbots + [multimodal_textbox, textbox, send_btn] + btn_list, ).then(set_invisible_image, [], [image_column]).then( bot_response_multi, states + [temperature, top_p, max_output_tokens], @@ -458,9 +565,11 @@ def build_side_by_side_vision_ui_named(models, random_questions=None): random_btn.click( get_vqa_sample, # First, get the VQA sample [], # Pass the path to the VQA samples - [textbox, imagebox], # Outputs are textbox and imagebox - ).then(set_visible_image, [textbox], [image_column]).then( - clear_history_example, None, states + chatbots + [textbox] + btn_list + [multimodal_textbox, imagebox], # Outputs are textbox and imagebox + ).then(set_visible_image, [multimodal_textbox], [image_column]).then( + clear_history_example, + None, + states + chatbots + [multimodal_textbox, textbox, send_btn] + btn_list, ) return states + model_selectors diff --git a/fastchat/serve/gradio_global_state.py b/fastchat/serve/gradio_global_state.py new file mode 100644 index 0000000000..fafaec213d --- /dev/null +++ b/fastchat/serve/gradio_global_state.py @@ -0,0 +1,12 @@ +from dataclasses import dataclass, field +from typing import List + + +@dataclass +class Context: + text_models: List[str] = field(default_factory=list) + all_text_models: List[str] = field(default_factory=list) + vision_models: List[str] = field(default_factory=list) + all_vision_models: List[str] = field(default_factory=list) + models: List[str] = field(default_factory=list) + all_models: List[str] = field(default_factory=list) diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index 2ef47b14df..98399e5754 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -11,6 +11,7 @@ import random import time import uuid +from typing import List import gradio as gr import requests @@ -33,6 +34,7 @@ ) from fastchat.model.model_registry import get_model_info, model_info from fastchat.serve.api_provider import get_api_provider_stream_iter +from fastchat.serve.gradio_global_state import Context from fastchat.serve.remote_logger import get_remote_logger from fastchat.utils import ( build_logger, @@ -74,7 +76,7 @@ Please do not upload any private information. The service collects user dialogue data, including both text and images, and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) or a similar license. -#### Please report any bug or issue to our [Discord](https://discord.gg/HSWAKCrnFx)/arena-feedback. +#### Please report any bug or issue to our [Discord](https://discord.gg/6GXcFg3TH8)/arena-feedback. ### Acknowledgment We thank [UC Berkeley SkyLab](https://sky.cs.berkeley.edu/), [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [a16z](https://www.a16z.com/), [Together AI](https://www.together.ai/), [Hyperbolic](https://hyperbolic.xyz/), [RunPod](https://runpod.io), [Anyscale](https://www.anyscale.com/), [HuggingFace](https://huggingface.co/) for their generous [sponsorship](https://lmsys.org/donations/). 
@@ -132,6 +134,12 @@ def init_system_prompt(self, conv, is_vision): return current_date = datetime.datetime.now().strftime("%Y-%m-%d") system_prompt = system_prompt.replace("{{currentDateTime}}", current_date) + + current_date_v2 = datetime.datetime.now().strftime("%d %b %Y") + system_prompt = system_prompt.replace("{{currentDateTimev2}}", current_date_v2) + + current_date_v3 = datetime.datetime.now().strftime("%B %Y") + system_prompt = system_prompt.replace("{{currentDateTimev3}}", current_date_v3) conv.set_system_message(system_prompt) def to_gradio_chatbot(self): @@ -151,7 +159,11 @@ def dict(self): return base -def set_global_vars(controller_url_, enable_moderation_, use_remote_storage_): +def set_global_vars( + controller_url_, + enable_moderation_, + use_remote_storage_, +): global controller_url, enable_moderation, use_remote_storage controller_url = controller_url_ enable_moderation = enable_moderation_ @@ -218,16 +230,23 @@ def get_model_list(controller_url, register_api_endpoint_file, vision_arena): return visible_models, models -def load_demo_single(models, url_params): +def load_demo_single(context: Context, query_params): + # default to text models + models = context.text_models + selected_model = models[0] if len(models) > 0 else "" - if "model" in url_params: - model = url_params["model"] + if "model" in query_params: + model = query_params["model"] if model in models: selected_model = model - dropdown_update = gr.Dropdown(choices=models, value=selected_model, visible=True) + all_models = context.models + + dropdown_update = gr.Dropdown( + choices=all_models, value=selected_model, visible=True + ) state = None - return state, dropdown_update + return [state, dropdown_update] def load_demo(url_params, request: gr.Request): @@ -665,6 +684,10 @@ def bot_response( color: #63A4FF; /* This can be any color you choose for hover */ text-decoration: underline; /* Adds underline on hover */ } + +.block { + overflow-y: hidden !important; +} """ @@ -772,19 +795,19 @@ def get_model_description_md(models): def build_about(): about_markdown = """ # About Us -Chatbot Arena is an open-source research project developed by members from [LMSYS](https://lmsys.org) and UC Berkeley [SkyLab](https://sky.cs.berkeley.edu/). Our mission is to build an open platform to evaluate LLMs by human preference in the real-world. -We open-source our [FastChat](https://github.com/lm-sys/FastChat) project at GitHub and release chat and human feedback dataset. We invite everyone to join us! +Chatbot Arena ([lmarena.ai](https://lmarena.ai)) is an open-source platform for evaluating AI through human preference, developed by researchers at UC Berkeley [SkyLab](https://sky.cs.berkeley.edu/) and [LMSYS](https://lmsys.org). We open-source the [FastChat](https://github.com/lm-sys/FastChat) project at GitHub and release open datasets. We always welcome contributions from the community. If you're interested in getting involved, we'd love to hear from you! 
## Open-source contributors -- [Wei-Lin Chiang](https://infwinston.github.io/), [Lianmin Zheng](https://lmzheng.net/), [Ying Sheng](https://sites.google.com/view/yingsheng/home), [Lisa Dunlap](https://www.lisabdunlap.com/), [Anastasios Angelopoulos](https://people.eecs.berkeley.edu/~angelopoulos/), [Christopher Chou](https://www.linkedin.com/in/chrisychou), [Tianle Li](https://codingwithtim.github.io/), [Siyuan Zhuang](https://www.linkedin.com/in/siyuanzhuang) +- Leads: [Wei-Lin Chiang](https://infwinston.github.io/), [Anastasios Angelopoulos](https://people.eecs.berkeley.edu/~angelopoulos/) +- Contributors: [Lianmin Zheng](https://lmzheng.net/), [Ying Sheng](https://sites.google.com/view/yingsheng/home), [Lisa Dunlap](https://www.lisabdunlap.com/), [Christopher Chou](https://www.linkedin.com/in/chrisychou), [Tianle Li](https://codingwithtim.github.io/), [Evan Frick](https://efrick2002.github.io/), [Dacheng Li](https://dachengli1.github.io/), [Siyuan Zhuang](https://www.linkedin.com/in/siyuanzhuang) - Advisors: [Ion Stoica](http://people.eecs.berkeley.edu/~istoica/), [Joseph E. Gonzalez](https://people.eecs.berkeley.edu/~jegonzal/), [Hao Zhang](https://cseweb.ucsd.edu/~haozhang/), [Trevor Darrell](https://people.eecs.berkeley.edu/~trevor/) ## Learn more -- Chatbot Arena [paper](https://arxiv.org/abs/2403.04132), [launch blog](https://lmsys.org/blog/2023-05-03-arena/), [dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md), [policy](https://lmsys.org/blog/2024-03-01-policy/) +- Chatbot Arena [paper](https://arxiv.org/abs/2403.04132), [launch blog](https://blog.lmarena.ai/blog/2023/arena/), [dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md), [policy](https://blog.lmarena.ai/blog/2024/policy/) - LMSYS-Chat-1M dataset [paper](https://arxiv.org/abs/2309.11998), LLM Judge [paper](https://arxiv.org/abs/2306.05685) ## Contact Us -- Follow our [X](https://x.com/lmsysorg), [Discord](https://discord.gg/HSWAKCrnFx) or email us at lmsys.org@gmail.com +- Follow our [X](https://x.com/lmsysorg), [Discord](https://discord.gg/6GXcFg3TH8) or email us at `lmarena.ai@gmail.com` - File issues on [GitHub](https://github.com/lm-sys/FastChat) - Download our datasets and models on [HuggingFace](https://huggingface.co/lmsys) @@ -810,7 +833,7 @@ def build_about(): def build_single_model_ui(models, add_promotion_links=False): promotion = ( f""" -[Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) +[Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) {SURVEY_LINK} @@ -821,7 +844,7 @@ def build_single_model_ui(models, add_promotion_links=False): ) notice_markdown = f""" -# 🏔️ Chat with Large Language Models +# 🏔️ Chatbot Arena (formerly LMSYS): Free AI Chat to Compare & Test Best AI Chatbots {promotion} """ @@ -945,7 +968,7 @@ def build_single_model_ui(models, add_promotion_links=False): def build_demo(models): 
with gr.Blocks( - title="Chat with Open Large Language Models", + title="Chatbot Arena (formerly LMSYS): Free AI Chat to Compare & Test Best AI Chatbots", theme=gr.themes.Default(), css=block_css, ) as demo: diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py index 14f254bf39..7a255d59ea 100644 --- a/fastchat/serve/gradio_web_server_multi.py +++ b/fastchat/serve/gradio_web_server_multi.py @@ -6,6 +6,7 @@ import argparse import pickle import time +from typing import List import gradio as gr @@ -28,7 +29,9 @@ ) from fastchat.serve.gradio_block_arena_vision_named import ( build_side_by_side_vision_ui_named, + load_demo_side_by_side_vision_named, ) +from fastchat.serve.gradio_global_state import Context from fastchat.serve.gradio_web_server import ( set_global_vars, @@ -51,57 +54,68 @@ logger = build_logger("gradio_web_server_multi", "gradio_web_server_multi.log") -def load_demo(url_params, request: gr.Request): - global models, all_models, vl_models, all_vl_models - +def load_demo(context: Context, request: gr.Request): ip = get_ip(request) - logger.info(f"load_demo. ip: {ip}. params: {url_params}") + logger.info(f"load_demo. ip: {ip}. params: {request.query_params}") inner_selected = 0 - if "arena" in url_params: + if "arena" in request.query_params: inner_selected = 0 - elif "vision" in url_params: - inner_selected = 1 - elif "compare" in url_params: + elif "vision" in request.query_params: + inner_selected = 0 + elif "compare" in request.query_params: inner_selected = 1 - elif "direct" in url_params or "model" in url_params: + elif "direct" in request.query_params or "model" in request.query_params: + inner_selected = 2 + elif "leaderboard" in request.query_params: inner_selected = 3 - elif "leaderboard" in url_params: + elif "about" in request.query_params: inner_selected = 4 - elif "about" in url_params: - inner_selected = 5 if args.model_list_mode == "reload": - models, all_models = get_model_list( + context.text_models, context.all_text_models = get_model_list( args.controller_url, args.register_api_endpoint_file, vision_arena=False, ) - vl_models, all_vl_models = get_model_list( + context.vision_models, context.all_vision_models = get_model_list( args.controller_url, args.register_api_endpoint_file, vision_arena=True, ) - single_updates = load_demo_single(models, url_params) - side_by_side_anony_updates = load_demo_side_by_side_anony(all_models, url_params) - side_by_side_named_updates = load_demo_side_by_side_named(models, url_params) + # Text models + if args.vision_arena: + side_by_side_anony_updates = load_demo_side_by_side_vision_anony() - side_by_side_vision_anony_updates = load_demo_side_by_side_vision_anony( - all_models, all_vl_models, url_params - ) + side_by_side_named_updates = load_demo_side_by_side_vision_named( + context, + ) - return ( - (gr.Tabs(selected=inner_selected),) - + single_updates + direct_chat_updates = load_demo_single(context, request.query_params) + else: + direct_chat_updates = load_demo_single(context, request.query_params) + side_by_side_anony_updates = load_demo_side_by_side_anony( + context.all_text_models, request.query_params + ) + side_by_side_named_updates = load_demo_side_by_side_named( + context.text_models, request.query_params + ) + + tabs_list = ( + [gr.Tabs(selected=inner_selected)] + side_by_side_anony_updates + side_by_side_named_updates - + side_by_side_vision_anony_updates + + direct_chat_updates ) + return tabs_list + -def build_demo(models, vl_models, elo_results_file, 
leaderboard_table_file): +def build_demo( + context: Context, elo_results_file: str, leaderboard_table_file, arena_hard_table +): if args.show_terms_of_use: load_js = get_window_url_params_with_tos_js else: @@ -124,7 +138,7 @@ def build_demo(models, vl_models, elo_results_file, leaderboard_table_file): """ text_size = gr.themes.sizes.text_lg with gr.Blocks( - title="Chat with Open Large Language Models", + title="Chatbot Arena (formerly LMSYS): Free AI Chat to Compare & Test Best AI Chatbots", theme=gr.themes.Default(text_size=text_size), css=block_css, head=head_js, @@ -134,41 +148,62 @@ def build_demo(models, vl_models, elo_results_file, leaderboard_table_file): with gr.Tab("⚔️ Arena (battle)", id=0) as arena_tab: arena_tab.select(None, None, None, js=load_js) side_by_side_anony_list = build_side_by_side_vision_ui_anony( - all_models, - all_vl_models, + context, + random_questions=args.random_questions, + ) + with gr.Tab("⚔️ Arena (side-by-side)", id=1) as side_by_side_tab: + side_by_side_tab.select(None, None, None, js=alert_js) + side_by_side_named_list = build_side_by_side_vision_ui_named( + context, random_questions=args.random_questions + ) + + with gr.Tab("💬 Direct Chat", id=2) as direct_tab: + direct_tab.select(None, None, None, js=alert_js) + single_model_list = build_single_vision_language_model_ui( + context, + add_promotion_links=True, random_questions=args.random_questions, ) + else: with gr.Tab("⚔️ Arena (battle)", id=0) as arena_tab: arena_tab.select(None, None, None, js=load_js) - side_by_side_anony_list = build_side_by_side_ui_anony(models) + side_by_side_anony_list = build_side_by_side_ui_anony( + context.all_text_models + ) - with gr.Tab("⚔️ Arena (side-by-side)", id=2) as side_by_side_tab: - side_by_side_tab.select(None, None, None, js=alert_js) - side_by_side_named_list = build_side_by_side_ui_named(models) + with gr.Tab("⚔️ Arena (side-by-side)", id=1) as side_by_side_tab: + side_by_side_tab.select(None, None, None, js=alert_js) + side_by_side_named_list = build_side_by_side_ui_named( + context.text_models + ) - with gr.Tab("💬 Direct Chat", id=3) as direct_tab: - direct_tab.select(None, None, None, js=alert_js) - single_model_list = build_single_model_ui( - models, add_promotion_links=True - ) + with gr.Tab("💬 Direct Chat", id=2) as direct_tab: + direct_tab.select(None, None, None, js=alert_js) + single_model_list = build_single_model_ui( + context.text_models, add_promotion_links=True + ) demo_tabs = ( [inner_tabs] - + single_model_list + side_by_side_anony_list + side_by_side_named_list + + single_model_list ) if elo_results_file: - with gr.Tab("🏆 Leaderboard", id=4): + with gr.Tab("🏆 Leaderboard", id=3): build_leaderboard_tab( - elo_results_file, leaderboard_table_file, show_plot=True + elo_results_file, + leaderboard_table_file, + arena_hard_table, + show_plot=True, ) - with gr.Tab("ℹ️ About Us", id=5): + with gr.Tab("ℹ️ About Us", id=4): about = build_about() + context_state = gr.State(context) url_params = gr.JSON(visible=False) if args.model_list_mode not in ["once", "reload"]: @@ -176,7 +211,7 @@ def build_demo(models, vl_models, elo_results_file, leaderboard_table_file): demo.load( load_demo, - [url_params], + [context_state], demo_tabs, js=load_js, ) @@ -245,6 +280,9 @@ def build_demo(models, vl_models, elo_results_file, leaderboard_table_file): parser.add_argument( "--leaderboard-table-file", type=str, help="Load leaderboard results and plots" ) + parser.add_argument( + "--arena-hard-table", type=str, help="Load leaderboard results and plots" + ) 
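+    # Results file for the new Arena-Hard-Auto leaderboard tab; forwarded through
+    # build_demo() to build_leaderboard_tab() (the help string above is shared with
+    # --leaderboard-table-file, so this comment clarifies the distinction).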
parser.add_argument( "--gradio-root-path", type=str, @@ -274,18 +312,33 @@ def build_demo(models, vl_models, elo_results_file, leaderboard_table_file): set_global_vars(args.controller_url, args.moderate, args.use_remote_storage) set_global_vars_named(args.moderate) set_global_vars_anony(args.moderate) - models, all_models = get_model_list( + text_models, all_text_models = get_model_list( args.controller_url, args.register_api_endpoint_file, vision_arena=False, ) - vl_models, all_vl_models = get_model_list( + vision_models, all_vision_models = get_model_list( args.controller_url, args.register_api_endpoint_file, vision_arena=True, ) + models = text_models + [ + model for model in vision_models if model not in text_models + ] + all_models = all_text_models + [ + model for model in all_vision_models if model not in all_text_models + ] + context = Context( + text_models, + all_text_models, + vision_models, + all_vision_models, + models, + all_models, + ) + # Set authorization credentials auth = None if args.gradio_auth_path is not None: @@ -293,10 +346,10 @@ def build_demo(models, vl_models, elo_results_file, leaderboard_table_file): # Launch the demo demo = build_demo( - models, - all_vl_models, + context, args.elo_results_file, args.leaderboard_table_file, + args.arena_hard_table, ) demo.queue( default_concurrency_limit=args.concurrency_count, diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index afdfdd2491..c0ba39fd50 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -39,6 +39,11 @@ make_leaderboard_md_live, ) +k2c = {} +for k, v in key_to_category_name.items(): + k2c[k] = v + k2c[k + "_style_control"] = v + "_style_control" +key_to_category_name = k2c notebook_url = ( "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH" @@ -48,6 +53,34 @@ leader_component_values = [None] * 5 +def recompute_final_ranking(arena_df): + # compute ranking based on CI + ranking = {} + for i, model_a in enumerate(arena_df.index): + ranking[model_a] = 1 + for j, model_b in enumerate(arena_df.index): + if i == j: + continue + if ( + arena_df.loc[model_b]["rating_q025"] + > arena_df.loc[model_a]["rating_q975"] + ): + ranking[model_a] += 1 + return list(ranking.values()) + + +def arena_hard_title(date): + arena_hard_title = f""" +Last Updated: {date} + +**Arena-Hard-Auto v0.1** - an automatic evaluation tool for instruction-tuned LLMs with 500 challenging user queries curated from Chatbot Arena. + +We prompt GPT-4-Turbo as judge to compare the models' responses against a baseline model (default: GPT-4-0314). If you are curious to see how well your model might perform on Chatbot Arena, we recommend trying Arena-Hard-Auto. 
Check out our paper for more details about how Arena-Hard-Auto works as an fully automated data pipeline converting crowdsourced data into high-quality benchmarks -> +[[Paper](https://arxiv.org/abs/2406.11939) | [Repo](https://github.com/lm-sys/arena-hard-auto)] + """ + return arena_hard_title + + def recompute_final_ranking(arena_df): # compute ranking based on CI ranking = {} @@ -347,7 +380,7 @@ def update_leaderboard_df(arena_table_vals): "Rank* (UB)", "Delta", "Model", - "Arena Elo", + "Arena Score", "95% CI", "Votes", "Organization", @@ -402,6 +435,7 @@ def build_arena_tab( arena_dfs = {} category_elo_results = {} last_updated_time = elo_results["full"]["last_updated_datetime"].split(" ")[0] + for k in key_to_category_name.keys(): if k not in elo_results: continue @@ -412,8 +446,9 @@ def build_arena_tab( def update_leaderboard_and_plots(category, filters): if len(filters) > 0 and "Style Control" in filters: - if f"{category} (Style Control)" in arena_dfs: - category = f"{category} (Style Control)" + cat_name = f"{category} w/ Style Control" + if cat_name in arena_dfs: + category = cat_name else: gr.Warning("This category does not support style control.") @@ -461,8 +496,8 @@ def update_leaderboard_and_plots(category, filters): ], value=arena_values, elem_id="arena_leaderboard_dataframe", - height=800, - column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100], + height=1000, + column_widths=[70, 70, 210, 90, 90, 90, 120, 150, 100], wrap=True, ) else: @@ -478,7 +513,7 @@ def update_leaderboard_and_plots(category, filters): "Knowledge Cutoff", ], datatype=[ - "str", + "number", "markdown", "number", "str", @@ -489,8 +524,8 @@ def update_leaderboard_and_plots(category, filters): ], value=arena_values, elem_id="arena_leaderboard_dataframe", - height=800, - column_widths=[70, 190, 100, 100, 90, 140, 150, 100], + height=1000, + column_widths=[70, 220, 90, 90, 90, 120, 150, 100], wrap=True, ) @@ -519,10 +554,15 @@ def update_leaderboard_and_plots(category, filters): md = make_arena_leaderboard_md(arena_df, last_updated_time, vision=vision) gr.Markdown(md, elem_id="leaderboard_markdown") + + # only keep category without style control + category_choices = list(arena_dfs.keys()) + category_choices = [x for x in category_choices if "Style Control" not in x] + with gr.Row(): with gr.Column(scale=2): category_dropdown = gr.Dropdown( - choices=list(arena_dfs.keys()), + choices=category_choices, label="Category", value="Overall", ) @@ -563,7 +603,7 @@ def update_leaderboard_and_plots(category, filters): "Knowledge Cutoff", ], datatype=[ - "str", + "number", "markdown", "number", "str", @@ -575,8 +615,8 @@ def update_leaderboard_and_plots(category, filters): # value=highlight_top_models(arena_vals.style), value=arena_vals.style, elem_id="arena_leaderboard_dataframe", - height=800, - column_widths=[70, 190, 100, 100, 90, 130, 150, 100], + height=1000, + column_widths=[70, 220, 90, 90, 90, 120, 150, 100], wrap=True, ) @@ -592,7 +632,6 @@ def update_leaderboard_and_plots(category, filters): ) if not vision: - # only live update the text tab leader_component_values[:] = [default_md, p1, p2, p3, p4] if show_plot: @@ -664,8 +703,8 @@ def build_full_leaderboard_tab(elo_results, model_table_df, model_to_score): gr.Dataframe( headers=[ "Model", - "Arena Elo", - "Arena-Hard-Auto", + "Arena Score", + "arena-hard-auto", "MT-bench", "MMLU", "Organization", @@ -675,7 +714,7 @@ def build_full_leaderboard_tab(elo_results, model_table_df, model_to_score): value=full_table_vals, elem_id="full_leaderboard_dataframe", 
column_widths=[200, 100, 110, 100, 70, 130, 150], - height=800, + height=1000, wrap=True, ) @@ -693,9 +732,19 @@ def get_arena_category_table(results_df, categories, metric="ranking"): # Reorder columns to match the input order of categories category_df = category_df.reindex(columns=category_names) category_df.insert(0, "Model", category_df.index) + + # insert model rating as a column to category_df + category_df = category_df.merge( + results_df[results_df["category"] == "Overall"][["Model", "rating"]], + on="Model", + how="left", + ) category_df = category_df.sort_values( - by=category_names[0], ascending=metric == "ranking" + by=[category_names[0], "rating"], + ascending=[metric == "ranking", False], ) + # by=["final_ranking", "rating"], ascending=[True, False] + category_df = category_df.drop(columns=["rating"]) category_df = category_df.reset_index(drop=True) style = category_df.style @@ -749,15 +798,19 @@ def build_category_leaderboard_tab( combined_elo_df, categories, "rating" ) sort_ranking = lambda _: get_arena_category_table(combined_elo_df, categories) + with gr.Row(): + gr.Markdown( + f"""  Chatbot Arena Overview""" + ) overall_ranking_leaderboard = gr.Dataframe( headers=["Model"] + [key_to_category_name[k] for k in categories], datatype=["markdown"] + ["str" for k in categories], value=full_table_vals, elem_id="full_leaderboard_dataframe", - column_widths=[250] + column_widths=[150] + categories_width, # IMPORTANT: THIS IS HARDCODED WITH THE CURRENT CATEGORIES - height=800, + height=1000, wrap=True, ) ranking_button.click( @@ -770,15 +823,19 @@ def build_category_leaderboard_tab( selected_categories = [ "full", - "coding", + "full_style_control", + "hard_6", + "hard_6_style_control", "if", + "coding", "math", - "hard_6", "multiturn", "long_user", - "no_refusal", + # "no_refusal", ] -selected_categories_width = [95, 85, 130, 75, 150, 100, 95, 100] +# selected_categories_width = [95, 85, 100, 75, 120, 100, 95, 100,100] +selected_categories_width = [110, 110, 110, 110, 110, 80, 80, 80, 80] +# selected_categories_width = [100] * len(selected_categories) language_categories = [ "english", @@ -813,7 +870,8 @@ def get_model_name(model_key): try: df["Model"] = df["Model"].apply(get_model_name) combined_table.append(df) - except: + except Exception as e: + print(f"Error: {e}") continue combined_table = pd.concat(combined_table) combined_table["Model"] = combined_table.index @@ -848,8 +906,9 @@ def build_leaderboard_tab( with gr.Row(): with gr.Column(scale=4): md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown") - with gr.Column(scale=1): - vote_button = gr.Button("Vote!", link="https://lmarena.ai") + if mirror: + with gr.Column(scale=1): + vote_button = gr.Button("Vote!", link="https://lmarena.ai") md2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown") if leaderboard_table_file: data = load_leaderboard_table_csv(leaderboard_table_file) @@ -912,6 +971,7 @@ def build_leaderboard_tab( vision=True, show_plot=show_plot, ) + model_to_score = {} if arena_hard_leaderboard is not None: with gr.Tab("Arena-Hard-Auto", id=3): dataFrame = arena_hard_process( @@ -945,7 +1005,7 @@ def build_leaderboard_tab( for col in dataFrame.columns ], elem_id="arena_hard_leaderboard", - height=800, + height=1000, wrap=True, column_widths=[70, 190, 80, 80, 90, 150], ) @@ -1058,7 +1118,7 @@ def build_demo(elo_results_file, leaderboard_table_file, arena_hard_leaderboard) parser.add_argument("--ban-ip-file", type=str) parser.add_argument("--exclude-model-names", type=str, nargs="+") 
parser.add_argument("--password", type=str, default=None, nargs="+") - parser.add_argument("--arena-hard-leaderboard", type=str) + parser.add_argument("--arena-hard-leaderboard", type=str, default=None) args = parser.parse_args() logger = build_logger("monitor", "monitor.log") diff --git a/fastchat/serve/monitor/monitor_md.py b/fastchat/serve/monitor/monitor_md.py index 3cc69a1631..0016fd1169 100644 --- a/fastchat/serve/monitor/monitor_md.py +++ b/fastchat/serve/monitor/monitor_md.py @@ -37,12 +37,10 @@ "no_refusal": "Exclude Refusal", "overall_limit_5_user_vote": "overall_limit_5_user_vote", "full_old": "Overall (Deprecated)", - "full_style_control": "Overall (Style Control)", - "hard_6_style_control": "Hard Prompts (Overall) (Style Control)", } cat_name_to_explanation = { "Overall": "Overall Questions", - "Overall w/ Style Control": "Overall with Style Control", + "Overall w/ Style Control": "Overall Leaderboard with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).", "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).", "Math": "Math", "Instruction Following": "Instruction Following", @@ -50,7 +48,7 @@ "Coding": "Coding: whether conversation contains code snippets", "Coding w/ Style Control": "Coding with Style Control", "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)", - "Hard Prompts (Overall) w/ Style Control": "Hard Prompts (Overall) with Style Control", + "Hard Prompts (Overall) w/ Style Control": "Hard Prompts with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).", "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)", "Longer Query": "Longer Query (>= 500 tokens)", "English": "English Prompts", @@ -66,8 +64,6 @@ "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")', "overall_limit_5_user_vote": "overall_limit_5_user_vote", "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).", - "Overall (Style Control)": "Overall Leaderboard with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).", - "Hard Prompts (Overall) (Style Control)": "Hard Prompts (Overall) Leaderboard with Style Control. 
See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).", } cat_name_to_baseline = { "Hard Prompts (English)": "English", @@ -84,8 +80,8 @@ def make_default_md_1(mirror=False): link_color = "#1976D2" # This color should be clear in both light and dark mode leaderboard_md = f""" - # 🏆 Chatbot Arena Leaderboard - [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) + # 🏆 Chatbot Arena LLM Leaderboard: Community-driven Evaluation for Best LLM and AI chatbots + [Blog](https://blog.lmarena.ai/blog/2023/arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/6GXcFg3TH8) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena) """ return leaderboard_md @@ -96,8 +92,9 @@ def make_default_md_2(mirror=False): leaderboard_md = f""" {mirror_str if mirror else ""} -Chatbot Arena is a crowdsourced open platform for LLM evals. We've collected over 1,000,000 human pairwise comparisons to rank LLMs with the Bradley-Terry model and display the model ratings in Elo-scale. -You can find more details in our paper. **Chatbot arena is dependent on community participation, please contribute by casting your vote!** +Chatbot Arena ([lmarena.ai](https://lmarena.ai)) is an open-source platform for evaluating AI through human preference, developed by researchers at UC Berkeley [SkyLab](https://sky.cs.berkeley.edu/) and [LMSYS](https://lmsys.org). With over 1,000,000 user votes, the platform ranks best LLM and AI chatbots using the Bradley-Terry model to generate live leaderboards. For technical details, check out our [paper](https://arxiv.org/abs/2403.04132). 
+ +**Chatbot Arena thrives on community engagement — cast your vote to help improve AI evaluation!** {SURVEY_LINK} """ diff --git a/fastchat/utils.py b/fastchat/utils.py index 545e014146..d3531928f3 100644 --- a/fastchat/utils.py +++ b/fastchat/utils.py @@ -185,6 +185,7 @@ def moderation_filter(text, model_list, do_moderation=False): "dbrx", "gemini", "reka", + "eureka", ] custom_thresholds = {"sexual": 0.3} diff --git a/playground/__init__.py b/playground/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/playground/benchmark/benchmark_api_provider.py b/playground/benchmark/benchmark_api_provider.py new file mode 100644 index 0000000000..89ca02ece6 --- /dev/null +++ b/playground/benchmark/benchmark_api_provider.py @@ -0,0 +1,135 @@ +""" +Usage: +python3 -m playground.benchmark.benchmark_api_provider --api-endpoint-file api_endpoints.json --output-file ./benchmark_results.json --random-questions metadata_sampled.json +""" +import argparse +import json +import time + +import numpy as np + +from fastchat.serve.api_provider import get_api_provider_stream_iter +from fastchat.serve.gradio_web_server import State +from fastchat.serve.vision.image import Image + + +class Metrics: + def __init__(self): + self.ttft = None + self.avg_token_time = None + + def to_dict(self): + return {"ttft": self.ttft, "avg_token_time": self.avg_token_time} + + +def sample_image_and_question(random_questions_dict, index): + # message = np.random.choice(random_questions_dict) + message = random_questions_dict[index] + question = message["question"] + path = message["path"] + + if isinstance(question, list): + question = question[0] + + return (question, path) + + +def call_model( + conv, + model_name, + model_api_dict, + state, + temperature=0.4, + top_p=0.9, + max_new_tokens=2048, +): + prev_message = "" + prev_time = time.time() + CHARACTERS_PER_TOKEN = 4 + metrics = Metrics() + + stream_iter = get_api_provider_stream_iter( + conv, model_name, model_api_dict, temperature, top_p, max_new_tokens, state + ) + call_time = time.time() + token_times = [] + for i, data in enumerate(stream_iter): + output = data["text"].strip() + if i == 0: + metrics.ttft = time.time() - call_time + prev_message = output + prev_time = time.time() + else: + token_diff_length = (len(output) - len(prev_message)) / CHARACTERS_PER_TOKEN + if token_diff_length == 0: + continue + + token_diff_time = time.time() - prev_time + token_time = token_diff_time / token_diff_length + token_times.append(token_time) + prev_time = time.time() + + metrics.avg_token_time = np.mean(token_times) + return metrics + + +def run_benchmark(model_name, model_api_dict, random_questions_dict, num_calls=20): + model_results = [] + + for index in range(num_calls): + state = State(model_name) + text, image_path = sample_image_and_question(random_questions_dict, index) + max_image_size_mb = 5 / 1.5 + + images = [ + Image(url=image_path).to_conversation_format( + max_image_size_mb=max_image_size_mb + ) + ] + message = (text, images) + + state.conv.append_message(state.conv.roles[0], message) + state.conv.append_message(state.conv.roles[1], None) + + metrics = call_model(state.conv, model_name, model_api_dict, state) + model_results.append(metrics.to_dict()) + + return model_results + + +def benchmark_models(api_endpoint_info, random_questions_dict, models): + results = {model_name: [] for model_name in models} + + for model_name in models: + model_results = run_benchmark( + model_name, + api_endpoint_info[model_name], + random_questions_dict, + 
num_calls=20, + ) + results[model_name] = model_results + + print(results) + return results + + +def main(api_endpoint_file, random_questions, output_file): + api_endpoint_info = json.load(open(api_endpoint_file)) + random_questions_dict = json.load(open(random_questions)) + models = ["reka-core-20240501", "gpt-4o-2024-05-13"] + + models_results = benchmark_models(api_endpoint_info, random_questions_dict, models) + + with open(output_file, "w") as f: + json.dump(models_results, f) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--api-endpoint-file", required=True) + parser.add_argument("--random-questions", required=True) + parser.add_argument("--output-file", required=True) + + args = parser.parse_args() + + main(args.api_endpoint_file, args.random_questions, args.output_file)
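For reference, a minimal post-processing sketch (not part of the patch) showing how the benchmark_results.json written by playground/benchmark/benchmark_api_provider.py could be summarized. It assumes the structure produced above, i.e. {model_name: [{"ttft": ..., "avg_token_time": ...}, ...]}; the file name and the summarize() helper are illustrative only.

# Illustrative sketch: summarize per-model latency metrics from the benchmark output.
import json
import statistics


def summarize(results_path="benchmark_results.json"):
    with open(results_path) as f:
        results = json.load(f)

    for model_name, runs in results.items():
        # Keep only runs where the metric was actually recorded.
        ttfts = [r["ttft"] for r in runs if r.get("ttft") is not None]
        token_times = [
            r["avg_token_time"] for r in runs if r.get("avg_token_time") is not None
        ]
        if not ttfts or not token_times:
            print(f"{model_name}: no completed runs")
            continue
        mean_ttft = statistics.mean(ttfts)
        mean_token_time = statistics.mean(token_times)  # seconds per token
        print(
            f"{model_name}: mean TTFT {mean_ttft:.2f}s, "
            f"~{1 / mean_token_time:.1f} tokens/s over {len(runs)} calls"
        )


if __name__ == "__main__":
    summarize()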