Merge pull request #80 from wandb/weaveeval
W&B Weave based Evaluation
ayulockin authored Sep 8, 2024
2 parents 36f315e + 27c4709 commit 8ec557e
Showing 6 changed files with 308 additions and 0 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -104,6 +104,26 @@ For more detailed instructions on installing and running the bot, please refer t

Executing these commands will launch the API, Slackbot, and Discord bot applications, enabling you to interact with the bot and ask questions related to the Weights & Biases documentation.

### Running the Evaluation pipeline

Make sure to set the required environment variables in your terminal:

```
set -o allexport; source .env; set +o allexport
```
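
The exact set of variables depends on your deployment; as a rough sketch (the key names below are assumptions, not a definitive list), a minimal `.env` might contain:

```
# Hypothetical example; adjust the keys to match your setup
OPENAI_API_KEY=<your OpenAI API key>
WANDB_API_KEY=<your W&B API key>
```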

Launch wandbot with 8 workers. Using multiple workers speeds up evaluation:

```
WANDBOT_EVALUATION=1 gunicorn wandbot.api.app:app --bind 0.0.0.0:8000 --timeout=200 --workers=8 --worker-class uvicorn.workers.UvicornWorker
```

Launch the W&B Weave evaluation:

```
python src/wandbot/evaluation/weave_eval/main.py
```
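
The evaluation script posts each question to the running API's `/chat/query` endpoint (see `src/wandbot/evaluation/weave_eval/main.py` below). Before starting a full run, you can sanity-check that the server is responding with a single request; the question text here is only an illustrative placeholder:

```
curl -X POST http://0.0.0.0:8000/chat/query \
  -H "Content-Type: application/json" \
  -d '{"question": "How do I log a metric with wandb?", "application": "api-eval", "language": "en"}'
```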

## Overview of the Implementation

1. Creating Document Embeddings with ChromaDB
4 changes: 4 additions & 0 deletions src/wandbot/api/app.py
@@ -101,6 +101,10 @@ async def lifespan(app: FastAPI):

Base.metadata.create_all(bind=engine)

if os.getenv("WANDBOT_EVALUATION"):
logger.info("Lifespan starting, initializing wandbot for evaluation mode.")
await initialize()

async def backup_db():
"""Periodically backs up the database to a table.
2 changes: 2 additions & 0 deletions src/wandbot/chat/chat.py
@@ -112,6 +112,7 @@ def __call__(self, chat_request: ChatRequest) -> ChatResponse:
}
result_dict.update({"application": chat_request.application})
self.run.log(usage_stats)

return ChatResponse(**result_dict)
except Exception as e:
with Timer() as timer:
@@ -133,4 +134,5 @@ def __call__(self, chat_request: ChatRequest) -> ChatResponse:
"end_time": timer.stop,
}
)

return ChatResponse(**result)
41 changes: 41 additions & 0 deletions src/wandbot/evaluation/weave_eval/log_data.py
@@ -0,0 +1,41 @@
import os
os.environ["WANDB_ENTITY"] = "wandbot"

import wandb
import weave
import pandas as pd
from weave import Dataset

from wandbot.evaluation.config import EvalConfig

config = EvalConfig()

wandb_project = config.wandb_project
wandb_entity = config.wandb_entity

eval_artifact = wandb.Api().artifact(config.eval_artifact)
eval_artifact_dir = eval_artifact.download(root=config.eval_artifact_root)

df = pd.read_json(
f"{eval_artifact_dir}/{config.eval_annotations_file}",
lines=True,
orient="records",
)
df.insert(0, "id", df.index)

correct_df = df[
(df["is_wandb_query"] == "YES") & (df["correctness"] == "correct")
]

data_rows = correct_df.to_dict('records')

weave.init(wandb_project)

# Create a dataset
dataset = Dataset(
name='wandbot_eval_data',
rows=data_rows,
)

# Publish the dataset
weave.publish(dataset)
114 changes: 114 additions & 0 deletions src/wandbot/evaluation/weave_eval/main.py
@@ -0,0 +1,114 @@
import os
os.environ["WANDB_ENTITY"] = "wandbot"

import json
import httpx
import weave
import asyncio
import requests
from weave import Evaluation
from weave import Model
from llama_index.llms.openai import OpenAI

from wandbot.evaluation.config import EvalConfig
from wandbot.utils import get_logger

from wandbot.evaluation.eval.correctness import (
CORRECTNESS_EVAL_TEMPLATE,
WandbCorrectnessEvaluator,
)

logger = get_logger(__name__)
config = EvalConfig()

correctness_evaluator = WandbCorrectnessEvaluator(
llm=OpenAI(config.eval_judge_model),
eval_template=CORRECTNESS_EVAL_TEMPLATE,
)

wandb_project = config.wandb_project
wandb_entity = config.wandb_entity

weave.init(f"{wandb_entity}/{wandb_project}")


@weave.op()
async def get_answer(question: str, application: str = "api-eval") -> str:
url = "http://0.0.0.0:8000/chat/query"
payload = {
"question": question,
"application": application,
"language": "en",
}
async with httpx.AsyncClient(timeout=200.0) as client:
response = await client.post(url, json=payload)
response_json = response.json()
return json.dumps(response_json)


@weave.op()
async def get_eval_record(
question: str,
) -> dict:
response = await get_answer(question)
response = json.loads(response)
return {
"system_prompt": response["system_prompt"],
"generated_answer": response["answer"],
"retrieved_contexts": response["source_documents"],
"model": response["model"],
"total_tokens": response["total_tokens"],
"prompt_tokens": response["prompt_tokens"],
"completion_tokens": response["completion_tokens"],
"time_taken": response["time_taken"],
}


class EvaluatorModel(Model):
eval_judge_model: str = config.eval_judge_model

@weave.op()
async def predict(self, question: str) -> dict:
# Model logic goes here
prediction = await get_eval_record(question)
return prediction


@weave.op()
async def get_answer_correctness(
question: str,
ground_truth: str,
notes: str,
model_output: dict
) -> dict:
result = await correctness_evaluator.aevaluate(
query=question,
response=model_output["generated_answer"],
reference=ground_truth,
contexts=model_output["retrieved_contexts"],
reference_notes=notes,
)
return {
"answer_correctness": result.dict()["passing"]
}


dataset_ref = weave.ref(
"weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU"
).get()
question_rows = dataset_ref.rows
question_rows = [
{
"question": row["question"],
"ground_truth": row["answer"],
"notes": row["notes"],
} for row in question_rows
]
logger.info("Number of evaluation samples: %s", len(question_rows))

evaluation = Evaluation(
dataset=question_rows, scorers=[get_answer_correctness]
)

if __name__ == "__main__":
asyncio.run(evaluation.evaluate(EvaluatorModel()))
127 changes: 127 additions & 0 deletions src/wandbot/evaluation/weave_eval/weave_correctness.py
@@ -0,0 +1,127 @@
import asyncio
from typing import Any, Optional, Sequence

import regex as re
from llama_index.core.evaluation import CorrectnessEvaluator, EvaluationResult

from wandbot.evaluation.eval.utils import (
make_eval_template,
safe_parse_eval_response,
)

import wandb
import weave

SYSTEM_TEMPLATE = """You are a Weight & Biases support expert tasked with evaluating the correctness of answers to questions asked by users to a a technical support chatbot.
You are given the following information:
- a user query,
- the documentation used to generate the answer
- a reference answer
- the reason why the reference answer is correct, and
- a generated answer.
Your job is to judge the relevance and correctness of the generated answer.
- Consider whether the answer addresses all aspects of the question.
- The generated answer must provide only correct information according to the documentation.
- Compare the generated answer to the reference answer for completeness and correctness.
- Output a score and a decision that represents a holistic evaluation of the generated answer.
- You must return your response only in the below mentioned format. Do not return answers in any other format.
Follow these guidelines for scoring:
- Your score has to be between 1 and 3, where 1 is the worst and 3 is the best.
- If the generated answer is not correct in comparison to the reference, you should give a score of 1.
- If the generated answer is correct in comparison to the reference but contains mistakes, you should give a score of 2.
- If the generated answer is correct in comparison to the reference and completely answers the user's query, you should give a score of 3.
Output your final verdict by strictly following JSON format:
{{
"reason": <<Provide a brief explanation for your decision here>>,
"score": <<Provide a score as per the above guidelines>>,
"decision": <<Provide your final decision here, either 'correct', or 'incorrect'>>
}}
Example Response 1:
{{
"reason": "The generated answer has the exact details as the reference answer and completely answer's the user's query.",
"score": 3,
"decision": "correct"
}}
Example Response 2:
{{
"reason": "The generated answer doesn't match the reference answer, and deviates from the documentation provided",
"score": 1,
"decision": "incorrect"
}}
Example Response 3:
{{
"reason": "The generated answer follows the same steps as the reference answer. However, it includes assumptions about methods that are not mentioned in the documentation.",
"score": 2,
"decision": "incorrect"
}}
"""


USER_TEMPLATE = """
## User Query
{query}
## Documentation
{context_str}
## Reference Answer
{reference_answer}
## Reference Correctness Reason
{reference_notes}
## Generated Answer
{generated_answer}
"""

CORRECTNESS_EVAL_TEMPLATE = make_eval_template(SYSTEM_TEMPLATE, USER_TEMPLATE)


class WandbCorrectnessEvaluator(CorrectnessEvaluator):
@weave.op()
async def aevaluate(
self,
query: Optional[str] = None,
response: Optional[str] = None,
contexts: Optional[Sequence[str]] = None,
reference: Optional[str] = None,
sleep_time_in_seconds: int = 0,
**kwargs: Any,
) -> EvaluationResult:
await asyncio.sleep(sleep_time_in_seconds)

if query is None or response is None or reference is None:
print(query, response, reference, flush=True)
raise ValueError("query, response, and reference must be provided")

eval_response = await self._llm.apredict(
prompt=self._eval_template,
query=query,
generated_answer=response,
reference_answer=reference,
context_str=re.sub(
"\n+", "\n", "\n---\n".join(contexts) if contexts else ""
),
reference_notes=kwargs.get("reference_notes", ""),
)

passing, reasoning, score = await safe_parse_eval_response(
eval_response, "correct"
)

return EvaluationResult(
query=query,
response=response,
passing=passing,
score=score,
feedback=reasoning,
)
