Skip to content

Commit

Permalink
Merge pull request #151 from helixml/evals
Browse files Browse the repository at this point in the history
User provided feedback
  • Loading branch information
lukemarsden authored Jan 31, 2024
2 parents fb1b56e + 6beec83 commit 32c4122
Show file tree
Hide file tree
Showing 9 changed files with 264 additions and 3 deletions.
23 changes: 23 additions & 0 deletions api/cmd/helix/evals.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package helix

import (
"github.com/helixml/helix/api/pkg/evals"
"github.com/spf13/cobra"
)

// evalTargets receives the values passed via the repeatable --target flag.
// NOTE(review): the command currently invokes evals.Run without arguments,
// so these values are parsed but not forwarded anywhere in this file.
var evalTargets []string

// newEvalsCommand builds the "evals" cobra subcommand, which hands control to
// evals.Run and registers the --target string-slice flag.
func newEvalsCommand() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "evals",
		Short: "A CLI tool for evaluating finetuned LLMs",
		Run: func(_ *cobra.Command, _ []string) {
			evals.Run()
		},
	}

	flags := cmd.Flags()
	flags.StringSliceVar(
		&evalTargets,
		"target",
		[]string{},
		"Target(s) to use, defaults to all",
	)

	return cmd
}
19 changes: 19 additions & 0 deletions api/pkg/data/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package data
import (
"fmt"
"path"
"runtime/debug"
"time"

"github.com/helixml/helix/api/pkg/system"
Expand Down Expand Up @@ -217,6 +218,23 @@ func GetSessionSummary(session *types.Session) (*types.SessionSummary, error) {
}, nil
}

// GetHelixVersion returns the "vcs.revision" build setting recorded in the
// running binary, or "<unknown>" when build information is unavailable or
// carries no non-empty revision.
func GetHelixVersion() string {
	const fallback = "<unknown>"

	buildInfo, available := debug.ReadBuildInfo()
	if !available {
		return fallback
	}

	version := fallback
	for _, setting := range buildInfo.Settings {
		// Empty values are ignored; if the key appears more than once the
		// last non-empty entry wins.
		if setting.Key == "vcs.revision" && setting.Value != "" {
			version = setting.Value
		}
	}
	return version
}

func CreateSession(req types.CreateSessionRequest) (types.Session, error) {
systemInteraction := &types.Interaction{
ID: system.GenerateUUID(),
Expand Down Expand Up @@ -251,6 +269,7 @@ func CreateSession(req types.CreateSessionRequest) (types.Session, error) {
},
Priority: req.Priority,
ManuallyReviewQuestions: req.ManuallyReviewQuestions,
HelixVersion: GetHelixVersion(),
},
}

Expand Down
7 changes: 7 additions & 0 deletions api/pkg/evals/evals.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package evals

import "log"

// Run is the entry point of the evals package. For now it only writes a
// greeting to the standard logger.
func Run() {
	log.Print("hello from evals")
}
91 changes: 91 additions & 0 deletions api/pkg/evals/evals_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# End-to-end testing of helix's ability to learn from documents.
# To cover fine-tuning, RAG and combinations thereof.

# manually extract these examples from the database and construct qapairs
# manually to be graded automatically by gpt4 for similarity
manual_evals:
- name: junior-doctors
download:
- https://www.theguardian.com/society/2023/dec/05/junior-doctors-in-england-to-stage-more-strikes
question: what are the doctors going to do?
expected_answer: the doctors are going to go on strike
checker: gpt4-similarity


# manually extract these urls from the database and have gpt4 construct qapairs
# to feed back into inference mode of the model and then auto-grade the result
automatic_evals:
- name: junior-doctors
download:
- https://www.theguardian.com/society/2023/dec/05/junior-doctors-in-england-to-stage-more-strikes
checker: gpt4-autoqa


checkers:
- name: gpt4-similarity
prompt: |
Given the context
[BEGIN_DATA]
{{.Context}}
[END_DATA]
How similar is the real answer:
[BEGIN_REAL_ANSWER]
{{.RealAnswer}}
[END_REAL_ANSWER]
To the expected answer:
[BEGIN_EXPECTED_ANSWER]
{{.ExpectedAnswer}}
[END_EXPECTED_ANSWER]
Answer with one of the following values: Good, OK, Bad
values:
Bad: 0.0
OK: 0.5
Good: 1.0


- name: gpt4-autoqa
prompt: |
Given the context
[BEGIN_DATA]
{{.Context}}
[END_DATA]
Construct a list of questions and expected answers about the article.
Answer in the following schema:
```json
[{
"question": "...",
"answer": "..."
},
{
"question": "...",
"answer": "..."
},
]
```
# using the resulting qapairs, feed them back into inference mode in the
# model and check them with gpt4-similarity


finetune_targets:
- name: production
api_url: https://api.tryhelix.ai/v1
token_from_env: HELIX_API_KEY


llm_targets:
- name: openai
api_url: https://api.openai.com/v1
model: gpt-4-1106-preview
token_from_env: OPENAI_API_KEY
- name: together-mixtral
api_url: https://api.together.xyz/v1
model: mistralai/Mixtral-8x7B-Instruct-v0.1
11 changes: 11 additions & 0 deletions api/pkg/types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,17 @@ type SessionMetadata struct {
DocumentGroupID string `json:"document_group_id"`
ManuallyReviewQuestions bool `json:"manually_review_questions"`
SystemPrompt string `json:"system_prompt"`
HelixVersion string `json:"helix_version"`
// Evals are cool. Scores are strings of floats so we can distinguish ""
// (not rated) from "0.0"
EvalRunId string `json:"eval_run_id"`
EvalUserScore string `json:"eval_user_score"`
EvalUserReason string `json:"eval_user_reason"`
EvalManualScore string `json:"eval_manual_score"`
EvalManualReason string `json:"eval_manual_reason"`
EvalAutomaticScore string `json:"eval_automatic_score"`
EvalAutomaticReason string `json:"eval_automatic_reason"`
EvalOriginalUserPrompts []string `json:"eval_original_user_prompts"`
}

// the packet we put a list of sessions into so pagination is supported and we know the total amount
Expand Down
1 change: 1 addition & 0 deletions docker-compose.dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ services:
volumes:
- .:/workspace/helix
- ./cog/helix_cog_wrapper.py:/workspace/cog-sdxl/helix_cog_wrapper.py
- ~/.cache/huggingface:/root/.cache/huggingface
# comment these out if you don't have appropriate repos checked out
#- ../cog-sdxl/predict.py:/workspace/cog-sdxl/predict.py
#- ../cog-sdxl/weights.py:/workspace/cog-sdxl/weights.py
Expand Down
6 changes: 4 additions & 2 deletions frontend/src/hooks/useSession.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ export const useSession = () => {
} else {
setBot(undefined)
}
return result
}, [])

const loadSessionSummary = useCallback(async (id: string) => {
Expand All @@ -38,9 +39,10 @@ export const useSession = () => {
setSummary(result)
}, [])

const reload = useCallback(() => {
const reload = useCallback(async () => {
if(!data) return
loadSession(data.id)
const result = await loadSession(data.id)
return result
}, [
data,
])
Expand Down
94 changes: 93 additions & 1 deletion frontend/src/pages/Session.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ import Container from '@mui/material/Container'
import Box from '@mui/material/Box'

import SendIcon from '@mui/icons-material/Send'
import ThumbUpIcon from '@mui/icons-material/ThumbUp'
import ThumbDownIcon from '@mui/icons-material/ThumbDown'
import ThumbUpOffIcon from '@mui/icons-material/ThumbUpOffAlt'
import ThumbDownOffIcon from '@mui/icons-material/ThumbDownOffAlt'
import ShareIcon from '@mui/icons-material/Share'

import InteractionLiveStream from '../components/session/InteractionLiveStream'
Expand Down Expand Up @@ -33,6 +37,7 @@ import useLoading from '../hooks/useLoading'
import {
ICloneInteractionMode,
ISession,
ISessionConfig,
INTERACTION_STATE_EDITING,
SESSION_TYPE_TEXT,
SESSION_MODE_FINETUNE,
Expand Down Expand Up @@ -64,11 +69,16 @@ const Session: FC = () => {
const [restartWindowOpen, setRestartWindowOpen] = useState(false)
const [shareInstructions, setShareInstructions] = useState<IShareSessionInstructions>()
const [inputValue, setInputValue] = useState('')
const [feedbackValue, setFeedbackValue] = useState(session.data?.config.eval_user_reason)

// Copy the changed input element's current text into the inputValue state.
const handleInputChange = ({ target }: React.ChangeEvent<HTMLInputElement>) => {
  const { value } = target
  setInputValue(value)
}

// Copy the changed input element's current text into the feedbackValue state.
const handleFeedbackChange = ({ target }: React.ChangeEvent<HTMLInputElement>) => {
  const { value } = target
  setFeedbackValue(value)
}

const loading = useMemo(() => {
if(!session.data || !session.data?.interactions || session.data?.interactions.length === 0) return false
const interaction = session.data?.interactions[session.data?.interactions.length - 1]
Expand Down Expand Up @@ -108,7 +118,9 @@ const Session: FC = () => {

const onUpdateSharing = useCallback(async (value: boolean) => {
if(!session.data) return false
const result = await session.updateConfig(session.data?.id, Object.assign({}, session.data.config, {
const latestSessionData = await session.reload()
if(!latestSessionData) return false
const result = await session.updateConfig(latestSessionData.id, Object.assign({}, latestSessionData.config, {
shared: value,
}))
return result ? true : false
Expand Down Expand Up @@ -161,6 +173,24 @@ const Session: FC = () => {
session.data,
])

// Merge a partial config change into the session's latest config, PUT the
// merged config to the API and, on success, refresh the session and optionally
// show a success snackbar.
//
// data:            the config fields to overwrite
// snackbarMessage: optional message shown once the update has succeeded
//
// Resolves to `false` when the pre-update reload yields no session and to
// `undefined` in every other case.
const onUpdateSessionConfig = useCallback(async (data: Partial<ISessionConfig>, snackbarMessage?: string) => {
  if(!session.data) return
  // Reload first so the change is layered on the latest config rather than on
  // the copy captured by this closure.
  const latestSessionData = await session.reload()
  if(!latestSessionData) return false
  const sessionConfigUpdate = Object.assign({}, latestSessionData.config, data)
  // Address the same session object whose config was just merged.
  const result = await api.put<ISessionConfig, ISessionConfig>(`/api/v1/sessions/${latestSessionData.id}/config`, sessionConfigUpdate, undefined, {
    loading: true,
  })
  if(!result) return
  // Await the refresh so the reload is not left as a floating promise and the
  // confirmation is shown only after the session data has been re-fetched.
  await session.reload()
  if(snackbarMessage) {
    snackbar.success(snackbarMessage)
  }
}, [
  account.user,
  session.data,
])

const onClone = useCallback(async (mode: ICloneInteractionMode, interactionID: string): Promise<boolean> => {
if(!checkOwnership({
cloneMode: mode,
Expand Down Expand Up @@ -457,6 +487,68 @@ const Session: FC = () => {
</>
)
}
<Box
sx={{
width: '100%',
flexGrow: 0,
p: 2,
display: 'flex',
flexDirection: 'row',
alignItems: 'center',
justifyContent: 'center',
}}
>
<Button
onClick={ () => {
onUpdateSessionConfig({
eval_user_score: session.data?.config.eval_user_score == "" ? '1.0' : "",
}, `Thank you for your feedback!`)
}}
>
{ session.data?.config.eval_user_score == "1.0" ? <ThumbUpIcon /> : <ThumbUpOffIcon /> }
</Button>
<Button
onClick={ () => {
onUpdateSessionConfig({
eval_user_score: session.data?.config.eval_user_score == "" ? '0.0' : "",
}, `Sorry! We will use your feedback to improve`)
}}
>
{ session.data?.config.eval_user_score == "0.0" ? <ThumbDownIcon /> : <ThumbDownOffIcon /> }
</Button>
</Box>
{ session.data?.config.eval_user_score != "" && (
<Box
sx={{
width: '100%',
flexGrow: 0,
p: 2,
display: 'flex',
flexDirection: 'row',
alignItems: 'center',
justifyContent: 'center',
}}
>
<TextField
id="feedback"
label="Please explain why"
value={feedbackValue}
onChange={handleFeedbackChange}
name="ai_feedback"
/>
<Button
variant='contained'
disabled={loading}
onClick={ () => onUpdateSessionConfig({
eval_user_reason: feedbackValue,
}, `Thanks, you are awesome`)
}
sx={{ ml: 2 }}
>
Save
</Button>
</Box>
) }
</Container>
</Box>
<Box
Expand Down
15 changes: 15 additions & 0 deletions frontend/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,21 @@ export interface ISessionConfig {
original_mode: ISessionMode,
origin: ISessionOrigin,
shared?: boolean,
avatar: string,
priority: boolean,
document_ids: Record<string, string>,
document_group_id: string,
manually_review_questions: boolean,
system_prompt: string,
helix_version: string,
eval_run_id: string,
eval_user_score: string,
eval_user_reason: string,
eval_manual_score: string,
eval_manual_reason: string,
eval_automatic_score: string,
eval_automatic_reason: string,
eval_original_user_prompts: string[],
}

export interface ISession {
Expand Down

0 comments on commit 32c4122

Please sign in to comment.