from dataclasses import dataclass, field
from typing import List, Tuple, Dict
import os
from openai import AsyncOpenAI
import logging
import re
from contextlib import asynccontextmanager
# Define the prompts
PROMPTS = {
"defend_prompt": """
You're an eloquent advocate (Advocate {advocate_id}), making
a case for Candidate A. Your Opponent is advocating for Candidate B.
<CandidateA>
{candidate_a}
</CandidateA>
<CandidateB>
{candidate_b}
</CandidateB>
Use the Latest Feedback, Opponent's Last Argument, and Your Team's Previous Arguments to improve your case for Candidate A.
<LatestFeedback>
{feedback}
</LatestFeedback>
<OpponentLastArgument>
{opponent_argument}
</OpponentLastArgument>
<TeamPreviousArguments>
{team_arguments}
</TeamPreviousArguments>
Respond in under 80 words.
Your defense:
""",
"judge_prompt": """
You're a fair, impartial judge in a competition between two
candidates, Candidate A and Candidate B, to determine which
is performing better at their task.
<CandidateA>
{candidate_a}
</CandidateA>
<CandidateB>
{candidate_b}
</CandidateB>
Advocates are making their case for their respective candidates.
Your task is to provide feedback that will help advocates improve
and differentiate their arguments more clearly. Use the following
information to guide your feedback:
<CurrentRound>
{current_round}
</CurrentRound>
<TotalRounds>
{total_rounds}
</TotalRounds>
<PreviousScores>
{previous_scores}
</PreviousScores>
<DefenseForCandidateA>
{defense1}
</DefenseForCandidateA>
<DefenseForCandidateB>
{defense2}
</DefenseForCandidateB>
Provide specific, constructive feedback in under 50 words:
""",
"score_prompt_samre": """
You're a critical, impartial judge scoring the performance of
two candidates, Candidate A and Candidate B, on a list of Criteria,
using the DefenseForCandidateA and DefenseForCandidateB.
<CandidateA>
{candidate_a}
</CandidateA>
<CandidateB>
{candidate_b}
</CandidateB>
<DefenseForCandidateA>
{defense1}
</DefenseForCandidateA>
<DefenseForCandidateB>
{defense2}
</DefenseForCandidateB>
<Criteria>
<Criterion1>Relevance to their task</Criterion1>
<Criterion2>Accuracy and credible sources</Criterion2>
<Criterion3>Depth and completeness</Criterion3>
<Criterion4>Clarity and logical flow</Criterion4>
<Criterion5>Reasoning and factual support</Criterion5>
<Criterion6>Effectiveness addressing opponent</Criterion6>
</Criteria>
For each Criterion in the Criteria, briefly analyze
the performance of the two candidates based on the
DefenseForCandidateA and DefenseForCandidateB, then
render a score between 0 and 10.
Respond as follows:
<Criterion1>
<Analysis>
Candidate A: [Brief thoughts about Candidate A's performance on the Criterion, based on the DefenseForCandidateA]
Candidate B: [Brief thoughts about Candidate B's performance on the Criterion, based on the DefenseForCandidateB]
</Analysis>
<Scores>
<CandidateAScore>[score between 0 and 10]</CandidateAScore>
<CandidateBScore>[score between 0 and 10]</CandidateBScore>
</Scores>
</Criterion1>
<Criterion2>
<Analysis>
Candidate A: [Brief thoughts about Candidate A's performance on the Criterion, based on the DefenseForCandidateA]
Candidate B: [Brief thoughts about Candidate B's performance on the Criterion, based on the DefenseForCandidateB]
</Analysis>
<Scores>
<CandidateAScore>[score between 0 and 10]</CandidateAScore>
<CandidateBScore>[score between 0 and 10]</CandidateBScore>
</Scores>
</Criterion2>
...
""",
"score_prompt_baseline": """
You're a critical, impartial judge scoring the performance of
two candidates, Candidate A and Candidate B, on a list of Criteria.
<CandidateA>
{candidate_a}
</CandidateA>
<CandidateB>
{candidate_b}
</CandidateB>
<Criteria>
<Criterion1>Relevance to their task</Criterion1>
<Criterion2>Accuracy and credible sources</Criterion2>
<Criterion3>Depth and completeness</Criterion3>
<Criterion4>Clarity and logical flow</Criterion4>
<Criterion5>Reasoning and factual support</Criterion5>
<Criterion6>Effectiveness addressing opponent</Criterion6>
</Criteria>
For each Criterion in the Criteria, briefly analyze
the performance of the two candidates, then render a score
between 0 and 10.
Respond as follows:
<Criterion1>
<Analysis>
Candidate A: [Brief thoughts about Candidate A's performance on the Criterion]
Candidate B: [Brief thoughts about Candidate B's performance on the Criterion]
</Analysis>
<Scores>
<CandidateAScore>[score between 0 and 10]</CandidateAScore>
<CandidateBScore>[score between 0 and 10]</CandidateBScore>
</Scores>
</Criterion1>
<Criterion2>
<Analysis>
Candidate A: [Brief thoughts about Candidate A's performance on the Criterion]
Candidate B: [Brief thoughts about Candidate B's performance on the Criterion]
</Analysis>
<Scores>
<CandidateAScore>[score between 0 and 10]</CandidateAScore>
<CandidateBScore>[score between 0 and 10]</CandidateBScore>
</Scores>
</Criterion2>
...
"""
}
@dataclass
class Memory:
    arguments: List[Tuple[str, str]] = field(default_factory=list)
    scores: List[Tuple[float, float]] = field(default_factory=list)
    feedback: List[str] = field(default_factory=list)
class ModelEvaluator:
    @classmethod
    @asynccontextmanager
    async def create(cls, mode="samre", model="gpt-4o-mini", logging_level=logging.WARNING):
        instance = cls(mode=mode, model=model, logging_level=logging_level)
        instance.client = AsyncOpenAI()
        try:
            yield instance
        finally:
            await instance.client.close()
    def _setup_logger(self, logging_level):
        """Setup logger with word wrapping."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging_level)

        if not logger.handlers:
            handler = logging.StreamHandler()

            class WrapFormatter(logging.Formatter):
                def format(self, record):
                    import textwrap
                    message = super().format(record)
                    return '\n'.join(textwrap.fill(line, width=80)
                                     for line in message.split('\n'))

            formatter = WrapFormatter('%(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)

        return logger
    def __init__(self, mode="samre", model="gpt-4o-mini", logging_level=logging.WARNING):
        self.mode = mode
        self.model = model
        self.max_rounds = 4 if mode == "samre" else 1
        self.logger = self._setup_logger(logging_level)
        # The API client is attached by the `create` context manager
        self.client = None

        # Initialize all prompts
        self.defend_prompt = PROMPTS["defend_prompt"]
        self.judge_prompt = PROMPTS["judge_prompt"]
    async def get_completion(self, prompt: str) -> str:
        """Get a completion from the OpenAI API."""
        if not self.client:
            raise RuntimeError("Evaluator must be created using 'async with ModelEvaluator.create() as evaluator:'")
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "system", "content": prompt}],
            temperature=0
        )
        return response.choices[0].message.content
    def _extract_final_scores(self, score_response: str) -> Tuple[float, float]:
        """Extract and average scores across all criteria sections."""
        score_a_pattern = r'<CandidateAScore>\s*(\d+\.?\d*)\s*</CandidateAScore>'
        score_b_pattern = r'<CandidateBScore>\s*(\d+\.?\d*)\s*</CandidateBScore>'

        scores_a = [float(match.group(1)) for match in re.finditer(score_a_pattern, score_response)]
        scores_b = [float(match.group(1)) for match in re.finditer(score_b_pattern, score_response)]

        if not scores_a or not scores_b:
            raise ValueError("Could not find scores for both candidates")
        if len(scores_a) != len(scores_b):
            raise ValueError(f"Mismatched number of scores: A={len(scores_a)}, B={len(scores_b)}")

        final_score_a = sum(scores_a) / len(scores_a)
        final_score_b = sum(scores_b) / len(scores_b)

        return (final_score_a, final_score_b)
    async def evaluate(self, candidate_a: str, candidate_b: str) -> Dict:
        """Main evaluation method that branches based on mode."""
        if not self.client:
            raise RuntimeError("Evaluator must be created using 'async with ModelEvaluator.create() as evaluator:'")
        if self.mode == "baseline":
            self.logger.info("\n=== Starting Baseline Evaluation ===\n")
            return await self._evaluate_baseline(candidate_a, candidate_b)
        else:
            self.logger.info("\n=== Starting SAMRE Evaluation ===\n")
            return await self._evaluate_samre(candidate_a, candidate_b)
    async def _evaluate_baseline(self, candidate_a: str, candidate_b: str) -> Dict:
        """Simple baseline evaluation."""
        score_prompt = PROMPTS["score_prompt_baseline"].format(
            candidate_a=candidate_a,
            candidate_b=candidate_b
        )
        score_response = await self.get_completion(score_prompt)
        self.logger.info(f"Score response: {score_response}")
        try:
            scores = self._extract_final_scores(score_response)
        except Exception as e:
            self.logger.error(f"Score parsing error: {e}")
            self.logger.error(f"Raw score response: {score_response}")
            scores = (10.0, 10.0)

        return {
            "winner": 'model_a' if scores[0] > scores[1] else 'model_b' if scores[1] > scores[0] else 'tie',
            "average_scores": scores,
            "rounds": 1,
            "score_history": [list(scores)],
            "full_response": score_response
        }
    async def _evaluate_samre(self, candidate_a: str, candidate_b: str) -> Dict:
        """Full SAMRE evaluation with debate rounds."""
        local_memory = Memory()

        self.logger.info("\n=== Starting SAMRE Evaluation ===\n")
        for round_num in range(self.max_rounds):
            self.logger.info(f"\n--- Round {round_num + 1} ---")
            scores = await self._run_debate_round(
                candidate_a,
                candidate_b,
                round_num,
                local_memory
            )
            if self._has_scores_converged(round_num, local_memory):
                self.logger.info("\nScores have converged - ending debate early.")
                break

        return self._prepare_results(local_memory)
    async def defend_answer(self, candidate_a: str, candidate_b: str,
                            advocate_id: int, feedback: str = "",
                            opponent_argument: str = "",
                            team_arguments: List[str] = None) -> str:
        """Get defense from an advocate."""
        if team_arguments is None:
            team_arguments = []

        prompt = self.defend_prompt.format(
            advocate_id=advocate_id,
            candidate_a=candidate_a,
            candidate_b=candidate_b,
            feedback=feedback,
            opponent_argument=opponent_argument,
            team_arguments="\n".join(team_arguments)
        )
        return await self.get_completion(prompt)
    async def judge_debate(self, candidate_a: str, candidate_b: str,
                           defense1: str, defense2: str,
                           current_round: int,
                           memory: Memory) -> Tuple[str, Tuple[float, float]]:
        """Judge the debate between two answers."""
        feedback_prompt = self.judge_prompt.format(
            candidate_a=candidate_a,
            candidate_b=candidate_b,
            current_round=current_round,
            total_rounds=self.max_rounds,
            previous_scores=memory.scores,
            defense1=defense1,
            defense2=defense2
        )
        feedback = await self.get_completion(feedback_prompt)

        # Score using the SAMRE scorer prompt, which takes the advocates' defenses into account
        score_prompt = PROMPTS["score_prompt_samre"].format(
            candidate_a=candidate_a,
            candidate_b=candidate_b,
            defense1=defense1,
            defense2=defense2
        )
        score_response = await self.get_completion(score_prompt)
        self.logger.info(f"Score response: {score_response}")
        try:
            scores = self._extract_final_scores(score_response)
        except Exception as e:
            self.logger.error(f"Score parsing error: {e}")
            self.logger.error(f"Raw score response: {score_response}")
            scores = (10.0, 10.0)

        return feedback, scores
    async def _run_debate_round(self, candidate_a: str, candidate_b: str,
                                round_num: int, memory: Memory) -> Tuple[float, float]:
        """Execute a single round of debate between advocates."""
        defenses = await self._get_advocate_defenses(candidate_a, candidate_b, memory)
        memory.arguments.append(defenses)

        feedback, scores = await self.judge_debate(
            candidate_a, candidate_b, defenses[0], defenses[1], round_num + 1, memory
        )
        self._store_round_results(feedback, scores, memory)
        self._display_round_results(defenses, feedback, scores)
        return scores
    async def _get_advocate_defenses(self, candidate_a: str, candidate_b: str,
                                     memory: Memory) -> Tuple[str, str]:
        """Get defenses from both advocates."""
        defense1 = await self.defend_answer(
            candidate_a, candidate_b, 1,
            feedback=memory.feedback[-1] if memory.feedback else "",
            opponent_argument=memory.arguments[-1][1] if memory.arguments else "",
            team_arguments=[args[0] for args in memory.arguments]
        )

        defense2 = await self.defend_answer(
            candidate_a, candidate_b, 2,
            feedback=memory.feedback[-1] if memory.feedback else "",
            opponent_argument=memory.arguments[-1][0] if memory.arguments else "",
            team_arguments=[args[1] for args in memory.arguments]
        )

        return (defense1, defense2)
    def _store_round_results(self, feedback: str, scores: Tuple[float, float],
                             memory: Memory) -> None:
        """Store feedback and scores from the round."""
        memory.feedback.append(feedback)
        memory.scores.append(scores)

    def _display_round_results(self, defenses: Tuple[str, str],
                               feedback: str, scores: Tuple[float, float]) -> None:
        """Display the results of the current round."""
        self.logger.info(f"\nAdvocate 1's defense:\n{defenses[0]}")
        self.logger.info(f"\nAdvocate 2's defense:\n{defenses[1]}")
        self.logger.info(f"\nJudge's feedback:\n{feedback}")
        self.logger.info(f"Scores for this round: Answer 1 = {round(scores[0], 2)}, Answer 2 = {round(scores[1], 2)}")
    def _has_scores_converged(self, round_num: int, memory: Memory) -> bool:
        """Check if scores have converged (same winner twice in a row)."""
        if round_num > 0:
            prev_diff = memory.scores[-2][0] - memory.scores[-2][1]
            curr_diff = memory.scores[-1][0] - memory.scores[-1][1]
            return (prev_diff * curr_diff) > 0
        return False
    def _prepare_results(self, memory: Memory) -> Dict:
        """Prepare the final results dictionary."""
        avg_scores = [
            round(sum(scores[i] for scores in memory.scores) / len(memory.scores), 2)
            for i in range(2)
        ]
        winner = (
            'model_a' if avg_scores[0] > avg_scores[1]
            else 'model_b' if avg_scores[0] < avg_scores[1]
            else 'tie'
        )
        return {
            "winner": winner,
            "average_scores": avg_scores,
            "rounds": len(memory.scores),
            "score_history": [[round(s[0], 2), round(s[1], 2)] for s in memory.scores],
            "argument_history": memory.arguments,
            "feedback_history": memory.feedback
        }
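To make the interface concrete, here is a minimal usage sketch of my own (not part of the implementation above). It assumes an OPENAI_API_KEY is set in the environment and uses two made-up one-sentence candidates.

# Hypothetical usage sketch, assuming OPENAI_API_KEY is set in the environment
async def demo():
    async with ModelEvaluator.create(mode="baseline") as evaluator:
        result = await evaluator.evaluate(
            "Paris is the capital of France.",      # made-up candidate A
            "The capital of France is Marseille."   # made-up candidate B
        )
        print(result["winner"], result["average_scores"])

# In a notebook: await demo(); in a script: asyncio.run(demo())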
I’ve been doing a lot of work with LLM-based evaluations lately, and I’ve been thinking about how to improve the quality of these evaluations.
I like to read research papers from arXiv for inspiration, and I recently came across a paper called Adversarial Multi-Agent Evaluation of Large Language Models through Iterative Debates, which introduces a new method, inspired by the judicial process, called Single Advocate Multi-Round Evaluation (SAMRE). Briefly, the SAMRE method evaluates the quality of different LLM outputs through an iterative debate process.
I was initially impressed by the results, as the authors report a gain of 6-8% over the baseline method. However, I am often skeptical of comparisons to a “baseline” in research papers, as these baselines often fail to implement standard best practices and are therefore not representative of a true baseline model.
I decided to try to reproduce the paper’s method while creating a baseline that does implement standard best practices for prompt engineering.
Using a sample of 500 conversations from MT-bench for testing and evaluation, I find that, contrary to the paper, SAMRE performs worse than the baseline.
This highlights the importance of implementing standard best practices in baseline models, and of being skeptical of claims in research papers that compare new methods to a “baseline model”.
Baseline model inadequacies
First, let’s consider some of the inadequacies in the Baseline model’s prompt reported in the paper. The prompt they used was as follows:
You are a fair, impartial judge scoring a debate on the following question:
question.
Answer 1: answer1
Answer 2: answer2
Score each answer on a scale of 1-20 for each of the following criteria:
1. Relevance to the question
2. Accuracy of information and use of credible sources
3. Depth of analysis and completeness of argument
4. Clarity of expression and logical flow
5. Strength of reasoning and factual support
6. Effectiveness in addressing opponent’s points
Provide scores as [Answer1_score, Answer2_score] for each criterion in a list format, then sum for final scores. Please keep an eye on the slightest difference that should make a difference in the scoring. Don’t overthink!
Relevance:
Accuracy:
Depth:
Clarity:
Logic and Factuality:
Addressing opponent’s points:
Final Scores (sum of above) as a tuple (example: (18, 9)):
Explain your scoring, focusing on why one answer is better than the other based on the criteria above. Keep your explanation concise but informative.
Finally, return the final score tuple (score1, score2) as a tuple (in parentheses).
Example: (18, 9)
Your scores and explanation:
First, the prompt does not use any delimiters for the inputs. I would enclose the inputs inside XML tags such as <Question>, <Answer1>, and <Answer2>.
Second, the prompt instructs the model to first generate scores in list format and then to sum them. But as we know, language models often make arithmetic mistakes. It would be better to ask the model to generate a score for each criterion, and then to extract and aggregate the scores programmatically in Python (or whatever language the evaluation routine is run from); a minimal sketch of this follows.
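As a minimal sketch of what I mean (my own illustration with a made-up response string; the actual implementation appears in _extract_final_scores above), assuming the model tags each per-criterion score in XML:

import re

# Made-up example of a model response with per-criterion scores in XML tags
response = "<Criterion1><Scores><Answer1Score>8</Answer1Score><Answer2Score>6</Answer2Score></Scores></Criterion1>"

# Pull out every per-criterion score, then aggregate in code instead of asking the model to do arithmetic
scores_1 = [float(m) for m in re.findall(r"<Answer1Score>\s*(\d+\.?\d*)\s*</Answer1Score>", response)]
scores_2 = [float(m) for m in re.findall(r"<Answer2Score>\s*(\d+\.?\d*)\s*</Answer2Score>", response)]
final_1, final_2 = sum(scores_1) / len(scores_1), sum(scores_2) / len(scores_2)
print(final_1, final_2)  # 8.0 6.0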
Third, although the prompt asks the model to “explain your scoring”, it is not clear whether the model should reason about each criterion before scoring it, or provide a single explanation after giving its final scores. I would ask the model to provide reasoning for each criterion it is asked to score, and to reason before scoring.
Fourth, it’s unclear why a 1-20 scale is used; this is not a standard scoring scale. I would use a 0-10 scale, which is likely more familiar to the model and can be expected to be applied more consistently.
Fifth, although the prompt does suggest that the model provide its scores in tuple format, it would be better to provide more explicit format instructions.
Finally, although this goes beyond the prompt itself (and is not something I address with my improved baseline), comparing a multi-round method to a single-round method is also an unfair comparison. It would be better to compare the SAMRE method to a baseline that is run for the same number of rounds and then similarly averages its scores; a rough sketch of what that could look like follows.
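As a rough sketch of such a comparison (my own illustration, not something implemented or run in this post), a multi-round baseline could simply score the same pair several times independently and average the results. The evaluator argument is assumed to be a baseline ModelEvaluator from the implementation at the top of this post.

import asyncio

# Hypothetical multi-round baseline: score the same pair n_rounds times and average the scores
async def multi_round_baseline(evaluator, candidate_a, candidate_b, n_rounds=4):
    results = await asyncio.gather(*[
        evaluator._evaluate_baseline(candidate_a, candidate_b) for _ in range(n_rounds)
    ])
    avg_a = sum(r["average_scores"][0] for r in results) / n_rounds
    avg_b = sum(r["average_scores"][1] for r in results) / n_rounds
    return {"winner": "model_a" if avg_a > avg_b else "model_b" if avg_b > avg_a else "tie",
            "average_scores": (avg_a, avg_b)}

Note that with temperature=0 the repeated calls would be nearly deterministic, so a fair multi-round baseline would probably also need a non-zero temperature.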
With all of that in mind, here’s how I would rewrite the prompt:
You're a critical, impartial judge scoring the performance of
two answers, Answer1 and Answer2, on a list of Criteria, using
the DefenseForAnswer1 and DefenseForAnswer2.
<Answer1>
{answer1}
</Answer1>
<Answer2>
{answer2}
</Answer2>
<DefenseForAnswer1>
{defense1}
</DefenseForAnswer1>
<DefenseForAnswer2>
{defense2}
</DefenseForAnswer2>
<Criteria>
<Criterion1>Relevance to their task</Criterion1>
<Criterion2>Accuracy and credible sources</Criterion2>
<Criterion3>Depth and completeness</Criterion3>
<Criterion4>Clarity and logical flow</Criterion4>
<Criterion5>Reasoning and factual support</Criterion5>
<Criterion6>Effectiveness addressing opponent</Criterion6>
</Criteria>
For each Criterion in the Criteria, briefly analyze
the performance of the two answers based on the
DefenseForAnswer1 and DefenseForAnswer2, then
render a score between 0 and 10.
Respond as follows:
<Criterion1>
<Analysis>
Answer 1: [Brief thoughts about Answer 1's performance on the Criterion, based on the DefenseForAnswer1]
Answer 2: [Brief thoughts about Answer 2's performance on the Criterion, based on the DefenseForAnswer2]
</Analysis>
<Scores>
<Answer1Score>[score between 0 and 10]</Answer1Score>
<Answer2Score>[score between 0 and 10]</Answer2Score>
</Scores>
</Criterion1>
<Criterion2>
<Analysis>
Answer 1: [Brief thoughts about Answer 1's performance on the Criterion, based on the DefenseForAnswer1]
Answer 2: [Brief thoughts about Answer 2's performance on the Criterion, based on the DefenseForAnswer2]
</Analysis>
<Scores>
<Answer1Score>[score between 0 and 10]</Answer1Score>
<Answer2Score>[score between 0 and 10]</Answer2Score>
</Scores>
</Criterion2>
...
Notice that the prompt now uses XML tags to structure the instructions, that it asks the model to provide reasoning for each criterion before scoring, and that it gives the model a clear format for its response that reinforces analysis before scoring for each criterion.
I’ve also changed the scale from 1-20 to 0-10, and removed the instruction to sum the scores, since the aggregation is handled in the code.
My implementation of SAMRE and Baseline
Okay, with that criticism out of the way, let’s implement the SAMRE and Baseline methods. My Python implementation of both appears in the code listing at the top of this post. I’ve modified the “scorer prompt” within the SAMRE method to use the same format as the Baseline method, and I’ve made a few adjustments to the prompts so they can handle conversation data as input (which is what the MT-bench dataset contains by default) instead of question-answer pairs. For example, instead of asking the model to score two answers to a question, the model is asked to score two “candidates” in a debate. I’ve also modified the structure of the prompts to use XML tags throughout; a small sketch of the conversation format follows.
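For illustration only (this helper is hypothetical and not part of my implementation, which passes the conversation objects directly into the prompt), an MT-bench conversation is a list of dicts, assumed here to have 'role' and 'content' keys, and could be rendered as plain text like this:

# Hypothetical helper: render an MT-bench conversation (a list of role/content dicts) as text
def render_conversation(conversation):
    return "\n".join(f"{turn['role'].upper()}: {turn['content']}" for turn in conversation)

# Made-up example in the MT-bench conversation format
example = [
    {"role": "user", "content": "What are some business etiquette norms in Japan?"},
    {"role": "assistant", "content": "Bowing, exchanging business cards with both hands, ..."},
]
print(render_conversation(example))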
Load the MT-bench dataset
Next I will read in the MT-bench dataset from disk and prepare it for evaluation.
import json
import pandas as pd
# Read the MT-bench dataset from disk
data_list = []
with open('mt_bench_human_judgments.jsonl', 'r') as file:
    for line in file:
        data = json.loads(line)
        data_list.append(data)

df = pd.DataFrame(data_list)

# Take a random sample of 615 rows, which
# with this seed yields 500 unique conversations
df = df.sample(n=615, random_state=42)

# Get unique combinations of conversations and models
df['conversation_a_str'] = df['conversation_a'].astype(str)
df['conversation_b_str'] = df['conversation_b'].astype(str)
df = df.drop_duplicates(subset=['model_a', 'model_b', 'conversation_a_str', 'conversation_b_str'])
print(f"Unique conversations: {len(df)}")
df.head()
Unique conversations: 500
| | split | question_id | model_a | model_b | winner | judge | conversation_a | conversation_b | turn | conversation_a_str | conversation_b_str |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 3250 | human | 159 | alpaca-13b | vicuna-13b-v1.2 | model_b | expert_2 | [{'content': 'What are some business etiquette... | [{'content': 'What are some business etiquette... | 2 | [{'content': 'What are some business etiquette... | [{'content': 'What are some business etiquette... |
| 596 | human | 95 | llama-13b | gpt-3.5-turbo | model_b | expert_39 | [{'content': 'Please assume the role of an Eng... | [{'content': 'Please assume the role of an Eng... | 2 | [{'content': 'Please assume the role of an Eng... | [{'content': 'Please assume the role of an Eng... |
| 2236 | human | 134 | gpt-4 | llama-13b | model_a | expert_42 | [{'content': 'Given the following data, identi... | [{'content': 'Given the following data, identi... | 2 | [{'content': "Given the following data, identi... | [{'content': "Given the following data, identi... |
| 1064 | human | 106 | claude-v1 | alpaca-13b | model_a | expert_10 | [{'content': 'Each problem consists of three s... | [{'content': 'Each problem consists of three s... | 1 | [{'content': 'Each problem consists of three s... | [{'content': 'Each problem consists of three s... |
| 315 | human | 89 | gpt-3.5-turbo | llama-13b | model_a | expert_26 | [{'content': 'Help me construct a catchy, yet ... | [{'content': 'Help me construct a catchy, yet ... | 2 | [{'content': 'Help me construct a catchy, yet ... | [{'content': 'Help me construct a catchy, yet ... |
Evaluate MT-bench with SAMRE and Baseline methods
Next, for each conversation in the dataset, I will evaluate the two models using both the SAMRE and Baseline methods.
import asyncio
from asyncio import Semaphore
import logging
import os
import hashlib
logging.basicConfig(level=logging.WARNING)
async def evaluate_conversation_pair(row, evaluators, semaphore, idx, total):
    """Evaluate a single conversation pair with all evaluators"""
    async with semaphore:
        # Generate pair_id from conversation hash
        pair_id = f"{row['model_a']}_{row['model_b']}_{hashlib.sha256(str(row['conversation_a']).encode()).hexdigest()[:12]}"
        checkpoint_file = f'checkpoints/{pair_id}.json'

        # Return existing checkpoint if available
        if os.path.exists(checkpoint_file):
            logging.info(f"Found existing checkpoint file for {pair_id}")
            return json.load(open(checkpoint_file))

        logging.info(f"No checkpoint file found for {pair_id}")
        result = {
            'model_a': row['model_a'],
            'model_b': row['model_b'],
            'human_winner': row['winner'],
            'pair_id': pair_id
        }

        # Evaluate with each evaluator
        for name, evaluator in evaluators.items():
            try:
                eval_result = await evaluator.evaluate(row['conversation_a'], row['conversation_b'])
                result[f'{name}_winner'] = eval_result['winner']

                # Add common evaluation details
                result.update({f'{name}_{k}': eval_result[k] for k in ['average_scores', 'rounds', 'score_history']})

                # Add mode-specific details
                if evaluator.mode == "samre":
                    result.update({
                        'samre_argument_history': eval_result['argument_history'],
                        'samre_feedback_history': eval_result['feedback_history']
                    })
                elif evaluator.mode == "baseline":
                    result['baseline_full_response'] = eval_result['full_response']

            except Exception as e:
                print(f"Error with {name} evaluator on row {idx}: {str(e)}")
                result[f'{name}_winner'] = None

        # Save checkpoint
        os.makedirs('checkpoints', exist_ok=True)
        json.dump(result, open(checkpoint_file, 'w'))

        if (idx + 1) % 10 == 0:
            print(f"Processed {idx + 1}/{total} conversations")

        return result
async def evaluate_conversations_async(df, evaluators, semaphore_limit=3):
    """Evaluate conversations asynchronously"""
    os.makedirs('checkpoints', exist_ok=True)
    # Share one semaphore across all tasks so concurrency is actually capped
    semaphore = Semaphore(semaphore_limit)
    tasks = [evaluate_conversation_pair(row[1], evaluators, semaphore, idx, len(df))
             for idx, row in enumerate(df.iterrows())]
    return pd.DataFrame(await asyncio.gather(*tasks))
async def main():
    async with ModelEvaluator.create(mode="samre") as samre_evaluator, \
               ModelEvaluator.create(mode="baseline") as baseline_evaluator:
        return await evaluate_conversations_async(
            df,
            {'samre': samre_evaluator, 'baseline': baseline_evaluator},
            semaphore_limit=7
        )
# Run evaluation with checkpoint recovery
try:
    eval_df = await main()
except Exception as e:
    print(f"Error during evaluation: {str(e)}\nRecovering from checkpoints...")
    eval_df = pd.DataFrame([json.load(open(f'checkpoints/{f}'))
                            for f in os.listdir('checkpoints')
                            if f.endswith('.json')])
finally:
    eval_df.to_csv('eval_df.csv', index=False)

eval_df.head()
Results
Finally, I will evaluate the performance of the methods by looking at how well each one agreed with the human judgments. I’ll use Krippendorff’s alpha, since it is a robust measure of agreement that can handle non-binary ratings and missing data (among other things). A quick toy example of the measure is sketched below, followed by the full calculation.
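For intuition, Krippendorff’s alpha is defined as alpha = 1 - D_o / D_e, where D_o is the observed disagreement and D_e the disagreement expected by chance. Here is a tiny toy example of my own (not part of the analysis) showing that perfect agreement yields an alpha of 1, and that flipping a couple of ratings pulls it below 1:

import numpy as np
from krippendorff import alpha

# Two raters, six items, nominal labels encoded as integers (rows = raters, columns = items)
perfect_agreement = np.array([[0, 1, 2, 0, 1, 2],
                              [0, 1, 2, 0, 1, 2]])
print(alpha(reliability_data=perfect_agreement, level_of_measurement='nominal'))  # 1.0

# Flip two of the second rater's labels: observed disagreement rises, so alpha drops below 1
partial_agreement = np.array([[0, 1, 2, 0, 1, 2],
                              [0, 1, 2, 0, 2, 1]])
print(alpha(reliability_data=partial_agreement, level_of_measurement='nominal'))  # < 1.0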
from krippendorff import alpha
import numpy as np
from sklearn.preprocessing import LabelEncoder
def calculate_agreement(df, rater1_col, rater2_col):
    """
    Calculate Krippendorff's alpha between two raters.

    Args:
        df: DataFrame containing the ratings
        rater1_col: Name of first rater's column
        rater2_col: Name of second rater's column

    Returns:
        float: Krippendorff's alpha score
    """
    # Create label encoder
    le = LabelEncoder()

    # Combine all unique values from both columns (after filling missing ratings)
    all_values = pd.concat([df[rater1_col].fillna('missing'), df[rater2_col].fillna('missing')]).unique()
    le.fit(all_values)

    # Transform the ratings to numeric values
    ratings1 = le.transform(df[rater1_col].fillna('missing'))
    ratings2 = le.transform(df[rater2_col].fillna('missing'))

    # Reshape data for the krippendorff alpha calculation:
    # each row represents one rater, each column one item
    reliability_data = np.vstack([ratings1, ratings2])

    return alpha(reliability_data=reliability_data, level_of_measurement='nominal')
# Calculate agreement scores
human_baseline_agreement = calculate_agreement(eval_df, 'human_winner', 'baseline_winner')
human_samre_agreement = calculate_agreement(eval_df, 'human_winner', 'samre_winner')

# Create a DataFrame with the agreement scores
agreement_df = pd.DataFrame({
    'Evaluator Pair': ['Human-Baseline', 'Human-SAMRE'],
    'Krippendorff Alpha': [human_baseline_agreement, human_samre_agreement]
})

# Round the scores to 3 decimal places
agreement_df['Krippendorff Alpha'] = agreement_df['Krippendorff Alpha'].round(3)

# Display the DataFrame
agreement_df
| | Evaluator Pair | Krippendorff Alpha |
|---|---|---|
| 0 | Human-Baseline | 0.364 |
| 1 | Human-SAMRE | 0.215 |
Although neither method yielded particularly strong agreement with the human judges, the Baseline method agreed with them substantially better than the SAMRE method did: a Krippendorff’s alpha of 0.364 vs. 0.215. This is the opposite of the paper’s finding, which reported a gain of 6-8% for SAMRE over its baseline. A sketch of how one could check the stability of this gap follows.
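If you want to check that a gap like this is not just sampling noise, one option (not something I ran for this post, just a sketch using the eval_df and calculate_agreement defined above) is a simple paired bootstrap over the 500 conversations:

# Hypothetical paired bootstrap to gauge the stability of the alpha gap (not run for this post)
import numpy as np

rng = np.random.default_rng(42)
diffs = []
for _ in range(1000):
    # Resample conversations with replacement and recompute both agreement scores on the same sample
    sample = eval_df.sample(n=len(eval_df), replace=True, random_state=int(rng.integers(1_000_000)))
    diffs.append(calculate_agreement(sample, 'human_winner', 'baseline_winner')
                 - calculate_agreement(sample, 'human_winner', 'samre_winner'))

print(np.percentile(diffs, [2.5, 97.5]))  # 95% interval for the Baseline-minus-SAMRE alpha gap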