from dataclasses import dataclass, field
from typing import List, Tuple, Dict
import os
from openai import OpenAI
import logging
# Prompt templates for the three SAMRE roles: advocate defense, judge
# feedback, and judge scoring.  Placeholders are filled via str.format().
PROMPTS = {
    "defend_prompt": """
You're an eloquent advocate (Advocate {advocate_id}) defending this answer: ```{answer}``` to the question: ```{question}```. Your opponent defends this answer: ```{opponent_answer}```.
Use the latest feedback, opponent's last argument, and your team's previous arguments to improve your case.
Latest feedback: ```{feedback}```
Opponent's last argument: ```{opponent_argument}```
Your team's previous arguments: ```{team_arguments}```
Respond in under 80 words.
Your defense:
""",
    "judge_prompt": """
You're a fair, impartial judge in a debate on the following question: ```{question}```
Answer 1: ```{answer1}```
Answer 2: ```{answer2}```
Your goal is to provide feedback that will help advocates improve and differentiate their arguments more clearly.
Current round: ```{current_round}```
Total rounds: ```{total_rounds}```
Previous scores: ```{previous_scores}```
Defense for 1st answer: ```{defense1}```
Defense for 2nd answer: ```{defense2}```
Provide specific, constructive feedback in under 50 words:
""",
    "score_prompt": """You're a critical, impartial judge scoring these debate responses on a scale of 1-20 for each criterion. Return scores as tuple (score1, score2).
Question: ```{question}```
Answer 1: ```{answer1}```
Answer 2: ```{answer2}```
Defense 1: ```{defense1}```
Defense 2: ```{defense2}```
Criteria:
1. Relevance to question
2. Accuracy and credible sources
3. Depth and completeness
4. Clarity and logical flow
5. Reasoning and factual support
6. Effectiveness addressing opponent
Return final scores as tuple, e.g. (18, 9)""",
}
@dataclass
class Memory:
    """Per-debate history shared by the advocates and judge."""
    # List of (arg1, arg2) tuples: one defense per advocate per round.
    arguments: List[Tuple[str, str]] = field(default_factory=list)
    # List of (score1, score2) tuples: judge's scores per round.
    scores: List[Tuple[float, float]] = field(default_factory=list)
    # List of feedback strings: judge's feedback per round.
    feedback: List[str] = field(default_factory=list)
class SAMREEvaluator:
    """Single Advocate Multi-Round Evaluation (SAMRE).

    Two LLM "advocates" each defend one of two candidate answers over up to
    ``max_rounds`` debate rounds.  After each round an LLM judge provides
    feedback and scores both answers on a 1-20 scale; the debate ends early
    once the same answer leads two rounds in a row.  The winner is the
    answer with the higher average score across all rounds played.
    """

    def __init__(self, model="gpt-4o-mini"):
        """Initialize the SAMRE evaluator.

        Args:
            model: Name of the OpenAI chat model used for all completions.
        """
        self.model = model
        # API key comes from the environment; constructing the client does no I/O.
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.max_rounds = 4
        self.memory = Memory()
        self.defend_prompt = PROMPTS["defend_prompt"]
        self.judge_prompt = PROMPTS["judge_prompt"]
        self.score_prompt = PROMPTS["score_prompt"]
        # Set up a logger that word-wraps output at 80 columns; guard against
        # adding duplicate handlers when several evaluators are created.
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        if not self.logger.handlers:
            handler = logging.StreamHandler()

            class WrapFormatter(logging.Formatter):
                def format(self, record):
                    import textwrap
                    message = super().format(record)
                    # Wrap each line independently so existing newlines survive.
                    return '\n'.join(textwrap.fill(line, width=80)
                                     for line in message.split('\n'))

            formatter = WrapFormatter('%(message)s')  # message-only, wrapped
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def get_completion(self, prompt: str) -> str:
        """Get a completion from the OpenAI API for a single system prompt."""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "system", "content": prompt}],
            temperature=0  # deterministic judging/defenses
        )
        return response.choices[0].message.content

    def defend_answer(self, question: str, answer: str, opponent_answer: str,
                      advocate_id: int, feedback: str = "", opponent_argument: str = "",
                      team_arguments: List[str] = None) -> str:
        """Generate one advocate's defense of ``answer`` against ``opponent_answer``.

        Args:
            question: The debate question.
            answer: The answer this advocate defends.
            opponent_answer: The answer the other advocate defends.
            advocate_id: 1 or 2; interpolated into the prompt.
            feedback: Judge feedback from the previous round, if any.
            opponent_argument: Opponent's defense from the previous round.
            team_arguments: This advocate's defenses from earlier rounds.

        Returns:
            The advocate's defense text.
        """
        # Avoid a mutable default argument.
        if team_arguments is None:
            team_arguments = []

        prompt = self.defend_prompt.format(
            advocate_id=advocate_id,
            answer=answer,
            question=question,
            opponent_answer=opponent_answer,
            feedback=feedback,
            opponent_argument=opponent_argument,
            team_arguments="\n".join(team_arguments)
        )
        return self.get_completion(prompt)

    def judge_debate(self,
                     question: str,
                     answer1: str,
                     answer2: str,
                     defense1: str,
                     defense2: str,
                     current_round: int) -> Tuple[str, Tuple[float, float]]:
        """Judge the debate between two answers.

        Returns:
            A ``(feedback, (score1, score2))`` tuple.  Scores fall back to
            ``(10.0, 10.0)`` when the model's reply cannot be parsed.
        """
        # Get feedback
        feedback_prompt = self.judge_prompt.format(
            question=question,
            answer1=answer1,
            answer2=answer2,
            current_round=current_round,
            total_rounds=self.max_rounds,
            previous_scores=self.memory.scores,
            defense1=defense1,
            defense2=defense2
        )
        feedback = self.get_completion(feedback_prompt)

        # Get scores
        score_prompt = self.score_prompt.format(
            question=question,
            answer1=answer1,
            answer2=answer2,
            defense1=defense1,
            defense2=defense2
        )
        score_response = self.get_completion(score_prompt)

        # More robust score parsing: the model is asked for "(s1, s2)" but may
        # embed it in prose, so search rather than parse the whole reply.
        try:
            import re
            # Try to find any tuple-like pattern in the response
            tuple_match = re.search(r'\((\d+\.?\d*)\s*,\s*(\d+\.?\d*)\)', score_response)
            if tuple_match:
                scores = (float(tuple_match.group(1)),
                          float(tuple_match.group(2)))
            else:
                # Fallback: take the first two bare numbers in the response
                numbers = re.findall(r'\d+\.?\d*', score_response)
                if len(numbers) >= 2:
                    scores = (float(numbers[0]), float(numbers[1]))
                else:
                    raise ValueError("Could not find two scores in response")
        except Exception as e:
            self.logger.error(f"Score parsing error: {e}")
            self.logger.error(f"Raw score response: {score_response}")
            scores = (10.0, 10.0)  # neutral fallback keeps the debate going

        return feedback, scores

    def evaluate(self,
                 question: str,
                 answer1: str,
                 answer2: str) -> Dict:
        """Evaluate the debate between two answers.

        Runs up to ``max_rounds`` rounds, stopping early on convergence.

        Returns:
            Dict with keys ``winner``, ``average_scores``, ``rounds``,
            ``score_history``, ``argument_history`` and ``feedback_history``.
        """
        self.logger.info("\n=== Starting SAMRE Evaluation ===\n")

        for round_num in range(self.max_rounds):
            self.logger.info(f"\n--- Round {round_num + 1} ---")
            self._run_debate_round(question, answer1, answer2, round_num)

            if self._has_scores_converged(round_num):
                self.logger.info("\nScores have converged - ending debate early.")
                break

        return self._prepare_results()

    def _run_debate_round(self,
                          question: str,
                          answer1: str,
                          answer2: str,
                          round_num: int) -> Tuple[float, float]:
        """Execute a single round of debate between advocates."""
        # Get advocate defenses
        defenses = self._get_advocate_defenses(question, answer1, answer2)
        self.memory.arguments.append(defenses)

        # Get judge's feedback and scores
        feedback, scores = self.judge_debate(
            question, answer1, answer2, defenses[0], defenses[1], round_num + 1
        )

        # Store and display results
        self._store_round_results(feedback, scores)
        self._display_round_results(defenses, feedback, scores)

        return scores

    def _get_advocate_defenses(self,
                               question: str,
                               answer1: str,
                               answer2: str) -> Tuple[str, str]:
        """Get defenses from both advocates for the current round."""
        defense1 = self.defend_answer(
            question, answer1, answer2, 1,
            feedback=self.memory.feedback[-1] if self.memory.feedback else "",
            opponent_argument=self.memory.arguments[-1][1] if self.memory.arguments else "",
            team_arguments=[args[0] for args in self.memory.arguments]
        )

        defense2 = self.defend_answer(
            question, answer2, answer1, 2,
            feedback=self.memory.feedback[-1] if self.memory.feedback else "",
            opponent_argument=self.memory.arguments[-1][0] if self.memory.arguments else "",
            team_arguments=[args[1] for args in self.memory.arguments]
        )

        return (defense1, defense2)

    def _store_round_results(self,
                             feedback: str,
                             scores: Tuple[float, float]) -> None:
        """Store feedback and scores from the round."""
        self.memory.feedback.append(feedback)
        self.memory.scores.append(scores)

    def _display_round_results(self,
                               defenses: Tuple[str, str],
                               feedback: str,
                               scores: Tuple[float, float]) -> None:
        """Display the results of the current round."""
        self.logger.info(f"\nAdvocate 1's defense:\n{defenses[0]}")
        self.logger.info(f"\nAdvocate 2's defense:\n{defenses[1]}")
        self.logger.info(f"\nJudge's feedback:\n{feedback}")
        self.logger.info(f"Scores for this round: Answer 1 = {scores[0]}, Answer 2 = {scores[1]}")

    def _has_scores_converged(self,
                              round_num: int) -> bool:
        """Check if scores have converged (same winner twice in a row).

        The product of the last two score differences is positive exactly
        when the same answer led both rounds; a tie (difference 0) never
        counts as convergence.
        """
        if round_num > 0:
            prev_diff = self.memory.scores[-2][0] - self.memory.scores[-2][1]
            curr_diff = self.memory.scores[-1][0] - self.memory.scores[-1][1]
            return (prev_diff * curr_diff) > 0
        return False

    def _prepare_results(self) -> Dict:
        """Prepare the final results dictionary."""
        avg_scores = tuple(
            sum(scores[i] for scores in self.memory.scores) / len(self.memory.scores)
            for i in range(2)
        )
        return {
            # NOTE: an exact tie goes to answer 2.
            "winner": 1 if avg_scores[0] > avg_scores[1] else 2,
            "average_scores": avg_scores,
            "rounds": len(self.memory.scores),
            "score_history": self.memory.scores,
            "argument_history": self.memory.arguments,
            "feedback_history": self.memory.feedback
        }
I’ve been doing a lot of work with LLM-based evaluations lately, and I’ve been thinking about how to improve the quality of these evaluations.
I recently came across a paper published on arXiv in October 2024, Adversarial Multi-Agent Evaluation of Large Language Models through Iterative Debates, which explores different approaches for using LLMs to evaluate the quality of two alternative responses using methods inspired by the judicial process. (Note: Using LLMs to evaluate responses is sometimes called “LLM as Judge” evaluation.)
One of the methods explored in the paper is called Single Advocate Multi-Round Evaluation (SAMRE). Briefly, the SAMRE method evaluates the quality of different LLM outputs through an iterative debate process.
- For each answer being evaluated, there is one advocate who defends it, along with a judge who provides feedback and scoring across multiple criteria (like relevance, accuracy, and clarity).
- The debate runs for up to 4 rounds, with advocates refining their arguments based on the judge’s feedback each round. (Note: The process can end early if the same answer wins twice in a row, saving computational resources.)
- Each round, the judge scores both answers on a scale of 1-20, and the final winner is determined by the highest average score across all rounds.
This approach is interesting because it allows the LLM to make a strong case for each answer (as its arguments for each answer are refined over multiple rounds), and then judge the quality of the arguments before picking a winner.
In this blog post, I’ll show how to implement the SAMRE method in python, and I’ll use a test case to demonstrate how it works – printing the results of each round in the debate, as well as the final winner and scores.
Implementation
The implementation is straightforward. We define a SAMREEvaluator
class that initializes the evaluator with a model, and then defines methods for each step of the SAMRE process.
We also define a Memory
class to store the results of each round, and a PROMPTS
dictionary to store the prompts for the different steps of the SAMRE process.
A logger is used to print the results of each round in the debate.
Demonstration
Here’s a demonstration of the SAMRE method using a test case wherein the question is “What are the implications of quantum computing for cybersecurity?” to which two responses were generated by GPT-4o.
# Initialize evaluator
evaluator = SAMREEvaluator(model="gpt-4o")

# Define the question and answers
question = "What are the implications of quantum computing for cybersecurity? Respond in 80 words or less."

answer1 = "Quantum computing poses a major threat to cybersecurity by potentially breaking widely used encryption methods, such as RSA and ECC, through algorithms like Shor's. It could render current cryptographic protections obsolete, exposing sensitive data. However, it also enables quantum-resistant cryptography and advanced security techniques, such as quantum key distribution (QKD), which offer near-unbreakable encryption. Preparing for this shift requires adopting post-quantum cryptography and upgrading current systems to remain secure against future quantum threats."

answer2 = "Quantum computing could revolutionize cybersecurity by enabling faster detection of cyber threats and improving optimization in defensive measures. However, it also risks undermining blockchain integrity and breaking current hashing algorithms, threatening digital signatures and data integrity. Quantum algorithms may facilitate advanced cyberattacks, making traditional defenses inadequate. Organizations must prioritize research in quantum-safe solutions, such as lattice-based cryptography, while leveraging quantum technology for enhanced threat modeling and secure communication systems, ensuring a balance between risks and opportunities."

# Run the debate and report the outcome
result = evaluator.evaluate(question, answer1, answer2)

print(f"Winner: Answer {result['winner']}")
print(f"Average scores: {result['average_scores']}")
print(f"Number of rounds: {result['rounds']}")
=== Starting SAMRE Evaluation ===
--- Round 1 ---
Advocate 1's defense:
Quantum computing threatens cybersecurity by potentially breaking encryption
like RSA and ECC with Shor's algorithm, risking data exposure. However, it also
offers solutions like quantum-resistant cryptography and quantum key
distribution (QKD) for near-unbreakable security. Preparing for this shift
involves adopting post-quantum cryptography and upgrading systems to counter
future quantum threats, ensuring data remains secure. This dual nature
highlights both the risks and the potential for enhanced security in the quantum
era.
Advocate 2's defense:
Quantum computing's dual impact on cybersecurity includes both risks and
opportunities. While it threatens current encryption and blockchain integrity,
it also offers enhanced threat detection and optimization of defenses. Unlike
the narrow focus on encryption threats, our perspective emphasizes a balanced
approach, advocating for quantum-safe solutions like lattice-based cryptography
and leveraging quantum technology for secure communication. This comprehensive
view ensures organizations are prepared for both the challenges and benefits of
quantum advancements.
Judge's feedback:
Answer 1: Clarify how quantum-resistant cryptography differs from current
methods and provide examples. Emphasize the urgency of transitioning to post-
quantum cryptography. Highlight the potential timeline for quantum threats to
become significant.
Answer 2: Expand on how quantum computing enhances threat detection and
optimization. Provide specific examples of quantum-safe solutions. Discuss the
timeline for when quantum computing might realistically impact cybersecurity.
Scores for this round: Answer 1 = 18.0, Answer 2 = 17.0
--- Round 2 ---
Advocate 1's defense:
Quantum computing threatens cybersecurity by potentially breaking current
encryption methods like RSA and ECC with Shor's algorithm, risking data
exposure. Quantum-resistant cryptography, such as lattice-based methods, and
quantum key distribution (QKD) offer near-unbreakable security. The urgency to
transition to post-quantum cryptography is critical as quantum threats could
emerge within the next decade. This dual nature highlights both the risks and
the potential for enhanced security, necessitating immediate upgrades to
safeguard against future quantum threats.
Advocate 2's defense:
Quantum computing's dual impact on cybersecurity includes both risks and
opportunities. It enhances threat detection and optimizes defenses, while also
threatening current encryption and blockchain integrity. Unlike a narrow focus
on encryption threats, our perspective emphasizes a balanced approach,
advocating for quantum-safe solutions like lattice-based cryptography and
leveraging quantum technology for secure communication. Organizations must
prioritize research in these areas to ensure preparedness for both the
challenges and benefits of quantum advancements, with significant impacts
expected within the next decade.
Judge's feedback:
Answer 1: Emphasize the urgency of transitioning to post-quantum cryptography by
providing a timeline for potential quantum threats. Highlight specific quantum-
resistant cryptographic methods beyond QKD to strengthen your argument. Clarify
how organizations can practically implement these solutions to prepare for
future challenges.
Answer 2: Expand on how quantum computing can specifically enhance threat
detection and optimization. Provide examples of quantum-safe solutions and their
current development status. Clarify how organizations can balance leveraging
quantum technology while mitigating risks, offering practical steps for
implementation.
Scores for this round: Answer 1 = 18.0, Answer 2 = 17.0
Scores have converged - ending debate early.
Winner: Answer 1
Average scores: (18.0, 17.0)
Number of rounds: 2