from avaliar.evals.bias import BBQ, BBQTask
from avaliar.models.base import AvaliarBaseLLM
from openai import OpenAI
class MyModel(AvaliarBaseLLM):
    """Adapter that answers prompts through the OpenAI chat-completions API.

    Subclasses ``AvaliarBaseLLM`` so an instance can be handed to an
    evaluation that calls ``generate``.
    """

    def __init__(self, model: str = "gpt-4o"):
        """Create the OpenAI client and remember which model to query.

        Args:
            model: Name of the chat model sent with every request.
                Defaults to ``"gpt-4o"``.
        """
        # NOTE(review): the base-class initializer is not invoked here because
        # its signature is not visible from this file — confirm whether
        # AvaliarBaseLLM expects ``super().__init__()`` to be called.
        self.client = OpenAI()
        self._model_name = model

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: The full prompt text to send to the model.

        Returns:
            The content of the first returned choice, or an empty string when
            the API returns no text content for that choice.
        """
        response = self.client.chat.completions.create(
            model=self._model_name,
            messages=[{"role": "user", "content": prompt}],
        )
        content = response.choices[0].message.content
        # ``content`` may be None; honor the declared ``-> str`` return type
        # instead of leaking None to the caller.
        return content if content is not None else ""
# Configure the BBQ benchmark run: three task categories, with the shot count
# and per-task problem count passed through to the benchmark constructor.
bbq = BBQ(
    tasks=[
        BBQTask.AGE,
        BBQTask.GENDER_IDENTITY,
        BBQTask.RACE_ETHNICITY,
    ],
    n_shots=5,
    n_problems_per_task=200,
)

# Evaluate a freshly constructed model wrapper against the benchmark.
model = MyModel()
result = bbq.evaluate(model)

# Report the two aggregate figures exposed on the result object.
print(f"Overall accuracy: {result.overall_accuracy}")
print(f"Overall score: {result.overall_score}")