import zeroeval as ze

# Initialise the ZeroEval SDK — presumably reads API credentials from the
# environment; confirm against deployment configuration.
ze.init()
# Pull the hosted "spam_detection" dataset by name (creates or fetches it
# on the ZeroEval backend per the SDK's pull semantics — TODO confirm).
dataset = ze.Dataset.pull("spam_detection")
@ze.task(outputs=["prediction", "confidence"])
def detect_spam(row):
    """Heuristic spam classifier: counts known spam phrases in the row's text.

    Returns a dict with a 0/1 ``prediction`` and a ``confidence`` clamped
    to the [0.05, 0.95] range.
    """
    message = row["text"].upper()
    indicators = ["FREE", "WIN", "URGENT", "Click here", "Act now"]
    hits = 0
    for phrase in indicators:
        if phrase.upper() in message:
            hits += 1
    # Map the raw hit count onto a bounded confidence score; 2+ hits
    # pushes confidence past the 0.5 decision threshold.
    confidence = min(0.95, max(0.05, hits / 3 + 0.3))
    return {
        "prediction": 1 if confidence > 0.5 else 0,
        "confidence": confidence,
    }
@ze.evaluation(mode="row", outputs=["correct", "true_positive", "false_positive"])
def binary_metrics(row):
    """Per-row binary classification flags (correct / TP / FP) as 0/1 ints."""
    predicted = row["prediction"]
    actual = row["label"]
    is_correct = predicted == actual
    is_true_positive = predicted == 1 and actual == 1
    is_false_positive = predicted == 1 and actual == 0
    return {
        "correct": int(is_correct),
        "true_positive": int(is_true_positive),
        "false_positive": int(is_false_positive),
    }
@ze.column_metric(outputs=["accuracy", "precision"])
def aggregate_metrics(dataset):
    """Dataset-level accuracy and precision from per-row binary flags.

    Reads the ``correct`` / ``true_positive`` / ``false_positive`` columns
    produced by ``binary_metrics``; rows missing a flag count as 0.

    Fix: guard against an empty dataset — the original raised
    ZeroDivisionError computing accuracy, while precision was already
    guarded. Both now fall back to 0 when their denominator is zero.
    """
    tp = sum(row.get("true_positive", 0) for row in dataset)
    fp = sum(row.get("false_positive", 0) for row in dataset)
    correct = sum(row.get("correct", 0) for row in dataset)
    total = len(dataset)
    accuracy = correct / total if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    return {"accuracy": accuracy, "precision": precision}
# Run the experiment: execute the task over every dataset row, then attach
# per-row evaluations and the dataset-level aggregate metrics.
run = dataset.run(detect_spam)
run.eval([binary_metrics])
run.column_metrics([aggregate_metrics])
# Repeat for stability analysis — per the inline note this yields 3 total runs.
all_runs = run.repeat(3) # Creates 3 total runs
# NOTE(review): `all_runs` is never used below — the summary prints only the
# original run's metrics, not an aggregate across the repeats; confirm intent.
print(f"Final accuracy: {run.metrics.get('accuracy', 0):.2%}")