As AI agents become more prevalent in business operations, companies need a standardized way to:
- Evaluate agent performance across different business functions
- Compare different agents objectively
- Ensure agents meet company-specific requirements
- Validate agent capabilities in sandbox environments
Agent categories covered:

- Sales Development (SDR)
- Marketing
- Customer Support
- Business Analysis
- Recruitment
- General Purpose Tasks
LLM-as-judge evaluation:

- Objective evaluation using language models
- Consistent scoring across submissions
- Detailed feedback on performance
- Weighted scoring based on task importance (see the scoring sketch after these lists)
Data sources:

- Synthetic business data generation
- Sandbox SaaS integrations (Salesforce, HubSpot, etc.)
- Custom data source support
Customization options:

- Default industry-standard benchmarks
- Company-specific evaluation criteria
- Custom weighting systems
- Role-specific metrics
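
To make the weighted-scoring idea concrete, here is a minimal, framework-independent sketch of how per-criterion judge scores might be combined into one result; the criterion names and weights are illustrative, not the package's actual defaults.

```python
# Illustrative only: combine per-criterion judge scores (0-1) into one
# weighted result. Criterion names and weights are example values, not
# the framework's defaults.
criterion_weights = {
    "response_quality": 0.4,
    "process_adherence": 0.3,
    "task_completion": 0.3,
}

judge_scores = {
    "response_quality": 0.85,
    "process_adherence": 0.70,
    "task_completion": 0.90,
}

total_weight = sum(criterion_weights.values())
overall_score = sum(
    weight * judge_scores[name] for name, weight in criterion_weights.items()
) / total_weight

print(f"Overall weighted score: {overall_score:.2f}")  # 0.82
```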
- **Setup**
  - Choose agent category
  - Configure data sources
  - Define evaluation criteria

- **Evaluation**

  ```mermaid
  graph LR
      A[AI Agent] --> B[Task Execution]
      B --> C[Data Collection]
      C --> D[LLM Evaluation]
      D --> E[Weighted Scoring]
  ```

- **Verification**
  - Independent result validation
  - Performance consistency checks
  - Sandbox environment testing
- Install the package:

  ```bash
  pip install git+https://github.com/ninajlu/ai-agent-benchmark.git
  ```
- Create a benchmark configuration:

  ```python
  from benchmark.core import BenchmarkRunner
  from benchmark.sources.synthetic import SyntheticDataSource
  from benchmark.tasks.sales_analysis import SalesAnalysisTask

  # Configure data source
  data_source = SyntheticDataSource({
      "employee_count": 200,
      "num_opportunities": 300
  })

  # Create evaluation task
  task = SalesAnalysisTask(
      name="lead_qualification",
      category="sales_development",
      custom_criteria="""
      Additional evaluation criteria:
      - Industry knowledge
      - Process compliance
      """
  )

  # Initialize benchmark runner
  runner = BenchmarkRunner(
      data_sources=[data_source],
      tasks=[task],
      judge_llm=your_llm_client
  )
  ```
- Run evaluation:

  ```python
  import json

  results = await runner.run_benchmark(your_agent)
  print(json.dumps(results, indent=2))
  ```
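
The `await` call above assumes you are already inside an event loop (for example, a notebook cell). In a standalone script, wrap the call with `asyncio.run`; the sketch below reuses the `runner` and `your_agent` objects from the configuration step.

```python
import asyncio
import json

async def main():
    # Reuses `runner` and `your_agent` from the configuration step above
    results = await runner.run_benchmark(your_agent)
    print(json.dumps(results, indent=2))

asyncio.run(main())
```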
Each category ships with default evaluation criteria that you can extend with your own:

```python
from benchmark.defaults.evaluation_criteria import BenchmarkDefaults

# Get default SDR criteria
sdr_criteria = BenchmarkDefaults.get_criteria("sales_development")

# Add custom criteria (import BenchmarkTask from the package as well;
# its exact module path is not shown here)
custom_sdr_task = BenchmarkTask(
    name="outbound_sequence",
    category="sales_development",
    custom_criteria="""
    - Email response rates
    - Meeting conversion
    - Pipeline contribution
    """
)

# Create marketing evaluation
marketing_task = BenchmarkTask(
    name="campaign_analysis",
    category="marketing",
    custom_criteria="""
    - ROI measurement
    - A/B testing strategy
    - Channel optimization
    """
)
```
Compare different AI agent vendors using standardized benchmarks and company-specific criteria.
```python
from benchmark.core import BenchmarkRunner
from benchmark.sources.salesforce import SalesforceDataSource

# Setup sandbox environments for each vendor
vendor_agents = {
    "vendor_a": VendorAAgent(),
    "vendor_b": VendorBAgent()
}

# Configure sandbox data source
sandbox_data = SalesforceDataSource({
    "username": "sandbox_user",
    "password": "sandbox_pass",
    "domain": "test"
})

# Run benchmarks for each vendor
async def compare_vendors():
    results = {}
    for vendor_name, agent in vendor_agents.items():
        runner = BenchmarkRunner(
            data_sources=[sandbox_data],
            tasks=default_sdr_tasks,
            judge_llm=your_llm_client
        )
        results[vendor_name] = await runner.run_benchmark(agent)
    return results
```
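
Once `compare_vendors()` has run, the per-vendor results can be ranked. The snippet below is a sketch that assumes each result dictionary exposes an aggregate score under an `overall_score` key; adjust the key to whatever the benchmark results actually contain.

```python
import asyncio

# Sketch: rank vendors by an aggregate score. The "overall_score" key is an
# assumption about the result structure, not a documented field.
vendor_results = asyncio.run(compare_vendors())

ranked = sorted(
    vendor_results.items(),
    key=lambda item: item[1].get("overall_score", 0),
    reverse=True,
)

for rank, (vendor, result) in enumerate(ranked, start=1):
    print(f"{rank}. {vendor}: {result.get('overall_score', 'n/a')}")
```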
You can also re-verify an agent's performance on a schedule:

```python
from benchmark.verification import VerificationRunner

# Setup continuous evaluation
verification = VerificationRunner(benchmark_runner)

# Run periodic verification
async def verify_agent_performance(agent):
    results = await verification.verify_submission(
        submission_id="production_agent_v1",
        agent=agent,
        sample_size=5  # Number of tasks to verify
    )
    if not results["passed"]:
        alert_team(results["comparison"])
```
Compare AI agents in head-to-head competitions across standardized tasks.
```python
from benchmark.core import BenchmarkRunner, BattleMode
from benchmark.sources.salesforce import SalesforceDataSource
# Note: AgentBattle must also be imported; its module path is not shown here

# Setup battle environment
battle = AgentBattle(
    category="sales_development",
    max_rounds=10,
    environment="competitive"  # Agents can see/react to each other
)

# Register competing agents
battle.register_agent("agent_a", VendorAAgent())
battle.register_agent("agent_b", VendorBAgent())

# Configure sandbox arena
sandbox_data = SalesforceDataSource({
    "username": "sandbox_user",
    "password": "sandbox_pass",
    "domain": "test",
    "competitive_mode": True  # Enable agent interaction
})

# Run head-to-head battle
async def run_agent_battle():
    battle_results = await battle.run_competition(
        data_source=sandbox_data,
        metrics=[
            "leads_converted",
            "response_quality",
            "strategy_adaptation",  # How well agents adapt to opponent
            "resource_efficiency"
        ]
    )
    return battle.generate_report(battle_results)
```
Soon you'll be able to pit teams of specialized AI agents against each other in complex business scenarios.
```python
# Preview of upcoming team battles feature
team_alpha = AITeam({
    "name": "Alpha Squad",
    "agents": {
        "sdr": SDRAgent(),
        "marketer": MarketingAgent(),
        "analyst": AnalystAgent()
    },
    "team_strategy": "aggressive_growth"
})

team_beta = AITeam({
    "name": "Beta Force",
    "agents": {
        "sdr": SDRAgent(),
        "support": SupportAgent(),
        "closer": SalesCloserAgent()
    },
    "team_strategy": "customer_centric"
})

# Complex business scenario
scenario = BusinessScenario(
    name="Market Expansion",
    duration="30_days",
    objectives=[
        "Enter new market",
        "Generate qualified leads",
        "Convert to customers",
        "Maintain satisfaction"
    ],
    constraints={
        "budget": 100000,
        "resources": "limited",
        "market_conditions": "competitive"
    }
)

# Run team competition
battle_royale = TeamBattle(
    teams=[team_alpha, team_beta],
    scenario=scenario,
    collaboration_enabled=True  # Agents within team can collaborate
)

results = await battle_royale.execute()
```
- **Team Dynamics**
  - Inter-agent communication
  - Resource sharing
  - Strategy coordination
  - Role specialization

- **Complex Scenarios**
  - Multi-stage business challenges
  - Dynamic market conditions
  - Competitor reactions
  - Resource management

- **Advanced Metrics**
  - Team synergy scores
  - Adaptation capability
  - Strategy effectiveness
  - Resource utilization

- **Tournament System**
  - League rankings
  - Season competitions
  - Championship events
  - Team progression

- **Business Strategy Testing**
  - Test different team compositions
  - Evaluate strategy effectiveness
  - Identify optimal agent combinations

- **Training and Development**
  - Improve agent collaboration
  - Develop team strategies
  - Enhance adaptive capabilities

- **Market Simulation**
  - Model competitive scenarios
  - Test market entry strategies
  - Evaluate team performance

- **Process Optimization**
  - Identify efficient workflows
  - Optimize resource allocation
  - Improve team coordination
Stay tuned for the release of Team Battles, where AI agents collaborate and compete in the ultimate business simulation!
```python
from typing import Any, Dict

# Synthetic data example
synthetic = SyntheticDataSource({
    "employee_count": 200,
    "num_opportunities": 300,
    "total_sales_target": 10000000
})

# Salesforce sandbox example
salesforce = SalesforceDataSource({
    "username": "test@example.com",
    "password": "password123",
    "security_token": "token123",
    "domain": "test"
})

# Custom data source: subclass the package's DataSource base class
class CustomDataSource(DataSource):
    async def initialize(self):
        """Setup your data source"""
        pass

    async def get_data(self, query: Dict[str, Any]) -> Dict[str, Any]:
        """Implement your data retrieval logic"""
        pass
```
Company-specific criteria can be written in plain language and parsed into a structured judging prompt:

```python
# Define custom evaluation criteria
custom_criteria = """
Evaluate based on:
1. Response Quality
   - Grammar and tone
   - Technical accuracy
   - Completeness
2. Process Adherence
   - Following standard procedures
   - Using required tools
   - Documentation quality
"""

# Parse into structured criteria
criteria_parser = CriteriaParser(llm_client)
structured_criteria = await criteria_parser.parse_criteria(custom_criteria)

# Generate evaluation prompt
judge_prompt = await criteria_parser.generate_judge_prompt(structured_criteria)
```
```python
import json

# Submit for verification
verification = VerificationRunner(benchmark_runner)
result = await verification.verify_submission(
    submission_id="agent_v1",
    agent=your_agent
)

# Check verification status
if result["passed"]:
    print("Verification successful!")
    print(f"Score difference: {result['comparison']['score_diff']:.2%}")
else:
    print("Verification failed")
    print("Detailed comparison:", json.dumps(result["comparison"], indent=2))
```
Documentation:

- Full API reference
- Integration guides
- Best practices

Community:

- GitHub Discussions
- Issue tracking
- Feature requests
We welcome contributions! See our Contributing Guide for:
- Development setup
- Submission guidelines
- Code standards
- Testing requirements