<?xml version="1.0" encoding="UTF-8"?>
<!--
  Sitemap for https://benchmarkingagents.com (sitemaps.org 0.9 schema).
  Each <url> entry lists a page location, its last-modified date, an
  expected change frequency, and a crawl priority. The site root carries
  priority 1; every other page listed here carries 0.8.
-->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://benchmarkingagents.com</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/about</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/agent-benchmarks</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/agent-eval-cost-calculator</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/agentbench</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/androidworld</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/appworld-benchmark</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/assistantbench</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/autogen-benchmarks</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/benchmark-contamination</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/benchmarks-list</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/best-benchmarks-for-browser-agents</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/best-benchmarks-for-coding-agents</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/best-benchmarks-for-rag</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/best-benchmarks-for-reasoning</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/best-benchmarks-for-tool-use</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/bfcl-function-calling</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/bigbench-hard</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/browsergym</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/chatbot-arena</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/cost-per-eval</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/crewai-benchmarks</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/custom-evals</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/dspy-benchmarks</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/faq</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/for-production-monitoring</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/gaia-benchmark</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/glossary</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/gpqa-arc-agi</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/harm-safety-evals</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/hle-humanitys-last-exam</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/human-vs-automated</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/humaneval</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/inspect-uk-aisi</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/langgraph-benchmarks</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/legalbench</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/livecodebench</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/llm-as-judge</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/math-benchmark</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/medqa-medical-benchmark</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/methodology</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/mind2web</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/mmlu</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/mmlu-pro</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/mmmu-benchmark</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/model-on-benchmark</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/openai-agents-sdk-benchmarks</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/openai-evals-framework</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/osworld</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/pass-at-k</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/prompt-template-variance</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/rag-eval</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/repobench</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/reproducibility-evals</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/stanford-helm</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/swe-bench</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/tau-bench</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/tau-bench-retail-airline</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/terminal-bench</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/tools-compared</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/webarena</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://benchmarkingagents.com/what-these-benchmarks-miss</loc>
    <lastmod>2026-06-05</lastmod>
    <changefreq>monthly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>
