VL-RewardBench Leaderboard

{
  • "headers": [
    • "Model",
    • "Type",
    • "General",
    • "Hallucination",
    • "Reasoning",
    • "Overall Consistency",
    • "Macro Average",
    • "Open Source?",
    • "Model Size"
    ],
  • "data": [
    • [
      • "<a href="https://huggingface.co/Skywork/Skywork-VL-Reward-7B" target="_blank">Skywork/Skywork-VL-Reward-7B</a>",
      • "Seq. Classifier",
      • "65.6%",
      • "80.2%",
      • "61.3%",
      • "73.3%",
      • "69.0%",
      • "Yes",
      • "7-13B"
      ],
    • [
      • "<a href="https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash" target="_blank">Gemini-2.0-flash-exp (2024-12)</a>",
      • "Generative",
      • "50.8%",
      • "72.6%",
      • "70.1%",
      • "68.8%",
      • "64.5%",
      • "No",
      • "Unknown"
      ],
    • [
      • "<a href="https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-pro" target="_blank">Gemini-1.5-Pro (2024-09-24)</a>",
      • "Generative",
      • "50.8%",
      • "72.5%",
      • "64.2%",
      • "67.2%",
      • "62.5%",
      • "No",
      • "Unknown"
      ],
    • [
      • "<a href="https://platform.openai.com/docs/models/gpt-4o" target="_blank">GPT-4o (2024-08-06)</a>",
      • "Generative",
      • "49.1%",
      • "67.6%",
      • "70.5%",
      • "65.8%",
      • "62.4%",
      • "No",
      • "Unknown"
      ],
    • [
      • "<a href="https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash" target="_blank">Gemini-1.5-Flash (2024-09-24)</a>",
      • "Generative",
      • "47.8%",
      • "59.6%",
      • "58.4%",
      • "57.6%",
      • "55.3%",
      • "No",
      • "Unknown"
      ],
    • [
      • "<a href="https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct" target="_blank">meta-llama/Llama-3.2-90B-Vision-Instruct</a>",
      • "Generative",
      • "42.6%",
      • "57.3%",
      • "61.7%",
      • "56.2%",
      • "53.9%",
      • "Yes",
      • ">13B"
      ],
    • [
      • "<a href="https://www.anthropic.com/news/claude-3-5-sonnet" target="_blank">Claude-3.5-Sonnet (2024-06-22)</a>",
      • "Generative",
      • "43.4%",
      • "55.0%",
      • "62.3%",
      • "55.3%",
      • "53.6%",
      • "No",
      • "Unknown"
      ],
    • [
      • "<a href="https://cloud.siliconflow.cn/open/models?target=TeleAI/TeleMM" target="_blank">TeleAI/TeleMM</a>",
      • "Generative",
      • "47.1%",
      • "38.9%",
      • "46.1%",
      • "54.9%",
      • "44.0%",
      • "No",
      • "Unknown"
      ],
    • [
      • "<a href="https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api" target="_blank">qwen-vl-max (2024-11-19)</a>",
      • "Generative",
      • "40.6%",
      • "46.0%",
      • "57.6%",
      • "48.2%",
      • "48.1%",
      • "No",
      • "Unknown"
      ],
    • [
      • "<a href="https://huggingface.co/Qwen/QVQ-72B-Preview" target="_blank">Qwen/QVQ-72B-Preview</a>",
      • "Generative",
      • "41.8%",
      • "46.2%",
      • "51.2%",
      • "46.4%",
      • "46.4%",
      • "Yes",
      • ">13B"
      ],
    • [
      • "<a href="https://huggingface.co/MAmmoTH-VL/MAmmoTH-VL-8B-SI" target="_blank">MAmmoTH-VL/MAmmoTH-VL-8B-SI</a>",
      • "Generative",
      • "42.0%",
      • "41.0%",
      • "53.0%",
      • "45.2%",
      • "45.3%",
      • "Yes",
      • "7-13B"
      ],
    • [
      • "<a href="https://huggingface.co/OpenGVLab/InternVL2-8B" target="_blank">OpenGVLab/InternVL2-8B</a>",
      • "Generative",
      • "35.6%",
      • "41.1%",
      • "59.0%",
      • "44.5%",
      • "45.2%",
      • "Yes",
      • "7-13B"
      ],
    • [
      • "<a href="https://huggingface.co/allenai/Molmo-72B-0924" target="_blank">allenai/Molmo-72B-0924</a>",
      • "Generative",
      • "33.9%",
      • "42.3%",
      • "54.9%",
      • "44.1%",
      • "43.7%",
      • "Yes",
      • ">13B"
      ],
    • [
      • "<a href="https://huggingface.co/OpenGVLab/InternVL2-26B" target="_blank">OpenGVLab/InternVL2-26B</a>",
      • "Generative",
      • "39.3%",
      • "36.9%",
      • "60.8%",
      • "43.2%",
      • "45.7%",
      • "Yes",
      • ">13B"
      ],
    • [
      • "<a href="https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct" target="_blank">meta-llama/Llama-3.2-11B-Vision-Instruct</a>",
      • "Generative",
      • "33.3%",
      • "38.4%",
      • "56.6%",
      • "42.9%",
      • "42.8%",
      • "Yes",
      • "7-13B"
      ],
    • [
      • "<a href="https://huggingface.co/MAmmoTH-VL/MAmmoTH-VL-8B" target="_blank">MAmmoTH-VL/MAmmoTH-VL-8B</a>",
      • "Generative",
      • "36.0%",
      • "40.0%",
      • "52.0%",
      • "42.2%",
      • "42.7%",
      • "Yes",
      • "7-13B"
      ],
    • [
      • "<a href="https://platform.openai.com/docs/models/gpt-4o-mini" target="_blank">GPT-4o-mini (2024-07-18)</a>",
      • "Generative",
      • "41.7%",
      • "34.5%",
      • "58.2%",
      • "41.5%",
      • "44.8%",
      • "No",
      • "Unknown"
      ],
    • [
      • "<a href="https://huggingface.co/rhymes-ai/Aria" target="_blank">rhymes-ai/Aria</a>",
      • "Generative",
      • "37.9%",
      • "33.1%",
      • "64.0%",
      • "41.1%",
      • "45.0%",
      • "Yes",
      • "<7B"
      ],
    • [
      • "<a href="https://huggingface.co/nvidia/NVLM-D-72B" target="_blank">nvidia/NVLM-D-72B</a>",
      • "Generative",
      • "38.9%",
      • "31.6%",
      • "62.0%",
      • "40.1%",
      • "44.2%",
      • "Yes",
      • ">13B"
      ],
    • [
      • "<a href="https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct" target="_blank">Qwen/Qwen2-VL-72B-Instruct</a>",
      • "Generative",
      • "38.1%",
      • "32.8%",
      • "58.0%",
      • "39.5%",
      • "43.0%",
      • "Yes",
      • ">13B"
      ],
    • [
      • "<a href="https://huggingface.co/allenai/Molmo-7B-D-0924" target="_blank">allenai/Molmo-7B-D-0924</a>",
      • "Generative",
      • "31.1%",
      • "31.8%",
      • "56.2%",
      • "37.5%",
      • "39.7%",
      • "Yes",
      • "7-13B"
      ],
    • [
      • "<a href="https://huggingface.co/mistralai/Pixtral-12B-2409" target="_blank">mistralai/Pixtral-12B-2409</a>",
      • "Generative",
      • "35.6%",
      • "25.9%",
      • "59.9%",
      • "35.8%",
      • "40.5%",
      • "Yes",
      • "7-13B"
      ],
    • [
      • "<a href="https://huggingface.co/deepseek-ai/deepseek-vl2" target="_blank">deepseek-ai/deepseek-vl2</a>",
      • "Generative",
      • "29.7%",
      • "23.8%",
      • "50.9%",
      • "30.3%",
      • "34.8%",
      • "Yes",
      • "<7B"
      ],
    • [
      • "<a href="https://hf.co/lmms-lab/llava-onevision-qwen2-7b-ov" target="_blank">lmms-lab/llava-onevision-qwen2-7b-ov</a>",
      • "Generative",
      • "32.2%",
      • "20.1%",
      • "57.1%",
      • "29.6%",
      • "36.5%",
      • "Yes",
      • "7-13B"
      ],
    • [
      • "<a href="https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct" target="_blank">Qwen/Qwen2-VL-7B-Instruct</a>",
      • "Generative",
      • "31.6%",
      • "19.1%",
      • "51.1%",
      • "28.3%",
      • "33.9%",
      • "Yes",
      • "7-13B"
      ],
    • [
      • "<a href="https://huggingface.co/microsoft/Phi-3.5-vision-instruct" target="_blank">microsoft/Phi-3.5-vision-instruct</a>",
      • "Generative",
      • "28.0%",
      • "22.4%",
      • "56.6%",
      • "28.2%",
      • "35.7%",
      • "Yes",
      • "<7B"
      ]
    ],
  • "metadata": null
}