BizGenEval Leaderboard
BizGenEval is a benchmark for commercial visual content generation quality. This leaderboard reports score breakdowns by:
- Capability dimensions: Layout, Attribute, Text, Knowledge
- Content domains: Slides, Webpage, Poster, Chart, Scientific Figure
All leaderboard scores are displayed as `hard(easy)` on a 0-100 scale: the hard-track score first, with the easy-track score in parentheses.
| Model | Average ⬇️ | Layout | Attribute | Text | Knowledge | Slides | Webpage | Poster | Chart | Scientific Figure |
|---|---|---|---|---|---|---|---|---|---|---|
| Nano-Banana-Pro | 76.7(93.7) | 72.2(91.2) | 65.6(92.2) | 86.4(95.0) | 82.6(96.2) | 82.2(94.8) | 77.5(96.5) | 76.5(94.8) | 73.0(92.2) | 74.2(90.0) |
| Nano-Banana-2.0 | 68.5(92.5) | 68.4(91.0) | 57.4(91.6) | 83.4(94.6) | 64.6(93.0) | 73.8(95.8) | 71.2(94.5) | 67.5(91.2) | 60.2(89.2) | 69.5(92.0) |
| Seedream-5.0 | 48.8(79.2) | 67.6(89.0) | 42.4(77.2) | 43.4(75.6) | 41.8(75.2) | 54.5(80.8) | 47.0(80.8) | 50.7(77.0) | 46.0(76.2) | 45.8(81.5) |
| GPT-Image-1.5 | 35.9(81.6) | 51.6(84.8) | 25.8(75.2) | 40.4(82.8) | 26.0(83.6) | 40.8(89.2) | 41.0(86.0) | 42.0(83.5) | 28.2(76.5) | 27.8(72.8) |
| Seedream-4.5 | 30.1(66.2) | 35.4(71.6) | 22.4(62.8) | 41.4(72.4) | 21.4(58.2) | 33.8(71.0) | 36.2(75.5) | 35.5(72.2) | 18.0(47.8) | 27.3(64.8) |
| Wan2.6-T2I | 21.9(58.7) | 46.4(80.6) | 16.6(60.6) | 12.6(52.6) | 12.2(41.0) | 27.1(56.5) | 25.5(67.0) | 27.5(62.5) | 17.2(48.8) | 12.5(58.8) |
| Seedream-4.0 | 14.3(60.1) | 27.6(73.4) | 11.4(59.2) | 11.4(52.8) | 6.8(54.8) | 18.5(67.8) | 19.2(71.8) | 18.8(65.5) | 7.8(46.0) | 7.2(49.2) |
| Emu3.5 | 13.2(40.2) | 30.4(63.4) | 14.2(52.6) | 7.0(33.6) | 1.2(11.0) | 14.5(44.8) | 20.0(48.8) | 20.3(53.0) | 4.8(20.3) | 6.5(34.0) |
| HunyuanImage-3.0 | 13.0(40.1) | 27.8(65.0) | 13.8(53.6) | 10.2(39.6) | 0.0(2.0) | 19.3(47.0) | 21.0(52.0) | 19.8(53.5) | 2.0(18.8) | 2.8(29.0) |
| GPT-Image-1.0 | 11.2(52.4) | 21.4(60.2) | 6.8(48.6) | 8.6(41.0) | 7.8(60.0) | 18.2(64.8) | 12.0(63.0) | 17.2(64.8) | 3.2(31.2) | 5.0(38.5) |
| HunyuanImage-2.1 | 8.6(27.7) | 29.0(68.4) | 5.2(39.8) | 0.0(1.4) | 0.0(1.0) | 11.0(36.2) | 16.5(40.0) | 11.8(39.0) | 1.2(8.0) | 2.2(15.0) |
| Z-Image | 8.2(43.8) | 26.8(69.2) | 2.6(47.6) | 2.8(45.0) | 0.6(13.2) | 12.2(43.8) | 6.2(48.5) | 12.0(50.0) | 8.8(30.5) | 1.8(46.0) |
| Qwen-Image-2512 | 6.3(41.0) | 22.2(70.6) | 1.2(47.8) | 1.8(39.2) | 0.0(6.4) | 10.2(45.0) | 7.2(48.0) | 11.5(47.0) | 2.2(28.0) | 0.2(37.0) |
| FLUX.2-dev | 4.9(42.0) | 17.2(67.8) | 1.2(49.2) | 1.0(43.0) | 0.0(8.2) | 5.5(43.2) | 5.5(48.8) | 7.2(48.2) | 5.5(33.5) | 0.5(36.5) |
| Z-Image-Turbo | 3.4(32.4) | 11.0(60.6) | 1.2(35.0) | 1.2(29.8) | 0.2(4.0) | 7.0(36.5) | 4.5(45.8) | 4.8(40.8) | 0.8(15.5) | 0.0(23.2) |
| Qwen-Image | 2.8(23.8) | 10.4(51.2) | 0.2(22.2) | 0.6(17.6) | 0.0(4.4) | 3.5(28.5) | 2.5(27.5) | 5.8(32.2) | 2.2(15.8) | 0.0(15.2) |
| FLUX.2-Pro | 1.6(21.1) | 6.1(36.2) | 0.0(22.9) | 0.0(13.7) | 0.2(11.7) | 1.8(23.0) | 2.0(27.0) | 2.8(26.2) | 1.3(14.7) | 0.0(14.3) |
| GLM-Image | 1.4(15.3) | 5.4(43.2) | 0.0(13.4) | 0.2(4.4) | 0.0(0.4) | 1.5(22.8) | 1.8(27.3) | 3.8(24.8) | 0.0(0.5) | 0.0(1.5) |
| Imagen-4 | 1.1(10.7) | 4.2(26.8) | 0.0(8.7) | 0.2(4.0) | 0.0(3.4) | 1.5(15.0) | 0.5(14.8) | 1.8(12.8) | 1.5(6.8) | 0.2(4.3) |
| LongCat-Image | 0.7(13.0) | 2.4(35.8) | 0.2(11.6) | 0.0(4.4) | 0.0(0.0) | 0.8(15.0) | 1.8(22.3) | 0.8(21.3) | 0.0(2.5) | 0.0(3.8) |
| X-Omni-EN | 0.5(9.4) | 2.0(22.8) | 0.0(5.6) | 0.0(8.0) | 0.0(1.4) | 0.8(9.0) | 0.2(14.2) | 1.5(15.8) | 0.0(6.5) | 0.0(1.8) |
| SD3.5-Large | 0.5(2.1) | 2.2(6.6) | 0.0(0.4) | 0.0(0.0) | 0.0(1.2) | 0.0(0.5) | 0.2(2.2) | 0.0(2.8) | 1.2(3.2) | 1.2(1.5) |
| Bagel | 0.3(3.7) | 0.6(12.8) | 0.0(1.6) | 0.0(0.0) | 0.0(0.2) | 0.3(4.8) | 0.0(5.0) | 0.5(8.5) | 0.0(0.0) | 0.0(0.0) |
| FLUX.1-Krea-dev | 0.1(5.1) | 0.2(17.8) | 0.0(2.8) | 0.0(0.0) | 0.0(0.0) | 0.0(4.8) | 0.0(8.5) | 0.2(11.8) | 0.0(0.8) | 0.0(0.0) |
| FLUX.1-dev | 0.1(5.0) | 0.4(15.8) | 0.0(2.8) | 0.0(0.0) | 0.0(1.4) | 0.0(6.0) | 0.0(11.0) | 0.5(7.8) | 0.0(0.2) | 0.0(0.0) |
| FLUX.1-schnell | 0.0(5.1) | 0.0(16.8) | 0.0(2.6) | 0.0(0.0) | 0.0(1.2) | 0.0(8.5) | 0.0(8.2) | 0.0(8.2) | 0.0(0.8) | 0.0(0.0) |
How it works
- Run BizGenEval evaluation locally.
- Run BizGenEval summarization to obtain `summary.json`.
- Submit model metadata via this leaderboard to enter queue tracking.
- Update `commit_results.jsonl` to refresh the public leaderboard; a sketch of this step follows below.
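As an illustration of the last step, here is a minimal Python sketch for appending a run to `commit_results.jsonl`. The actual schema is not documented in this section, so the field names (`model`, `results`) and the `summary.json` path are assumptions, not the leaderboard's official format:

```python
import json

# Load the summary produced by evaluation.summarize (see Reproducibility below).
with open("outputs/summary/summary.json") as f:
    summary = json.load(f)

# Hypothetical entry layout: "model" and "results" are assumed field names,
# not a documented schema -- adapt to the real commit_results.jsonl format.
entry = {"model": "My-Model", "results": summary}

# JSONL convention: one JSON object per line, appended to the existing file.
with open("commit_results.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(entry, ensure_ascii=False) + "\n")
```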
Score Protocol
- Scores are shown as `hard(easy)` on a 0-100 scale.
- Ranking is sorted by hard-track average first, then by easy-track average; a minimal sorting sketch follows this list.
- Displayed columns include the 4 capability dimensions and 5 content domains.
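To make the ranking rule concrete, here is a small sketch (not part of the official tooling) that parses `hard(easy)` cells and orders rows the way the leaderboard does; the sample rows are copied from the table above:

```python
import re

def parse(cell: str) -> tuple[float, float]:
    """Split a 'hard(easy)' cell, e.g. '76.7(93.7)' -> (76.7, 93.7)."""
    hard, easy = re.fullmatch(r"([\d.]+)\(([\d.]+)\)", cell).groups()
    return float(hard), float(easy)

# (model, hard(easy) average) pairs taken from the leaderboard above.
rows = [
    ("Seedream-5.0", "48.8(79.2)"),
    ("Nano-Banana-Pro", "76.7(93.7)"),
    ("Nano-Banana-2.0", "68.5(92.5)"),
]

# Sort by hard-track average, descending; ties fall back to the easy track.
rows.sort(key=lambda r: parse(r[1]), reverse=True)
print([m for m, _ in rows])  # ['Nano-Banana-Pro', 'Nano-Banana-2.0', 'Seedream-5.0']
```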
Reproducibility
To reproduce the benchmark outputs:
```bash
python -m evaluation.image_evaluation --input_dir outputs/generated --save_dir outputs/eval_results
python -m evaluation.summarize --data_path assets/bizgeneval.jsonl --result_dir outputs/eval_results --save_dir outputs/summary
```
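Judging from the flag names, the first command scores each generated image under `outputs/generated` and writes per-sample results to `outputs/eval_results`, and the second aggregates those results against the benchmark data in `assets/bizgeneval.jsonl` into the `summary.json` used in the submission flow above.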