# LLM Performance
Mistral Large 2's code-generation performance[^mistral], broken down by programming language, is as follows.
[^mistral]: [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/)
```{r}
library(tidyverse)

# Per-language scores for Mistral Large 2 and peer models,
# transcribed from the Mistral Large 2 announcement.
mistral <- tribble(
  ~Model, ~Average, ~Python, ~`C++`, ~Bash, ~Java, ~TypeScript, ~PHP, ~`C#`,
  "Mistral Large 2", 76.9, 92.1, 84.5, 51.9, 84.2, 86.8, 77.6, 61.4,
  "Mistral Large 1", 60.4, 70.1, 67.1, 36.1, 70.3, 71.7, 61.5, 46.2,
  "Llama 3.1 405B",  75.8, 89.0, 82.0, 57.6, 80.4, 81.1, 76.4, 64.4,
  "Llama 3.1 70B",   68.5, 78.7, 70.2, 51.3, 74.7, 76.7, 73.3, 54.4,
  "GPT-4o",          77.9, 93.3, 85.7, 54.4, 82.9, 89.3, 79.5, 60.1
)

# Reshape to long format so each language gets its own facet.
mistral |>
  pivot_longer(cols = -Model, names_to = "Language", values_to = "Score") |>
  ggplot(aes(x = Model, y = Score, fill = Model)) +
  geom_col(width = 0.3, show.legend = FALSE) +
  facet_wrap(~Language) +
  labs(x = "", y = "Score") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
```
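
As a quick complement to the faceted chart, the same tibble can be sorted by its `Average` column to show the overall ranking. This is only a convenience view of numbers already reported above, not an additional benchmark:

```{r}
# Overall ranking by the reported Average column.
mistral |>
  select(Model, Average) |>
  arrange(desc(Average))
```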
## Llama 3.1
The benchmark figures below are transcribed from the [Llama 3.1 model card](https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/MODEL_CARD.md).
```{r}
# Benchmark scores for the Llama 3 and Llama 3.1 instruct models,
# transcribed from the Llama 3.1 model card (NA = not reported).
llama <- tribble(
  ~Category, ~Benchmark, ~Shots, ~Metric, ~`Llama 3 8B Instruct`, ~`Llama 3.1 8B Instruct`, ~`Llama 3 70B Instruct`, ~`Llama 3.1 70B Instruct`, ~`Llama 3.1 405B Instruct`,
  "General", "MMLU", 5, "macro_avg/acc", 68.5, 69.4, 82.0, 83.6, 87.3,
  "General", "MMLU (CoT)", 0, "macro_avg/acc", 65.3, 73.0, 80.9, 86.0, 88.6,
  "General", "MMLU-Pro (CoT)", 5, "micro_avg/acc_char", 45.5, 48.3, 63.4, 66.4, 73.3,
  "General", "IFEval", NA, NA, 76.8, 80.4, 82.9, 87.5, 88.6,
  "Reasoning", "ARC-C", 0, "acc", 82.4, 83.4, 94.4, 94.8, 96.9,
  "Reasoning", "GPQA", 0, "em", 34.6, 30.4, 39.5, 41.7, 50.7,
  "Code", "HumanEval", 0, "pass@1", 60.4, 72.6, 81.7, 80.5, 89.0,
  "Code", "MBPP ++ base version", 0, "pass@1", 70.6, 72.8, 82.5, 86.0, 88.6,
  "Code", "MultiPL-E HumanEval", 0, "pass@1", NA, 50.8, NA, 65.5, 75.2,
  "Code", "MultiPL-E MBPP", 0, "pass@1", NA, 52.4, NA, 62.0, 65.7,
  "Math", "GSM-8K (CoT)", 8, "em_maj1@1", 80.6, 84.5, 93.0, 95.1, 96.8,
  "Math", "MATH (CoT)", 0, "final_em", 29.1, 51.9, 51.0, 68.0, 73.8,
  "Tool Use", "API-Bank", 0, "acc", 48.3, 82.6, 85.1, 90.0, 92.0,
  "Tool Use", "BFCL", 0, "acc", 60.3, 76.1, 83.0, 84.8, 88.5,
  "Tool Use", "Gorilla Benchmark API Bench", 0, "acc", 1.7, 8.2, 14.7, 29.7, 35.3,
  "Tool Use", "Nexus (0-shot)", 0, "macro_avg/acc", 18.1, 38.5, 47.8, 56.7, 58.7,
  "Multilingual", "Multilingual MGSM (CoT)", 0, "em", NA, 68.9, NA, 86.9, 91.6
)
# Clean the column names, reshape to long format, then facet by category.
# Grouping by benchmark dodges the bars so benchmarks within the same
# category do not overprint one another at the same x position.
llama |>
  janitor::clean_names() |>
  pivot_longer(cols = starts_with("llama"), names_to = "model", values_to = "value") |>
  mutate(model = str_remove(model, "_instruct")) |>
  ggplot(aes(x = model, y = value, fill = model, group = benchmark)) +
  geom_col(width = 0.3, position = "dodge", show.legend = FALSE) +
  facet_wrap(~category) +
  labs(x = "", y = "Score") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
```
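
To read the table as generation-over-generation progress rather than a grid of bars, the sketch below computes the point change from Llama 3 70B Instruct to Llama 3.1 70B Instruct for each benchmark. The snake_case column names assume `janitor::clean_names()` defaults, matching the chunk above:

```{r}
# Per-benchmark point gain of Llama 3.1 70B over Llama 3 70B.
# Column names come from janitor::clean_names() defaults.
llama |>
  janitor::clean_names() |>
  mutate(gain_70b = llama_3_1_70b_instruct - llama_3_70b_instruct) |>
  filter(!is.na(gain_70b)) |>
  select(category, benchmark, gain_70b) |>
  arrange(desc(gain_70b))
```

On these numbers the largest same-size gains show up in math (MATH) and tool use (Gorilla Benchmark API Bench), both derivable directly from the table above.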