LLM 성능

Mistral Large 2의 성능¹은 다음과 같다.

코드
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
코드
# Scores per model: an overall Average column followed by one column per
# programming language. Column names with special characters are backquoted.
mistral <- tribble(
  ~Model,            ~Average, ~Python, ~`C++`, ~Bash, ~Java, ~TypeScript, ~PHP, ~`C#`,
  "Mistral Large 2", 76.9,     92.1,    84.5,   51.9,  84.2,  86.8,        77.6, 61.4,
  "Mistral Large 1", 60.4,     70.1,    67.1,   36.1,  70.3,  71.7,        61.5, 46.2,
  "Llama 3.1 405B",  75.8,     89.0,    82.0,   57.6,  80.4,  81.1,        76.4, 64.4,
  "Llama 3.1 70B",   68.5,     78.7,    70.2,   51.3,  74.7,  76.7,        73.3, 54.4,
  "GPT-4o",          77.9,     93.3,    85.7,   54.4,  82.9,  89.3,        79.5, 60.1
)

# Reshape to one row per (Model, score column) pair, then draw one panel per
# score column with one bar per model.
mistral |>
  pivot_longer(
    cols = !Model,
    names_to = "언어",
    values_to = "성능"
  ) |>
  ggplot(mapping = aes(x = Model, y = 성능, fill = Model)) +
    # The fill colour duplicates the x axis, so the legend is suppressed.
    geom_col(width = 0.3, show.legend = FALSE) +
    facet_wrap(vars(언어)) +
    labs(x = "", y = "성능") +
    theme_bw(base_family = "NanumGothic") +
    # Rotate the model names so they fit under each panel.
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

라마3.1

라마3.1 모델 카드

코드
# Benchmark table: one row per benchmark, with its category, shot count and
# metric name, followed by one score column per Llama model. NA marks cells
# that have no value in this table.
llama <- tribble(
  ~Category, ~Benchmark, ~Shots, ~Metric,
  ~`Llama 3 8B Instruct`, ~`Llama 3.1 8B Instruct`, ~`Llama 3 70B Instruct`,
  ~`Llama 3.1 70B Instruct`, ~`Llama 3.1 405B Instruct`,
  "General",      "MMLU",                        5,  "macro_avg/acc",      68.5, 69.4, 82.0, 83.6, 87.3,
  "General",      "MMLU (CoT)",                  0,  "macro_avg/acc",      65.3, 73.0, 80.9, 86.0, 88.6,
  "General",      "MMLU-Pro (CoT)",              5,  "micro_avg/acc_char", 45.5, 48.3, 63.4, 66.4, 73.3,
  "General",      "IFEval",                      NA, NA,                   76.8, 80.4, 82.9, 87.5, 88.6,
  "Reasoning",    "ARC-C",                       0,  "acc",                82.4, 83.4, 94.4, 94.8, 96.9,
  "Reasoning",    "GPQA",                        0,  "em",                 34.6, 30.4, 39.5, 41.7, 50.7,
  "Code",         "HumanEval",                   0,  "pass@1",             60.4, 72.6, 81.7, 80.5, 89.0,
  "Code",         "MBPP ++ base version",        0,  "pass@1",             70.6, 72.8, 82.5, 86.0, 88.6,
  "Code",         "Multipl-E HumanEval",         0,  "pass@1",             NA,   50.8, NA,   65.5, 75.2,
  "Code",         "Multipl-E MBPP",              0,  "pass@1",             NA,   52.4, NA,   62.0, 65.7,
  "Math",         "GSM-8K (CoT)",                8,  "em_maj1@1",          80.6, 84.5, 93.0, 95.1, 96.8,
  "Math",         "MATH (CoT)",                  0,  "final_em",           29.1, 51.9, 51.0, 68.0, 73.8,
  "Tool Use",     "API-Bank",                    0,  "acc",                48.3, 82.6, 85.1, 90.0, 92.0,
  "Tool Use",     "BFCL",                        0,  "acc",                60.3, 76.1, 83.0, 84.8, 88.5,
  "Tool Use",     "Gorilla Benchmark API Bench", 0,  "acc",                1.7,  8.2,  14.7, 29.7, 35.3,
  "Tool Use",     "Nexus (0-shot)",              0,  "macro_avg/acc",      18.1, 38.5, 47.8, 56.7, 58.7,
  "Multilingual", "Multilingual MGSM (CoT)",     0,  "em",                 NA,   68.9, NA,   86.9, 91.6
)

# Plot every Llama benchmark score, one bar per model and one panel per
# (category, benchmark) pair.
#
# Faceting by category alone would put all benchmarks of a category at the
# same x position for a given model. Because x and fill are both `model`,
# those rows share a single group, so `position = "dodge"` cannot separate
# them and the bars are drawn on top of each other, leaving only the tallest
# one visible. Adding `benchmark` to the facets gives each score its own bar.
llama |>
  janitor::clean_names() |>
  pivot_longer(
    cols = starts_with("llama"),
    names_to = "model",
    values_to = "value",
    # Some model/benchmark cells are NA in the table; drop them here
    # explicitly instead of relying on geom_col() to discard them with a
    # warning.
    values_drop_na = TRUE
  ) |>
  # clean_names() yields e.g. "llama_3_1_8b_instruct"; strip the suffix to
  # keep the axis labels short.
  mutate(model = str_remove(model, "_instruct")) |>
  ggplot(aes(x = model, y = value, fill = model)) +
    geom_col(width = 0.3, show.legend = FALSE) +
    facet_wrap(vars(category, benchmark)) +
    labs(x = "", y = "성능") +
    theme_bw(base_family = "NanumGothic") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_col()`).