데이터 전처리 (Preprocessing)

기계학습과 시각화를 위한 다양한 데이터 정제(Cleansing)하는 방법을 다룹니다.
저자
소속
이광춘

TCS

공개

2023년 01월 12일

1 결측값 제거

참고자료: 결측데이터
library(tidyverse)
library(palmerpenguins)

penguins
#> # A tibble: 344 × 8
#>    species island    bill_length_mm bill_depth_mm flipper_…¹ body_…² sex    year
#>    <fct>   <fct>              <dbl>         <dbl>      <int>   <int> <fct> <int>
#>  1 Adelie  Torgersen           39.1          18.7        181    3750 male   2007
#>  2 Adelie  Torgersen           39.5          17.4        186    3800 fema…  2007
#>  3 Adelie  Torgersen           40.3          18          195    3250 fema…  2007
#>  4 Adelie  Torgersen           NA            NA           NA      NA <NA>   2007
#>  5 Adelie  Torgersen           36.7          19.3        193    3450 fema…  2007
#>  6 Adelie  Torgersen           39.3          20.6        190    3650 male   2007
#>  7 Adelie  Torgersen           38.9          17.8        181    3625 fema…  2007
#>  8 Adelie  Torgersen           39.2          19.6        195    4675 male   2007
#>  9 Adelie  Torgersen           34.1          18.1        193    3475 <NA>   2007
#> 10 Adelie  Torgersen           42            20.2        190    4250 <NA>   2007
#> # … with 334 more rows, and abbreviated variable names ¹​flipper_length_mm,
#> #   ²​body_mass_g

1.1 행기준 제거

library(tidyverse)
library(palmerpenguins)

penguins %>% 
  drop_na()
#> # A tibble: 333 × 8
#>    species island    bill_length_mm bill_depth_mm flipper_…¹ body_…² sex    year
#>    <fct>   <fct>              <dbl>         <dbl>      <int>   <int> <fct> <int>
#>  1 Adelie  Torgersen           39.1          18.7        181    3750 male   2007
#>  2 Adelie  Torgersen           39.5          17.4        186    3800 fema…  2007
#>  3 Adelie  Torgersen           40.3          18          195    3250 fema…  2007
#>  4 Adelie  Torgersen           36.7          19.3        193    3450 fema…  2007
#>  5 Adelie  Torgersen           39.3          20.6        190    3650 male   2007
#>  6 Adelie  Torgersen           38.9          17.8        181    3625 fema…  2007
#>  7 Adelie  Torgersen           39.2          19.6        195    4675 male   2007
#>  8 Adelie  Torgersen           41.1          17.6        182    3200 fema…  2007
#>  9 Adelie  Torgersen           38.6          21.2        191    3800 male   2007
#> 10 Adelie  Torgersen           34.6          21.1        198    4400 male   2007
#> # … with 323 more rows, and abbreviated variable names ¹​flipper_length_mm,
#> #   ²​body_mass_g

1.2 열기준 제거

penguins %>% 
  select_if(~ !any(is.na(.)))
#> # A tibble: 344 × 3
#>    species island     year
#>    <fct>   <fct>     <int>
#>  1 Adelie  Torgersen  2007
#>  2 Adelie  Torgersen  2007
#>  3 Adelie  Torgersen  2007
#>  4 Adelie  Torgersen  2007
#>  5 Adelie  Torgersen  2007
#>  6 Adelie  Torgersen  2007
#>  7 Adelie  Torgersen  2007
#>  8 Adelie  Torgersen  2007
#>  9 Adelie  Torgersen  2007
#> 10 Adelie  Torgersen  2007
#> # … with 334 more rows

1.3 결측값 현황

penguins %>% 
  summarise(across(everything(), ~ sum(complete.cases(.)))) %>% 
  pivot_longer( cols = everything(), names_to = "변수", values_to = "정상값") %>% 
  mutate(결측비율 = 1 - 정상값 / nrow(penguins))
#> # A tibble: 8 × 3
#>   변수              정상값 결측비율
#>   <chr>              <int>    <dbl>
#> 1 species              344  0      
#> 2 island               344  0      
#> 3 bill_length_mm       342  0.00581
#> 4 bill_depth_mm        342  0.00581
#> 5 flipper_length_mm    342  0.00581
#> 6 body_mass_g          342  0.00581
#> 7 sex                  333  0.0320 
#> 8 year                 344  0

2 중복 제거

2.1 행기준 제거

penguins_dup <- penguins %>% 
  slice(1:5) %>% 
  bind_rows(penguins %>% slice(1:5))

penguins_dup

penguins_dup %>% 
  distinct()
#> # A tibble: 10 × 8
#>    species island    bill_length_mm bill_depth_mm flipper_…¹ body_…² sex    year
#>    <fct>   <fct>              <dbl>         <dbl>      <int>   <int> <fct> <int>
#>  1 Adelie  Torgersen           39.1          18.7        181    3750 male   2007
#>  2 Adelie  Torgersen           39.5          17.4        186    3800 fema…  2007
#>  3 Adelie  Torgersen           40.3          18          195    3250 fema…  2007
#>  4 Adelie  Torgersen           NA            NA           NA      NA <NA>   2007
#>  5 Adelie  Torgersen           36.7          19.3        193    3450 fema…  2007
#>  6 Adelie  Torgersen           39.1          18.7        181    3750 male   2007
#>  7 Adelie  Torgersen           39.5          17.4        186    3800 fema…  2007
#>  8 Adelie  Torgersen           40.3          18          195    3250 fema…  2007
#>  9 Adelie  Torgersen           NA            NA           NA      NA <NA>   2007
#> 10 Adelie  Torgersen           36.7          19.3        193    3450 fema…  2007
#> # … with abbreviated variable names ¹​flipper_length_mm, ²​body_mass_g
#> # A tibble: 5 × 8
#>   species island    bill_length_mm bill_depth_mm flipper_l…¹ body_…² sex    year
#>   <fct>   <fct>              <dbl>         <dbl>       <int>   <int> <fct> <int>
#> 1 Adelie  Torgersen           39.1          18.7         181    3750 male   2007
#> 2 Adelie  Torgersen           39.5          17.4         186    3800 fema…  2007
#> 3 Adelie  Torgersen           40.3          18           195    3250 fema…  2007
#> 4 Adelie  Torgersen           NA            NA            NA      NA <NA>   2007
#> 5 Adelie  Torgersen           36.7          19.3         193    3450 fema…  2007
#> # … with abbreviated variable names ¹​flipper_length_mm, ²​body_mass_g

2.2 열기준 제거

penguins_cols <- penguins %>% 
  select(1:3) %>% 
  cbind(penguins %>% select(1:3)) 

penguins_cols

duplicated_colnames <- duplicated(colnames(penguins_cols))

penguins_cols[!duplicated_colnames] %>% 
  as_tibble()
#> # A tibble: 344 × 3
#>    species island    bill_length_mm
#>    <fct>   <fct>              <dbl>
#>  1 Adelie  Torgersen           39.1
#>  2 Adelie  Torgersen           39.5
#>  3 Adelie  Torgersen           40.3
#>  4 Adelie  Torgersen           NA  
#>  5 Adelie  Torgersen           36.7
#>  6 Adelie  Torgersen           39.3
#>  7 Adelie  Torgersen           38.9
#>  8 Adelie  Torgersen           39.2
#>  9 Adelie  Torgersen           34.1
#> 10 Adelie  Torgersen           42  
#> # … with 334 more rows

3 자료형 변환

penguins %>% 
  select(species, year, island, bill_length_mm) %>% 
  mutate(year = lubridate::make_date(year = year, month = 7L, day = 1L))
#> # A tibble: 344 × 4
#>    species year       island    bill_length_mm
#>    <fct>   <date>     <fct>              <dbl>
#>  1 Adelie  2007-07-01 Torgersen           39.1
#>  2 Adelie  2007-07-01 Torgersen           39.5
#>  3 Adelie  2007-07-01 Torgersen           40.3
#>  4 Adelie  2007-07-01 Torgersen           NA  
#>  5 Adelie  2007-07-01 Torgersen           36.7
#>  6 Adelie  2007-07-01 Torgersen           39.3
#>  7 Adelie  2007-07-01 Torgersen           38.9
#>  8 Adelie  2007-07-01 Torgersen           39.2
#>  9 Adelie  2007-07-01 Torgersen           34.1
#> 10 Adelie  2007-07-01 Torgersen           42  
#> # … with 334 more rows

4 이상점 제거

penguin_stat <- penguins %>% 
  drop_na() %>% 
  summarise(mean = mean(bill_length_mm),
            sd   = sd(bill_length_mm)) 

penguins %>% 
  filter(bill_length_mm < penguin_stat$mean - 2 * penguin_stat$sd |
         bill_length_mm > penguin_stat$mean + 2 * penguin_stat$sd)
#> # A tibble: 6 × 8
#>   species   island bill_length_mm bill_depth_mm flipper_le…¹ body_…² sex    year
#>   <fct>     <fct>           <dbl>         <dbl>        <int>   <int> <fct> <int>
#> 1 Adelie    Dream            32.1          15.5          188    3050 fema…  2009
#> 2 Gentoo    Biscoe           59.6          17            230    6050 male   2007
#> 3 Gentoo    Biscoe           55.9          17            228    5600 male   2009
#> 4 Gentoo    Biscoe           55.1          16            230    5850 male   2009
#> 5 Chinstrap Dream            58            17.8          181    3700 fema…  2007
#> 6 Chinstrap Dream            55.8          19.8          207    4000 male   2009
#> # … with abbreviated variable names ¹​flipper_length_mm, ²​body_mass_g

5 공백문자제거

penguins %>% 
  mutate(species = str_trim(species))
#> # A tibble: 344 × 8
#>    species island    bill_length_mm bill_depth_mm flipper_…¹ body_…² sex    year
#>    <chr>   <fct>              <dbl>         <dbl>      <int>   <int> <fct> <int>
#>  1 Adelie  Torgersen           39.1          18.7        181    3750 male   2007
#>  2 Adelie  Torgersen           39.5          17.4        186    3800 fema…  2007
#>  3 Adelie  Torgersen           40.3          18          195    3250 fema…  2007
#>  4 Adelie  Torgersen           NA            NA           NA      NA <NA>   2007
#>  5 Adelie  Torgersen           36.7          19.3        193    3450 fema…  2007
#>  6 Adelie  Torgersen           39.3          20.6        190    3650 male   2007
#>  7 Adelie  Torgersen           38.9          17.8        181    3625 fema…  2007
#>  8 Adelie  Torgersen           39.2          19.6        195    4675 male   2007
#>  9 Adelie  Torgersen           34.1          18.1        193    3475 <NA>   2007
#> 10 Adelie  Torgersen           42            20.2        190    4250 <NA>   2007
#> # … with 334 more rows, and abbreviated variable names ¹​flipper_length_mm,
#> #   ²​body_mass_g

6 칼럼 변환

penguins %>% 
  mutate_if(is.factor, toupper)
# mutate_if(is.factor, toupper)
#> # A tibble: 344 × 8
#>    species island    bill_length_mm bill_depth_mm flipper_…¹ body_…² sex    year
#>    <chr>   <chr>              <dbl>         <dbl>      <int>   <int> <chr> <int>
#>  1 ADELIE  TORGERSEN           39.1          18.7        181    3750 MALE   2007
#>  2 ADELIE  TORGERSEN           39.5          17.4        186    3800 FEMA…  2007
#>  3 ADELIE  TORGERSEN           40.3          18          195    3250 FEMA…  2007
#>  4 ADELIE  TORGERSEN           NA            NA           NA      NA <NA>   2007
#>  5 ADELIE  TORGERSEN           36.7          19.3        193    3450 FEMA…  2007
#>  6 ADELIE  TORGERSEN           39.3          20.6        190    3650 MALE   2007
#>  7 ADELIE  TORGERSEN           38.9          17.8        181    3625 FEMA…  2007
#>  8 ADELIE  TORGERSEN           39.2          19.6        195    4675 MALE   2007
#>  9 ADELIE  TORGERSEN           34.1          18.1        193    3475 <NA>   2007
#> 10 ADELIE  TORGERSEN           42            20.2        190    4250 <NA>   2007
#> # … with 334 more rows, and abbreviated variable names ¹​flipper_length_mm,
#> #   ²​body_mass_g

7 특정문자 변환

penguins %>% 
  mutate(species_kr = glue::glue("{species}_종")) %>% 
  select(species, species_kr, island)
#> # A tibble: 344 × 3
#>    species species_kr island   
#>    <fct>   <glue>     <fct>    
#>  1 Adelie  Adelie_종  Torgersen
#>  2 Adelie  Adelie_종  Torgersen
#>  3 Adelie  Adelie_종  Torgersen
#>  4 Adelie  Adelie_종  Torgersen
#>  5 Adelie  Adelie_종  Torgersen
#>  6 Adelie  Adelie_종  Torgersen
#>  7 Adelie  Adelie_종  Torgersen
#>  8 Adelie  Adelie_종  Torgersen
#>  9 Adelie  Adelie_종  Torgersen
#> 10 Adelie  Adelie_종  Torgersen
#> # … with 334 more rows

8 결측값 치환

penguins %>% 
  mutate(bill_length_mm = ifelse(is.na(bill_length_mm), 
                                 mean(bill_length_mm, na.rm = TRUE),
                                 bill_length_mm))
#> # A tibble: 344 × 8
#>    species island    bill_length_mm bill_depth_mm flipper_…¹ body_…² sex    year
#>    <fct>   <fct>              <dbl>         <dbl>      <int>   <int> <fct> <int>
#>  1 Adelie  Torgersen           39.1          18.7        181    3750 male   2007
#>  2 Adelie  Torgersen           39.5          17.4        186    3800 fema…  2007
#>  3 Adelie  Torgersen           40.3          18          195    3250 fema…  2007
#>  4 Adelie  Torgersen           43.9          NA           NA      NA <NA>   2007
#>  5 Adelie  Torgersen           36.7          19.3        193    3450 fema…  2007
#>  6 Adelie  Torgersen           39.3          20.6        190    3650 male   2007
#>  7 Adelie  Torgersen           38.9          17.8        181    3625 fema…  2007
#>  8 Adelie  Torgersen           39.2          19.6        195    4675 male   2007
#>  9 Adelie  Torgersen           34.1          18.1        193    3475 <NA>   2007
#> 10 Adelie  Torgersen           42            20.2        190    4250 <NA>   2007
#> # … with 334 more rows, and abbreviated variable names ¹​flipper_length_mm,
#> #   ²​body_mass_g

9 Pivoting

폭넓은 데이터(wide-format data)를 긴 데이터(long-format data)로 혹은 반대로 데이터의 형태를 바꾼다.

9.1 Wide → Long

penguins_long <- penguins %>% 
  pivot_longer(cols = contains("mm"), names_to = "측정변수", 
               values_to = "측정값")

penguins_long
#> # A tibble: 1,032 × 7
#>    species island    body_mass_g sex     year 측정변수          측정값
#>    <fct>   <fct>           <int> <fct>  <int> <chr>              <dbl>
#>  1 Adelie  Torgersen        3750 male    2007 bill_length_mm      39.1
#>  2 Adelie  Torgersen        3750 male    2007 bill_depth_mm       18.7
#>  3 Adelie  Torgersen        3750 male    2007 flipper_length_mm  181  
#>  4 Adelie  Torgersen        3800 female  2007 bill_length_mm      39.5
#>  5 Adelie  Torgersen        3800 female  2007 bill_depth_mm       17.4
#>  6 Adelie  Torgersen        3800 female  2007 flipper_length_mm  186  
#>  7 Adelie  Torgersen        3250 female  2007 bill_length_mm      40.3
#>  8 Adelie  Torgersen        3250 female  2007 bill_depth_mm       18  
#>  9 Adelie  Torgersen        3250 female  2007 flipper_length_mm  195  
#> 10 Adelie  Torgersen          NA <NA>    2007 bill_length_mm      NA  
#> # … with 1,022 more rows

9.2 long → Wide

penguins_long %>% 
  pivot_wider(names_from = "측정변수", 
              values_from = "측정값",
              values_fn = mean)
#> # A tibble: 284 × 8
#>    species island    body_mass_g sex     year bill_length_mm bill_dept…¹ flipp…²
#>    <fct>   <fct>           <int> <fct>  <int>          <dbl>       <dbl>   <dbl>
#>  1 Adelie  Torgersen        3750 male    2007           39.1        18.7     181
#>  2 Adelie  Torgersen        3800 female  2007           39.5        17.4     186
#>  3 Adelie  Torgersen        3250 female  2007           40.3        18       195
#>  4 Adelie  Torgersen          NA <NA>    2007           NA          NA        NA
#>  5 Adelie  Torgersen        3450 female  2007           37.7        19.2     194
#>  6 Adelie  Torgersen        3650 male    2007           39.3        20.6     190
#>  7 Adelie  Torgersen        3625 female  2007           38.9        17.8     181
#>  8 Adelie  Torgersen        4675 male    2007           39.2        19.6     195
#>  9 Adelie  Torgersen        3475 <NA>    2007           34.1        18.1     193
#> 10 Adelie  Torgersen        4250 <NA>    2007           42          20.2     190
#> # … with 274 more rows, and abbreviated variable names ¹​bill_depth_mm,
#> #   ²​flipper_length_mm

10 변수 쪼개기

penguins_long %>% 
  separate(측정변수, into = c("부위", "측정", "단위"), sep = "_")
#> # A tibble: 1,032 × 9
#>    species island    body_mass_g sex     year 부위    측정   단위  측정값
#>    <fct>   <fct>           <int> <fct>  <int> <chr>   <chr>  <chr>  <dbl>
#>  1 Adelie  Torgersen        3750 male    2007 bill    length mm      39.1
#>  2 Adelie  Torgersen        3750 male    2007 bill    depth  mm      18.7
#>  3 Adelie  Torgersen        3750 male    2007 flipper length mm     181  
#>  4 Adelie  Torgersen        3800 female  2007 bill    length mm      39.5
#>  5 Adelie  Torgersen        3800 female  2007 bill    depth  mm      17.4
#>  6 Adelie  Torgersen        3800 female  2007 flipper length mm     186  
#>  7 Adelie  Torgersen        3250 female  2007 bill    length mm      40.3
#>  8 Adelie  Torgersen        3250 female  2007 bill    depth  mm      18  
#>  9 Adelie  Torgersen        3250 female  2007 flipper length mm     195  
#> 10 Adelie  Torgersen          NA <NA>    2007 bill    length mm      NA  
#> # … with 1,022 more rows