데이터 전처리 (Preprocessing)
기계학습과 시각화를 위한 다양한 데이터 정제(Cleansing)하는 방법을 다룹니다.
1 결측값 제거
참고자료: 결측데이터
library(tidyverse)
library(palmerpenguins)
penguins
#> # A tibble: 344 × 8
#> species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
#> 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
#> 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007
#> 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007
#> 4 Adelie Torgersen NA NA NA NA <NA> 2007
#> 5 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007
#> 6 Adelie Torgersen 39.3 20.6 190 3650 male 2007
#> 7 Adelie Torgersen 38.9 17.8 181 3625 fema… 2007
#> 8 Adelie Torgersen 39.2 19.6 195 4675 male 2007
#> 9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007
#> 10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007
#> # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm,
#> # ²body_mass_g
1.1 행기준 제거
library(tidyverse)
library(palmerpenguins)
%>%
penguins drop_na()
#> # A tibble: 333 × 8
#> species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
#> 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
#> 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007
#> 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007
#> 4 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007
#> 5 Adelie Torgersen 39.3 20.6 190 3650 male 2007
#> 6 Adelie Torgersen 38.9 17.8 181 3625 fema… 2007
#> 7 Adelie Torgersen 39.2 19.6 195 4675 male 2007
#> 8 Adelie Torgersen 41.1 17.6 182 3200 fema… 2007
#> 9 Adelie Torgersen 38.6 21.2 191 3800 male 2007
#> 10 Adelie Torgersen 34.6 21.1 198 4400 male 2007
#> # … with 323 more rows, and abbreviated variable names ¹flipper_length_mm,
#> # ²body_mass_g
1.2 열기준 제거
%>%
penguins select_if(~ !any(is.na(.)))
#> # A tibble: 344 × 3
#> species island year
#> <fct> <fct> <int>
#> 1 Adelie Torgersen 2007
#> 2 Adelie Torgersen 2007
#> 3 Adelie Torgersen 2007
#> 4 Adelie Torgersen 2007
#> 5 Adelie Torgersen 2007
#> 6 Adelie Torgersen 2007
#> 7 Adelie Torgersen 2007
#> 8 Adelie Torgersen 2007
#> 9 Adelie Torgersen 2007
#> 10 Adelie Torgersen 2007
#> # … with 334 more rows
1.3 결측값 현황
%>%
penguins summarise(across(everything(), ~ sum(complete.cases(.)))) %>%
pivot_longer( cols = everything(), names_to = "변수", values_to = "정상값") %>%
mutate(결측비율 = 1 - 정상값 / nrow(penguins))
#> # A tibble: 8 × 3
#> 변수 정상값 결측비율
#> <chr> <int> <dbl>
#> 1 species 344 0
#> 2 island 344 0
#> 3 bill_length_mm 342 0.00581
#> 4 bill_depth_mm 342 0.00581
#> 5 flipper_length_mm 342 0.00581
#> 6 body_mass_g 342 0.00581
#> 7 sex 333 0.0320
#> 8 year 344 0
2 중복 제거
2.1 행기준 제거
<- penguins %>%
penguins_dup slice(1:5) %>%
bind_rows(penguins %>% slice(1:5))
penguins_dup
%>%
penguins_dup distinct()
#> # A tibble: 10 × 8
#> species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
#> 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
#> 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007
#> 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007
#> 4 Adelie Torgersen NA NA NA NA <NA> 2007
#> 5 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007
#> 6 Adelie Torgersen 39.1 18.7 181 3750 male 2007
#> 7 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007
#> 8 Adelie Torgersen 40.3 18 195 3250 fema… 2007
#> 9 Adelie Torgersen NA NA NA NA <NA> 2007
#> 10 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007
#> # … with abbreviated variable names ¹flipper_length_mm, ²body_mass_g
#> # A tibble: 5 × 8
#> species island bill_length_mm bill_depth_mm flipper_l…¹ body_…² sex year
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
#> 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
#> 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007
#> 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007
#> 4 Adelie Torgersen NA NA NA NA <NA> 2007
#> 5 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007
#> # … with abbreviated variable names ¹flipper_length_mm, ²body_mass_g
2.2 열기준 제거
<- penguins %>%
penguins_cols select(1:3) %>%
cbind(penguins %>% select(1:3))
penguins_cols
<- duplicated(colnames(penguins_cols))
duplicated_colnames
!duplicated_colnames] %>%
penguins_cols[as_tibble()
#> # A tibble: 344 × 3
#> species island bill_length_mm
#> <fct> <fct> <dbl>
#> 1 Adelie Torgersen 39.1
#> 2 Adelie Torgersen 39.5
#> 3 Adelie Torgersen 40.3
#> 4 Adelie Torgersen NA
#> 5 Adelie Torgersen 36.7
#> 6 Adelie Torgersen 39.3
#> 7 Adelie Torgersen 38.9
#> 8 Adelie Torgersen 39.2
#> 9 Adelie Torgersen 34.1
#> 10 Adelie Torgersen 42
#> # … with 334 more rows
3 자료형 변환
%>%
penguins select(species, year, island, bill_length_mm) %>%
mutate(year = lubridate::make_date(year = year, month = 7L, day = 1L))
#> # A tibble: 344 × 4
#> species year island bill_length_mm
#> <fct> <date> <fct> <dbl>
#> 1 Adelie 2007-07-01 Torgersen 39.1
#> 2 Adelie 2007-07-01 Torgersen 39.5
#> 3 Adelie 2007-07-01 Torgersen 40.3
#> 4 Adelie 2007-07-01 Torgersen NA
#> 5 Adelie 2007-07-01 Torgersen 36.7
#> 6 Adelie 2007-07-01 Torgersen 39.3
#> 7 Adelie 2007-07-01 Torgersen 38.9
#> 8 Adelie 2007-07-01 Torgersen 39.2
#> 9 Adelie 2007-07-01 Torgersen 34.1
#> 10 Adelie 2007-07-01 Torgersen 42
#> # … with 334 more rows
4 이상점 제거
<- penguins %>%
penguin_stat drop_na() %>%
summarise(mean = mean(bill_length_mm),
sd = sd(bill_length_mm))
%>%
penguins filter(bill_length_mm < penguin_stat$mean - 2 * penguin_stat$sd |
> penguin_stat$mean + 2 * penguin_stat$sd) bill_length_mm
#> # A tibble: 6 × 8
#> species island bill_length_mm bill_depth_mm flipper_le…¹ body_…² sex year
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
#> 1 Adelie Dream 32.1 15.5 188 3050 fema… 2009
#> 2 Gentoo Biscoe 59.6 17 230 6050 male 2007
#> 3 Gentoo Biscoe 55.9 17 228 5600 male 2009
#> 4 Gentoo Biscoe 55.1 16 230 5850 male 2009
#> 5 Chinstrap Dream 58 17.8 181 3700 fema… 2007
#> 6 Chinstrap Dream 55.8 19.8 207 4000 male 2009
#> # … with abbreviated variable names ¹flipper_length_mm, ²body_mass_g
5 공백문자제거
%>%
penguins mutate(species = str_trim(species))
#> # A tibble: 344 × 8
#> species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year
#> <chr> <fct> <dbl> <dbl> <int> <int> <fct> <int>
#> 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
#> 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007
#> 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007
#> 4 Adelie Torgersen NA NA NA NA <NA> 2007
#> 5 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007
#> 6 Adelie Torgersen 39.3 20.6 190 3650 male 2007
#> 7 Adelie Torgersen 38.9 17.8 181 3625 fema… 2007
#> 8 Adelie Torgersen 39.2 19.6 195 4675 male 2007
#> 9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007
#> 10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007
#> # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm,
#> # ²body_mass_g
6 칼럼 변환
%>%
penguins mutate_if(is.factor, toupper)
# mutate_if(is.factor, toupper)
#> # A tibble: 344 × 8
#> species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year
#> <chr> <chr> <dbl> <dbl> <int> <int> <chr> <int>
#> 1 ADELIE TORGERSEN 39.1 18.7 181 3750 MALE 2007
#> 2 ADELIE TORGERSEN 39.5 17.4 186 3800 FEMA… 2007
#> 3 ADELIE TORGERSEN 40.3 18 195 3250 FEMA… 2007
#> 4 ADELIE TORGERSEN NA NA NA NA <NA> 2007
#> 5 ADELIE TORGERSEN 36.7 19.3 193 3450 FEMA… 2007
#> 6 ADELIE TORGERSEN 39.3 20.6 190 3650 MALE 2007
#> 7 ADELIE TORGERSEN 38.9 17.8 181 3625 FEMA… 2007
#> 8 ADELIE TORGERSEN 39.2 19.6 195 4675 MALE 2007
#> 9 ADELIE TORGERSEN 34.1 18.1 193 3475 <NA> 2007
#> 10 ADELIE TORGERSEN 42 20.2 190 4250 <NA> 2007
#> # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm,
#> # ²body_mass_g
7 특정문자 변환
%>%
penguins mutate(species_kr = glue::glue("{species}_종")) %>%
select(species, species_kr, island)
#> # A tibble: 344 × 3
#> species species_kr island
#> <fct> <glue> <fct>
#> 1 Adelie Adelie_종 Torgersen
#> 2 Adelie Adelie_종 Torgersen
#> 3 Adelie Adelie_종 Torgersen
#> 4 Adelie Adelie_종 Torgersen
#> 5 Adelie Adelie_종 Torgersen
#> 6 Adelie Adelie_종 Torgersen
#> 7 Adelie Adelie_종 Torgersen
#> 8 Adelie Adelie_종 Torgersen
#> 9 Adelie Adelie_종 Torgersen
#> 10 Adelie Adelie_종 Torgersen
#> # … with 334 more rows
8 결측값 치환
%>%
penguins mutate(bill_length_mm = ifelse(is.na(bill_length_mm),
mean(bill_length_mm, na.rm = TRUE),
bill_length_mm))
#> # A tibble: 344 × 8
#> species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
#> 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
#> 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007
#> 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007
#> 4 Adelie Torgersen 43.9 NA NA NA <NA> 2007
#> 5 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007
#> 6 Adelie Torgersen 39.3 20.6 190 3650 male 2007
#> 7 Adelie Torgersen 38.9 17.8 181 3625 fema… 2007
#> 8 Adelie Torgersen 39.2 19.6 195 4675 male 2007
#> 9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007
#> 10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007
#> # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm,
#> # ²body_mass_g
9 Pivoting
폭넓은 데이터(wide-format data)를 긴 데이터(long-format data)로 혹은 반대로 데이터의 형태를 바꾼다.
9.1 Wide → Long
<- penguins %>%
penguins_long pivot_longer(cols = contains("mm"), names_to = "측정변수",
values_to = "측정값")
penguins_long
#> # A tibble: 1,032 × 7
#> species island body_mass_g sex year 측정변수 측정값
#> <fct> <fct> <int> <fct> <int> <chr> <dbl>
#> 1 Adelie Torgersen 3750 male 2007 bill_length_mm 39.1
#> 2 Adelie Torgersen 3750 male 2007 bill_depth_mm 18.7
#> 3 Adelie Torgersen 3750 male 2007 flipper_length_mm 181
#> 4 Adelie Torgersen 3800 female 2007 bill_length_mm 39.5
#> 5 Adelie Torgersen 3800 female 2007 bill_depth_mm 17.4
#> 6 Adelie Torgersen 3800 female 2007 flipper_length_mm 186
#> 7 Adelie Torgersen 3250 female 2007 bill_length_mm 40.3
#> 8 Adelie Torgersen 3250 female 2007 bill_depth_mm 18
#> 9 Adelie Torgersen 3250 female 2007 flipper_length_mm 195
#> 10 Adelie Torgersen NA <NA> 2007 bill_length_mm NA
#> # … with 1,022 more rows
9.2 long → Wide
%>%
penguins_long pivot_wider(names_from = "측정변수",
values_from = "측정값",
values_fn = mean)
#> # A tibble: 284 × 8
#> species island body_mass_g sex year bill_length_mm bill_dept…¹ flipp…²
#> <fct> <fct> <int> <fct> <int> <dbl> <dbl> <dbl>
#> 1 Adelie Torgersen 3750 male 2007 39.1 18.7 181
#> 2 Adelie Torgersen 3800 female 2007 39.5 17.4 186
#> 3 Adelie Torgersen 3250 female 2007 40.3 18 195
#> 4 Adelie Torgersen NA <NA> 2007 NA NA NA
#> 5 Adelie Torgersen 3450 female 2007 37.7 19.2 194
#> 6 Adelie Torgersen 3650 male 2007 39.3 20.6 190
#> 7 Adelie Torgersen 3625 female 2007 38.9 17.8 181
#> 8 Adelie Torgersen 4675 male 2007 39.2 19.6 195
#> 9 Adelie Torgersen 3475 <NA> 2007 34.1 18.1 193
#> 10 Adelie Torgersen 4250 <NA> 2007 42 20.2 190
#> # … with 274 more rows, and abbreviated variable names ¹bill_depth_mm,
#> # ²flipper_length_mm
10 변수 쪼개기
%>%
penguins_long separate(측정변수, into = c("부위", "측정", "단위"), sep = "_")
#> # A tibble: 1,032 × 9
#> species island body_mass_g sex year 부위 측정 단위 측정값
#> <fct> <fct> <int> <fct> <int> <chr> <chr> <chr> <dbl>
#> 1 Adelie Torgersen 3750 male 2007 bill length mm 39.1
#> 2 Adelie Torgersen 3750 male 2007 bill depth mm 18.7
#> 3 Adelie Torgersen 3750 male 2007 flipper length mm 181
#> 4 Adelie Torgersen 3800 female 2007 bill length mm 39.5
#> 5 Adelie Torgersen 3800 female 2007 bill depth mm 17.4
#> 6 Adelie Torgersen 3800 female 2007 flipper length mm 186
#> 7 Adelie Torgersen 3250 female 2007 bill length mm 40.3
#> 8 Adelie Torgersen 3250 female 2007 bill depth mm 18
#> 9 Adelie Torgersen 3250 female 2007 flipper length mm 195
#> 10 Adelie Torgersen NA <NA> 2007 bill length mm NA
#> # … with 1,022 more rows