= ' _ 대한민국 화이팅!!! _ '
text print('[' + text.rstrip() + ']')
#> [ _ 대한민국 화이팅!!! _]
print('[' + text.lstrip() + ']')
#> [_ 대한민국 화이팅!!! _ ]
print('[' + text.strip() + ']')
#> [_ 대한민국 화이팅!!! _]
데이터 전처리 : 파이썬
데이터 전처리 (Preprocessing)를 구성하는 기계학습과 시각화를 위한 다양한 데이터 정제(Cleansing)하는 방법을 다룹니다.
1 데이터셋
palmerpenguins 데이터셋을 파이썬 데이터 패키지로 구현되어 있어 이를 설치하여 불러 실습에 사용한다.
$ pip3 install palmerpenguins
2 결측값 제거
참고자료: 결측데이터
import numpy as np
import pandas as pd
from palmerpenguins import load_penguins
penguins
#> species island bill_length_mm ... body_mass_g sex year
#> 0 Adelie Torgersen 39.1 ... 3750.0 male 2007
#> 1 Adelie Torgersen 39.5 ... 3800.0 female 2007
#> 2 Adelie Torgersen 40.3 ... 3250.0 female 2007
#> 3 Adelie Torgersen NaN ... NaN NaN 2007
#> 4 Adelie Torgersen 36.7 ... 3450.0 female 2007
#>
#> [5 rows x 8 columns]
2.1 행기준 제거
\
penguins .dropna()
#> species island bill_length_mm ... body_mass_g sex year
#> 0 Adelie Torgersen 39.1 ... 3750.0 male 2007
#> 1 Adelie Torgersen 39.5 ... 3800.0 female 2007
#> 2 Adelie Torgersen 40.3 ... 3250.0 female 2007
#> 4 Adelie Torgersen 36.7 ... 3450.0 female 2007
#> 5 Adelie Torgersen 39.3 ... 3650.0 male 2007
#> .. ... ... ... ... ... ... ...
#> 339 Chinstrap Dream 55.8 ... 4000.0 male 2009
#> 340 Chinstrap Dream 43.5 ... 3400.0 female 2009
#> 341 Chinstrap Dream 49.6 ... 3775.0 male 2009
#> 342 Chinstrap Dream 50.8 ... 4100.0 male 2009
#> 343 Chinstrap Dream 50.2 ... 3775.0 female 2009
#>
#> [333 rows x 8 columns]
2.2 열기준 제거
\
penguins = 1) .dropna(axis
#> species island year
#> 0 Adelie Torgersen 2007
#> 1 Adelie Torgersen 2007
#> 2 Adelie Torgersen 2007
#> 3 Adelie Torgersen 2007
#> 4 Adelie Torgersen 2007
#> .. ... ... ...
#> 339 Chinstrap Dream 2009
#> 340 Chinstrap Dream 2009
#> 341 Chinstrap Dream 2009
#> 342 Chinstrap Dream 2009
#> 343 Chinstrap Dream 2009
#>
#> [344 rows x 3 columns]
2.3 결측값 현황
= penguins.isna().sum()
nan_sum
type(nan_sum)
= nan_sum.to_frame(name = "결측값")
nan_df
# nan_df = pd.DataFrame(nan_sum).rename(columns = {0:'결측값'})
= penguins.shape[0]
penguins_nrow
\
nan_df = nan_df['결측값'] / penguins_nrow) .assign(결측비율
#> <class 'pandas.core.series.Series'>
#> 결측값
#> species 0
#> island 0
#> bill_length_mm 2
#> bill_depth_mm 2
#> flipper_length_mm 2
#> body_mass_g 2
#> sex 11
#> year 0
#> 결측값 결측비율
#> species 0 0.000000
#> island 0 0.000000
#> bill_length_mm 2 0.005814
#> bill_depth_mm 2 0.005814
#> flipper_length_mm 2 0.005814
#> body_mass_g 2 0.005814
#> sex 11 0.031977
#> year 0 0.000000
3 중복 제거
3.1 행기준 제거
= penguins \
penguins_dup "species", "island", "bill_length_mm", "bill_depth_mm"]]
.loc[:, [
= penguins_dup \
penguins_dup_mask = "first")
.duplicated(keep
penguins_dup[penguins_dup_mask]
'species'] == "Adelie") &
penguins [(penguins['island'] == "Biscoe") &
(penguins['bill_length_mm'] == 35.0) &
(penguins['bill_depth_mm'] == 17.9)] (penguins[
#> species island bill_length_mm bill_depth_mm
#> 100 Adelie Biscoe 35.0 17.9
#> 104 Adelie Biscoe 37.9 18.6
#> 184 Gentoo Biscoe 45.1 14.5
#> 199 Gentoo Biscoe 50.5 15.9
#> species island bill_length_mm ... body_mass_g sex year
#> 52 Adelie Biscoe 35.0 ... 3450.0 female 2008
#> 100 Adelie Biscoe 35.0 ... 3725.0 female 2009
#>
#> [2 rows x 8 columns]
3.2 열기준 제거
= pd.concat([penguins[['species', 'island']], penguins['species']], axis=1)
penguins_dup_df
penguins_dup_df.columns.duplicated()
~penguins_dup_df.columns.duplicated()] penguins_dup_df.loc[:,
#> array([False, False, True])
#> species island
#> 0 Adelie Torgersen
#> 1 Adelie Torgersen
#> 2 Adelie Torgersen
#> 3 Adelie Torgersen
#> 4 Adelie Torgersen
#> .. ... ...
#> 339 Chinstrap Dream
#> 340 Chinstrap Dream
#> 341 Chinstrap Dream
#> 342 Chinstrap Dream
#> 343 Chinstrap Dream
#>
#> [344 rows x 2 columns]
4 자료형 변환
'year'] = pd.to_datetime(penguins['year'], format='%Y')
penguins[
\
penguins "species", "year", "island", "bill_length_mm"]] .loc[:, [
#> species year island bill_length_mm
#> 0 Adelie 2007-01-01 Torgersen 39.1
#> 1 Adelie 2007-01-01 Torgersen 39.5
#> 2 Adelie 2007-01-01 Torgersen 40.3
#> 3 Adelie 2007-01-01 Torgersen NaN
#> 4 Adelie 2007-01-01 Torgersen 36.7
#> .. ... ... ... ...
#> 339 Chinstrap 2009-01-01 Dream 55.8
#> 340 Chinstrap 2009-01-01 Dream 43.5
#> 341 Chinstrap 2009-01-01 Dream 49.6
#> 342 Chinstrap 2009-01-01 Dream 50.8
#> 343 Chinstrap 2009-01-01 Dream 50.2
#>
#> [344 rows x 4 columns]
5 이상점 제거
= penguins["bill_length_mm"].mean()
bill_mean = penguins["bill_length_mm"].std()
bill_sd
\
penguins "bill_length_mm"] < bill_mean - 2 * bill_sd) | \
.loc[(penguins["bill_length_mm"] > bill_mean + 2 * bill_sd), :] (penguins[
#> species island bill_length_mm ... body_mass_g sex year
#> 142 Adelie Dream 32.1 ... 3050.0 female 2009-01-01
#> 185 Gentoo Biscoe 59.6 ... 6050.0 male 2007-01-01
#> 253 Gentoo Biscoe 55.9 ... 5600.0 male 2009-01-01
#> 267 Gentoo Biscoe 55.1 ... 5850.0 male 2009-01-01
#> 293 Chinstrap Dream 58.0 ... 3700.0 female 2007-01-01
#> 339 Chinstrap Dream 55.8 ... 4000.0 male 2009-01-01
#>
#> [6 rows x 8 columns]
6 공백문자제거
\
penguins =lambda x: x["species"].str.strip()) .assign(species
#> species island bill_length_mm ... body_mass_g sex year
#> 0 Adelie Torgersen 39.1 ... 3750.0 male 2007-01-01
#> 1 Adelie Torgersen 39.5 ... 3800.0 female 2007-01-01
#> 2 Adelie Torgersen 40.3 ... 3250.0 female 2007-01-01
#> 3 Adelie Torgersen NaN ... NaN NaN 2007-01-01
#> 4 Adelie Torgersen 36.7 ... 3450.0 female 2007-01-01
#> .. ... ... ... ... ... ... ...
#> 339 Chinstrap Dream 55.8 ... 4000.0 male 2009-01-01
#> 340 Chinstrap Dream 43.5 ... 3400.0 female 2009-01-01
#> 341 Chinstrap Dream 49.6 ... 3775.0 male 2009-01-01
#> 342 Chinstrap Dream 50.8 ... 4100.0 male 2009-01-01
#> 343 Chinstrap Dream 50.2 ... 3775.0 female 2009-01-01
#>
#> [344 rows x 8 columns]
7 칼럼 변환
def toupper(x):
return x.str.upper()
\
penguins apply(lambda x: toupper(x) if x.dtype == "object" else x) .
#> species island bill_length_mm ... body_mass_g sex year
#> 0 ADELIE TORGERSEN 39.1 ... 3750.0 MALE 2007-01-01
#> 1 ADELIE TORGERSEN 39.5 ... 3800.0 FEMALE 2007-01-01
#> 2 ADELIE TORGERSEN 40.3 ... 3250.0 FEMALE 2007-01-01
#> 3 ADELIE TORGERSEN NaN ... NaN NaN 2007-01-01
#> 4 ADELIE TORGERSEN 36.7 ... 3450.0 FEMALE 2007-01-01
#> .. ... ... ... ... ... ... ...
#> 339 CHINSTRAP DREAM 55.8 ... 4000.0 MALE 2009-01-01
#> 340 CHINSTRAP DREAM 43.5 ... 3400.0 FEMALE 2009-01-01
#> 341 CHINSTRAP DREAM 49.6 ... 3775.0 MALE 2009-01-01
#> 342 CHINSTRAP DREAM 50.8 ... 4100.0 MALE 2009-01-01
#> 343 CHINSTRAP DREAM 50.2 ... 3775.0 FEMALE 2009-01-01
#>
#> [344 rows x 8 columns]
8 특정문자 변환
\
penguins = lambda x: x["species"] + "_종") \
.assign(species_kr "species", "species_kr", "island"]] .loc[:, [
#> species species_kr island
#> 0 Adelie Adelie_종 Torgersen
#> 1 Adelie Adelie_종 Torgersen
#> 2 Adelie Adelie_종 Torgersen
#> 3 Adelie Adelie_종 Torgersen
#> 4 Adelie Adelie_종 Torgersen
#> .. ... ... ...
#> 339 Chinstrap Chinstrap_종 Dream
#> 340 Chinstrap Chinstrap_종 Dream
#> 341 Chinstrap Chinstrap_종 Dream
#> 342 Chinstrap Chinstrap_종 Dream
#> 343 Chinstrap Chinstrap_종 Dream
#>
#> [344 rows x 3 columns]
9 결측값 치환
= penguins["bill_length_mm"].mean()
mean_bill_length
\
penguins = lambda x: x["bill_length_mm"].fillna(mean_bill_length)) .assign(bill_length_mm
#> species island bill_length_mm ... body_mass_g sex year
#> 0 Adelie Torgersen 39.10000 ... 3750.0 male 2007-01-01
#> 1 Adelie Torgersen 39.50000 ... 3800.0 female 2007-01-01
#> 2 Adelie Torgersen 40.30000 ... 3250.0 female 2007-01-01
#> 3 Adelie Torgersen 43.92193 ... NaN NaN 2007-01-01
#> 4 Adelie Torgersen 36.70000 ... 3450.0 female 2007-01-01
#> .. ... ... ... ... ... ... ...
#> 339 Chinstrap Dream 55.80000 ... 4000.0 male 2009-01-01
#> 340 Chinstrap Dream 43.50000 ... 3400.0 female 2009-01-01
#> 341 Chinstrap Dream 49.60000 ... 3775.0 male 2009-01-01
#> 342 Chinstrap Dream 50.80000 ... 4100.0 male 2009-01-01
#> 343 Chinstrap Dream 50.20000 ... 3775.0 female 2009-01-01
#>
#> [344 rows x 8 columns]
10 Pivoting
폭넓은 데이터(wide-format data)를 긴 데이터(long-format data)로 혹은 반대로 데이터의 형태를 바꾼다.
10.1 Wide → Long
= pd.melt(penguins, id_vars=["species"],
penguins_long = penguins.columns[penguins.columns.str.contains("mm")],
value_vars = "측정변수",
var_name = "측정값")
value_name
penguins_long
#> species 측정변수 측정값
#> 0 Adelie bill_length_mm 39.1
#> 1 Adelie bill_length_mm 39.5
#> 2 Adelie bill_length_mm 40.3
#> 3 Adelie bill_length_mm NaN
#> 4 Adelie bill_length_mm 36.7
#> ... ... ... ...
#> 1027 Chinstrap flipper_length_mm 207.0
#> 1028 Chinstrap flipper_length_mm 202.0
#> 1029 Chinstrap flipper_length_mm 193.0
#> 1030 Chinstrap flipper_length_mm 210.0
#> 1031 Chinstrap flipper_length_mm 198.0
#>
#> [1032 rows x 3 columns]
10.2 long → Wide
= "species",
penguins_long.reset_index().pivot_table(index = "측정변수",
columns = "측정값",
values = 'mean') aggfunc
#> 측정변수 bill_depth_mm bill_length_mm flipper_length_mm
#> species
#> Adelie 18.346358 38.791391 189.953642
#> Chinstrap 18.420588 48.833824 195.823529
#> Gentoo 14.982114 47.504878 217.186992
11 변수 쪼개기
= penguins_long["측정변수"].str.extract(r"([a-z]+?)_([a-z]+?)_(mm)", expand=True)
extracted = ["부위", "측정", "단위"]
extracted.columns
**extracted) penguins_long.assign(
#> species 측정변수 측정값 부위 측정 단위
#> 0 Adelie bill_length_mm 39.1 bill length mm
#> 1 Adelie bill_length_mm 39.5 bill length mm
#> 2 Adelie bill_length_mm 40.3 bill length mm
#> 3 Adelie bill_length_mm NaN bill length mm
#> 4 Adelie bill_length_mm 36.7 bill length mm
#> ... ... ... ... ... ... ..
#> 1027 Chinstrap flipper_length_mm 207.0 flipper length mm
#> 1028 Chinstrap flipper_length_mm 202.0 flipper length mm
#> 1029 Chinstrap flipper_length_mm 193.0 flipper length mm
#> 1030 Chinstrap flipper_length_mm 210.0 flipper length mm
#> 1031 Chinstrap flipper_length_mm 198.0 flipper length mm
#>
#> [1032 rows x 6 columns]