library(reticulate)
py_install("siuba")
데이터 다루기
데이터프레임에 대해 살펴봅니다.
1 dplyr
동사
- arrange (다시 정렬하기) : 행을 다시 정렬한다.
- select (변수 선택하기) : 변수명으로 특정 칼럼을 추출한다.
- mutate (변수 추가하기) : 새로운 변수를 추가한다.
- filter (관측점 필터링) : 특정 기준을 만족하는 행을 추출한다.
- summarise (변수를 값으로 줄이기) : 변수를 값(스칼라)으로 요약한다.
2 팔머 펭귄
2.1 데이터셋
library(dplyr)
library(palmerpenguins)
penguins
penguins #> # A tibble: 344 × 8
#> species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
#> 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
#> 2 Adelie Torgersen 39.5 17.4 186 3800 fema… 2007
#> 3 Adelie Torgersen 40.3 18 195 3250 fema… 2007
#> 4 Adelie Torgersen NA NA NA NA <NA> 2007
#> 5 Adelie Torgersen 36.7 19.3 193 3450 fema… 2007
#> 6 Adelie Torgersen 39.3 20.6 190 3650 male 2007
#> 7 Adelie Torgersen 38.9 17.8 181 3625 fema… 2007
#> 8 Adelie Torgersen 39.2 19.6 195 4675 male 2007
#> 9 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007
#> 10 Adelie Torgersen 42 20.2 190 4250 <NA> 2007
#> # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm,
#> # ²body_mass_g
2.2 오름/내림 차순 정렬
%>%
penguins arrange(bill_length_mm)
#> # A tibble: 344 × 8
#> species island bill_length_mm bill_depth_mm flipper_…¹ body_…² sex year
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
#> 1 Adelie Dream 32.1 15.5 188 3050 fema… 2009
#> 2 Adelie Dream 33.1 16.1 178 2900 fema… 2008
#> 3 Adelie Torgersen 33.5 19 190 3600 fema… 2008
#> 4 Adelie Dream 34 17.1 185 3400 fema… 2008
#> 5 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007
#> 6 Adelie Torgersen 34.4 18.4 184 3325 fema… 2007
#> 7 Adelie Biscoe 34.5 18.1 187 2900 fema… 2008
#> 8 Adelie Torgersen 34.6 21.1 198 4400 male 2007
#> 9 Adelie Torgersen 34.6 17.2 189 3200 fema… 2008
#> 10 Adelie Biscoe 35 17.9 190 3450 fema… 2008
#> # … with 334 more rows, and abbreviated variable names ¹flipper_length_mm,
#> # ²body_mass_g
2.3 변수 선택
%>%
penguins select(species, bill_length_mm, sex)
#> # A tibble: 344 × 3
#> species bill_length_mm sex
#> <fct> <dbl> <fct>
#> 1 Adelie 39.1 male
#> 2 Adelie 39.5 female
#> 3 Adelie 40.3 female
#> 4 Adelie NA <NA>
#> 5 Adelie 36.7 female
#> 6 Adelie 39.3 male
#> 7 Adelie 38.9 female
#> 8 Adelie 39.2 male
#> 9 Adelie 34.1 <NA>
#> 10 Adelie 42 <NA>
#> # … with 334 more rows
2.4 관측점 필터링
%>%
penguins filter(island == "Dream")
#> # A tibble: 124 × 8
#> species island bill_length_mm bill_depth_mm flipper_len…¹ body_…² sex year
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
#> 1 Adelie Dream 39.5 16.7 178 3250 fema… 2007
#> 2 Adelie Dream 37.2 18.1 178 3900 male 2007
#> 3 Adelie Dream 39.5 17.8 188 3300 fema… 2007
#> 4 Adelie Dream 40.9 18.9 184 3900 male 2007
#> 5 Adelie Dream 36.4 17 195 3325 fema… 2007
#> 6 Adelie Dream 39.2 21.1 196 4150 male 2007
#> 7 Adelie Dream 38.8 20 190 3950 male 2007
#> 8 Adelie Dream 42.2 18.5 180 3550 fema… 2007
#> 9 Adelie Dream 37.6 19.3 181 3300 fema… 2007
#> 10 Adelie Dream 39.8 19.1 184 4650 male 2007
#> # … with 114 more rows, and abbreviated variable names ¹flipper_length_mm,
#> # ²body_mass_g
2.5 변수 생성
\[BMI = \frac{체중(kg)}{키(미터)^2}\]
\[Bill(부리) = \frac{\text{부리길이}}{부리깊이} \times 2 \]
%>%
penguins select(species, island, bill_length_mm, bill_depth_mm) %>%
mutate(bill = bill_length_mm / bill_depth_mm * 2)
#> # A tibble: 344 × 5
#> species island bill_length_mm bill_depth_mm bill
#> <fct> <fct> <dbl> <dbl> <dbl>
#> 1 Adelie Torgersen 39.1 18.7 4.18
#> 2 Adelie Torgersen 39.5 17.4 4.54
#> 3 Adelie Torgersen 40.3 18 4.48
#> 4 Adelie Torgersen NA NA NA
#> 5 Adelie Torgersen 36.7 19.3 3.80
#> 6 Adelie Torgersen 39.3 20.6 3.82
#> 7 Adelie Torgersen 38.9 17.8 4.37
#> 8 Adelie Torgersen 39.2 19.6 4
#> 9 Adelie Torgersen 34.1 18.1 3.77
#> 10 Adelie Torgersen 42 20.2 4.16
#> # … with 334 more rows
2.6 요약
%>%
penguins ::drop_na() %>%
tidyrgroup_by(species) %>%
summarise(부리길이평균 = mean(bill_length_mm))
#> # A tibble: 3 × 2
#> species 부리길이평균
#> <fct> <dbl>
#> 1 Adelie 38.8
#> 2 Chinstrap 48.8
#> 3 Gentoo 47.6
3 파이썬 Suiba
[
]{.aside}
3.1 데이터셋
import pandas as pd
= pd.read_csv("https://raw.githubusercontent.com/mcnakhaee/palmerpenguins/master/palmerpenguins/data/penguins.csv")
penguins penguins.head()
#> species island bill_length_mm ... body_mass_g sex year
#> 0 Adelie Torgersen 39.1 ... 3750.0 male 2007
#> 1 Adelie Torgersen 39.5 ... 3800.0 female 2007
#> 2 Adelie Torgersen 40.3 ... 3250.0 female 2007
#> 3 Adelie Torgersen NaN ... NaN NaN 2007
#> 4 Adelie Torgersen 36.7 ... 3450.0 female 2007
#>
#> [5 rows x 8 columns]
3.2 오름/내림 차순 정렬
import siuba as si
( penguins>> si.arrange(si.__.bill_length_mm)
)
#> species island bill_length_mm ... body_mass_g sex year
#> 142 Adelie Dream 32.1 ... 3050.0 female 2009
#> 98 Adelie Dream 33.1 ... 2900.0 female 2008
#> 70 Adelie Torgersen 33.5 ... 3600.0 female 2008
#> 92 Adelie Dream 34.0 ... 3400.0 female 2008
#> 8 Adelie Torgersen 34.1 ... 3475.0 NaN 2007
#> .. ... ... ... ... ... ... ...
#> 253 Gentoo Biscoe 55.9 ... 5600.0 male 2009
#> 293 Chinstrap Dream 58.0 ... 3700.0 female 2007
#> 185 Gentoo Biscoe 59.6 ... 6050.0 male 2007
#> 3 Adelie Torgersen NaN ... NaN NaN 2007
#> 271 Gentoo Biscoe NaN ... NaN NaN 2009
#>
#> [344 rows x 8 columns]
3.3 변수 선택
import siuba as si
( penguins >> si.select(si._.species, si._.bill_length_mm, si._.sex)
)
#> species bill_length_mm sex
#> 0 Adelie 39.1 male
#> 1 Adelie 39.5 female
#> 2 Adelie 40.3 female
#> 3 Adelie NaN NaN
#> 4 Adelie 36.7 female
#> .. ... ... ...
#> 339 Chinstrap 55.8 male
#> 340 Chinstrap 43.5 female
#> 341 Chinstrap 49.6 male
#> 342 Chinstrap 50.8 male
#> 343 Chinstrap 50.2 female
#>
#> [344 rows x 3 columns]
3.4 관측점 필터링
import siuba as si
( penguins >> si.filter(si._.island == "Dream")
)
#> species island bill_length_mm ... body_mass_g sex year
#> 30 Adelie Dream 39.5 ... 3250.0 female 2007
#> 31 Adelie Dream 37.2 ... 3900.0 male 2007
#> 32 Adelie Dream 39.5 ... 3300.0 female 2007
#> 33 Adelie Dream 40.9 ... 3900.0 male 2007
#> 34 Adelie Dream 36.4 ... 3325.0 female 2007
#> .. ... ... ... ... ... ... ...
#> 339 Chinstrap Dream 55.8 ... 4000.0 male 2009
#> 340 Chinstrap Dream 43.5 ... 3400.0 female 2009
#> 341 Chinstrap Dream 49.6 ... 3775.0 male 2009
#> 342 Chinstrap Dream 50.8 ... 4100.0 male 2009
#> 343 Chinstrap Dream 50.2 ... 3775.0 female 2009
#>
#> [124 rows x 8 columns]
3.5 변수 생성
\[BMI = \frac{체중(kg)}{키(미터)^2}\]
\[Bill(부리) = \frac{\text{부리길이}}{부리깊이} \times 2 \]
import siuba as si
( penguins >> si.select(si._.species, si._.island, si._.bill_length_mm, si._.bill_depth_mm)
>> si.mutate(bill = si._.bill_length_mm / si._.bill_depth_mm * 2)
)
#> species island bill_length_mm bill_depth_mm bill
#> 0 Adelie Torgersen 39.1 18.7 4.181818
#> 1 Adelie Torgersen 39.5 17.4 4.540230
#> 2 Adelie Torgersen 40.3 18.0 4.477778
#> 3 Adelie Torgersen NaN NaN NaN
#> 4 Adelie Torgersen 36.7 19.3 3.803109
#> .. ... ... ... ... ...
#> 339 Chinstrap Dream 55.8 19.8 5.636364
#> 340 Chinstrap Dream 43.5 18.1 4.806630
#> 341 Chinstrap Dream 49.6 18.2 5.450549
#> 342 Chinstrap Dream 50.8 19.0 5.347368
#> 343 Chinstrap Dream 50.2 18.7 5.368984
#>
#> [344 rows x 5 columns]
3.6 요약
import siuba as si
( penguins >> si.group_by(si._.species)
>> si.summarize(부리길이평균 = si._.bill_length_mm.mean())
)
#> species 부리길이평균
#> 0 Adelie 38.791391
#> 1 Chinstrap 48.833824
#> 2 Gentoo 47.504878
4 Pandas 문법
칼럼(변수)를 선택하는 다양한 방법
4.1 df.column_name
penguins.island#> 0 Torgersen
#> 1 Torgersen
#> 2 Torgersen
#> 3 Torgersen
#> 4 Torgersen
#> ...
#> 339 Dream
#> 340 Dream
#> 341 Dream
#> 342 Dream
#> 343 Dream
#> Name: island, Length: 344, dtype: object
4.2 df['column_name']
'island']
penguins[#> 0 Torgersen
#> 1 Torgersen
#> 2 Torgersen
#> 3 Torgersen
#> 4 Torgersen
#> ...
#> 339 Dream
#> 340 Dream
#> 341 Dream
#> 342 Dream
#> 343 Dream
#> Name: island, Length: 344, dtype: object
4.3 df[['column_name']]
'island']]
penguins[[#> island
#> 0 Torgersen
#> 1 Torgersen
#> 2 Torgersen
#> 3 Torgersen
#> 4 Torgersen
#> .. ...
#> 339 Dream
#> 340 Dream
#> 341 Dream
#> 342 Dream
#> 343 Dream
#>
#> [344 rows x 1 columns]
4.4 df.loc[:, 'column_name']
'island']
penguins.loc[:, #> 0 Torgersen
#> 1 Torgersen
#> 2 Torgersen
#> 3 Torgersen
#> 4 Torgersen
#> ...
#> 339 Dream
#> 340 Dream
#> 341 Dream
#> 342 Dream
#> 343 Dream
#> Name: island, Length: 344, dtype: object
4.5 df.iloc[:, column_index]
1]
penguins.iloc[:, #> 0 Torgersen
#> 1 Torgersen
#> 2 Torgersen
#> 3 Torgersen
#> 4 Torgersen
#> ...
#> 339 Dream
#> 340 Dream
#> 341 Dream
#> 342 Dream
#> 343 Dream
#> Name: island, Length: 344, dtype: object
5 파이프 vs 체인줄
5.1 사례
부리길이(bill_length_mm)가 5 이상이고,
펭귄종과 부리길이 칼럼만 선택해서
펭귄종별로
평균 부리길이를 구하세요.
5.1.1 dplyr 파이프
%>%
penguins filter(bill_length_mm > 5) %>%
select(species, bill_length_mm) %>%
group_by(species) %>%
summarize(mean_bill_length = mean(bill_length_mm))
#> # A tibble: 3 × 2
#> species mean_bill_length
#> <fct> <dbl>
#> 1 Adelie 38.8
#> 2 Chinstrap 48.8
#> 3 Gentoo 47.5
5.1.2 siuba 파이프
import siuba as si
( penguins >> si.filter(si._.bill_length_mm > 5)
>> si.select(si._.species, si._.bill_length_mm)
>> si.group_by(si._.species)
>> si.summarize(부리길이평균 = si._.bill_length_mm.mean())
)
#> species 부리길이평균
#> 0 Adelie 38.791391
#> 1 Chinstrap 48.833824
#> 2 Gentoo 47.504878
5.1.3 pandas 체이닝
import pandas as pd
\
penguins > 5] \
.loc[penguins.bill_length_mm "species", "bill_length_mm"]] \
.loc[:, ["species") \
.groupby( .bill_length_mm.mean()
#> species
#> Adelie 38.791391
#> Chinstrap 48.833824
#> Gentoo 47.504878
#> Name: bill_length_mm, dtype: float64
6 SQL 쿼리문
library(dbplyr)
<- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
con copy_to(con, penguins)
<- tbl(con, "penguins")
penguins_db
%>%
penguins_db filter(bill_length_mm > 5) %>%
select(species, bill_length_mm) %>%
group_by(species) %>%
summarize(mean_bill_length = mean(bill_length_mm)) %>%
show_query()
#> <SQL>
#> SELECT `species`, AVG(`bill_length_mm`) AS `mean_bill_length`
#> FROM (
#> SELECT `species`, `bill_length_mm`
#> FROM `penguins`
#> WHERE (`bill_length_mm` > 5.0)
#> )
#> GROUP BY `species`