신뢰구간은 통계학에서 모수 추정에 사용되는 중요한 개념입니다. 신뢰구간은 표본에서 계산된 추정량이 실제 모수를 포함할 확률을 나타내는 구간을 의미합니다. 신뢰구간은 모수 추정의 불확실성을 고려하여 모수 추정에 대한 더 정확한 정보를 제공합니다. 대표적으로 모집단 평균과 모집단 비율에 대한 신뢰구간을 계산하는 방법을 시각적으로 확인할 수 있습니다.
1 Shiny 앱
#| label: shinylive-infer-ci
#| viewerWidth: 800
#| viewerHeight: 600
#| standalone: true
library(shiny)
library(ggplot2)
library(tidyverse)
library(showtext)
showtext_auto()
ui <- fluidPage(
title = "신뢰구간 모의실험",
fluidRow(
column(3,
h2('신뢰구간'),
hr(),
radioButtons("case", "평균과 비율 추정 선택:",
choices = c("모집단 평균", "모집단 비율"),
inline = TRUE,
selected = "모집단 평균"),
hr(),
conditionalPanel(
condition = "input.case == '모집단 비율'",
sliderInput("pop_prop",
"모집단 비율:",
min = 0,
max = 1,
step = 0.05,
value = 0.4)
),
conditionalPanel(
condition = "input.case == '모집단 평균'",
numericInput("pop_mean", "모집단 평균:", value = 10),
numericInput("pop_sd", "모집단 표준편차:", value = 5, min = 0)
),
hr(),
sliderInput("sample_size",
"표본 크기:",
min = 10,
max = 1000,
step = 10,
value = 100),
sliderInput("n_samples",
"표본 수:",
min = 0,
max = 500,
step = 10,
value = 10),
sliderInput("conf_level",
"신뢰 수준:",
min = 5,
max = 99,
step = 1,
value = 95)
),
column(8,
mainPanel(
plotOutput('IntervalPlot', height = '500px'),
verbatimTextOutput("summary"),
verbatimTextOutput("ci")
)
)
)
)
server <- function(input, output) {
output$IntervalPlot <- renderPlot({
n_samples = input$n_samples
sample_size = input$sample_size
alpha = 1 - (input$conf_level / 100)
if (input$case == "모집단 비율") {
n_population <- 100000
population <- rbernoulli(n_population, p = input$pop_prop)
true_value <- mean(population)
sample_stats <- vector('numeric', n_samples)
upper_cis <- vector('numeric', n_samples)
lower_cis <- vector('numeric', n_samples)
contains_true_value <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stat <- mean(sample)
sample_std <- (sqrt(sample_stat * (1 - sample_stat)) / sqrt(sample_size))
z_value <- qnorm(alpha/2, lower.tail = FALSE)
margin_of_error <- z_value * sample_std
sample_stats[i] <- sample_stat
upper_cis[i] <- sample_stat + margin_of_error
lower_cis[i] <- sample_stat - margin_of_error
contains_true_value[i] <- ifelse(true_value >= lower_cis[i] & true_value <= upper_cis[i], 1, 0)
}
title_text <- paste(input$conf_level, "% 신뢰구간 (모집단 비율)", sep = " ")
y_label <- "표본 비율"
} else if (input$case == "모집단 평균") {
n_population <- 100000
population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd)
true_value <- mean(population)
sample_stats <- vector('numeric', n_samples)
upper_cis <- vector('numeric', n_samples)
lower_cis <- vector('numeric', n_samples)
contains_true_value <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stat <- mean(sample)
sample_std <- sd(sample) / sqrt(sample_size)
t_value <- qt(alpha/2, df = sample_size - 1, lower.tail = FALSE)
margin_of_error <- t_value * sample_std
sample_stats[i] <- sample_stat
upper_cis[i] <- sample_stat + margin_of_error
lower_cis[i] <- sample_stat - margin_of_error
contains_true_value[i] <- ifelse(true_value >= lower_cis[i] & true_value <= upper_cis[i], 1, 0)
}
title_text <- paste(input$conf_level, "% 신뢰구간 (모집단 평균)", sep = " ")
y_label <- "표본 평균"
}
subtitle_text <- paste0('관측된 포함률: ',
round(mean(contains_true_value), 2) * 100, '%')
contains_true_value_f <- ifelse(contains_true_value==1, "포함", "미포함")
contains_true_value_f <- factor(contains_true_value_f)
p <- tibble(
sample_stat = sample_stats,
upper_ci = upper_cis,
lower_ci = lower_cis) %>%
mutate(sample_number = as.factor(row_number())) %>%
ggplot(aes(x = sample_number, y = sample_stat)) +
coord_flip() +
ggtitle(title_text, subtitle = subtitle_text) +
ylab(y_label) +
xlab("") +
theme_minimal() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.x = element_line(color="black", size = 0.5),
legend.position = "none") +
scale_x_discrete(breaks = NULL)
p <- p + geom_hline(aes(yintercept = true_value),
linetype = 'dashed', size = 1)
p <- p +
geom_linerange(aes(ymin = lower_ci, ymax = upper_ci, color=contains_true_value_f)) +
geom_point(aes(color=contains_true_value_f), size=2) +
scale_color_manual(values=c("포함" = "#009E73", "미포함" = "#D55E00"))
p
})
output$summary <- renderPrint({
n_samples = input$n_samples
sample_size = input$sample_size
if (input$case == "모집단 비율") {
n_population <- 100000
population <- rbernoulli(n_population, p = input$pop_prop)
sample_stats <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stats[i] <- mean(sample)
}
cat("표본 비율의 평균(점추정):\n")
print(mean(sample_stats))
} else if (input$case == "모집단 평균") {
n_population <- 100000
population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd)
sample_stats <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stats[i] <- mean(sample)
}
cat("표본 평균의 평균(점 추정):\n")
print(mean(sample_stats))
}
})
output$ci <- renderPrint({
n_samples = input$n_samples
sample_size = input$sample_size
alpha = 1 - (input$conf_level / 100)
if (input$case == "모집단 비율") {
n_population <- 100000
population <- rbernoulli(n_population, p = input$pop_prop)
sample_stats <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stats[i] <- mean(sample)
}
se <- sqrt(mean(sample_stats) * (1 - mean(sample_stats)) / sample_size)
lower_ci <- mean(sample_stats) - qnorm(1 - alpha/2) * se
upper_ci <- mean(sample_stats) + qnorm(1 - alpha/2) * se
cat(paste0(input$conf_level, "% 신뢰구간 (모집단 비율):\n"))
cat(paste0("[", round(lower_ci, 2), ", ", round(upper_ci, 2), "]"))
} else if (input$case == "모집단 평균") {
n_population <- 100000
population <- rnorm(n_population, mean = input$pop_mean, sd = input$pop_sd)
sample_stats <- vector('numeric', n_samples)
for (i in 1:n_samples) {
sample <- sample(population, sample_size, replace = FALSE)
sample_stats[i] <- mean(sample)
}
se <- sd(sample_stats) / sqrt(n_samples)
lower_ci <- mean(sample_stats) - qt(1 - alpha/2, df = n_samples - 1) * se
upper_ci <- mean(sample_stats) + qt(1 - alpha/2, df = n_samples - 1) * se
cat(paste0(input$conf_level, "% 신뢰구간 (모집단 평균):\n"))
cat(paste0("[", round(lower_ci, 2), ", ", round(upper_ci, 2), "]"))
}
})
}
shinyApp(ui = ui, server = server)
2 코딩
라이센스
CC BY-SA-NC & GPL-3