대표적 표본추출 방법

표본
표본추출
단순임의추출
층화추출
군집추출
다단계추출
시각화
저자

이광춘

공개

2024-05-17

모집단에서 표본을 추출하는 대표적인 방법 4가지(단순임의추출, 층화추출, 군집추출, 다단계추출)의 개념을 시각적으로 확인해보자.

1 Shiny 앱

#| label: shinylive-sampling
#| viewerWidth: 800
#| viewerHeight: 700
#| standalone: true

# shiny supplies the UI/server functions used at the bottom of this file.
library(shiny)
# COL is loaded right after openintro is attached (presumably it ships with
# that package -- confirm); the plotting helpers index it for point, sample
# and border colors.
library(openintro)
data(COL)
# NOTE(review): showtext is presumably enabled so the Korean plot labels
# render with a usable font -- confirm.
library(showtext)
showtext_auto()

# _____ Simple Random _____ #
# Draw a population of N points scattered uniformly over the plotting area
# and highlight a simple random sample of them.
#
# Args:
#   n: requested sample size. Capped at N, because sampling without
#      replacement cannot return more units than the population holds.
#   N: population size (number of points drawn).
#
# Called for its side effect of drawing on the active graphics device.
build_srs <- function(n, N) {
  sample_col <- COL[4]
  sample_n <- min(n, N)

  plot(0, xlim = c(0, 2), ylim = 0:1, type = "n", axes = FALSE,
       xlab = "", ylab = "")
  box()

  # Population: uniform over the [0, 2] x [0, 1] canvas.
  x <- runif(N, 0, 2)
  y <- runif(N)
  points(x, y, col = COL[1], pch = 20)

  # Sampled units get a filled dot plus a larger open ring.
  picked <- sample.int(N, sample_n)
  points(x[picked], y[picked], pch = 20, cex = 0.8, col = sample_col)
  points(x[picked], y[picked], cex = 1.4, col = sample_col)
}


# _____ Stratified _____ #
# Draw numStrata circular strata side by side and highlight a random sample
# taken independently inside every stratum.
#
# Args:
#   N: total population size, split across the strata.
#   numStrata: number of strata to draw.
#   sampleSizePerStratum: units to sample in each stratum (capped at the
#     stratum's size).
#
# Called for its side effect of drawing on the active graphics device.
build_stratified <- function(N, numStrata, sampleSizePerStratum) {
  sample_col <- COL[4]

  plot(0, xlim = c(0, 2), ylim = 0:1 + 0.01,
       type = "n", axes = FALSE, xlab = "", ylab = "")
  box()

  # seq(..., length.out = 1) returns only the left end point (0.1), which
  # would push a lone stratum mostly off the canvas; centre it instead.
  X <- if (numStrata == 1) 1 else seq(0.1, 1.9, length.out = numStrata)
  Y <- rep(0.5, numStrata)

  # Randomly generate the size of each stratum: each one is either a "small"
  # or a "large" share, then all are rescaled so they add up to about N.
  base_size <- ceiling(N / numStrata)
  size_choices <- base_size * 0.5 + c(-1, 1) * base_size * 0.25
  strataSizes <- sample(size_choices, numStrata, replace = TRUE)
  strataSizes <- round(strataSizes / sum(strataSizes) * N)

  # Circle radius grows with the square root of the stratum size.
  R <- sqrt(strataSizes / 500)
  outline <- seq(0, 2 * pi, length.out = 99)

  for (i in seq_len(numStrata)) {
    polygon(X[i] + (R[i] + 0.01) * cos(outline),
            Y[i] + (R[i] + 0.01) * sin(outline),
            border = COL[5, 4])

    size <- strataSizes[i]
    x <- rep(NA_real_, size)
    y <- rep(NA_real_, size)
    # Rejection sampling: draw from the bounding square until the point falls
    # inside the circle. seq_len() skips the loop for an empty stratum, where
    # R[i] is 0 and no draw could ever be accepted.
    for (j in seq_len(size)) {
      repeat {
        xx <- runif(1, -R[i], R[i])
        yy <- runif(1, -R[i], R[i])
        if (sqrt(xx^2 + yy^2) < R[i]) {
          break
        }
      }
      x[j] <- xx
      y[j] <- yy
    }
    x <- X[i] + x
    y <- Y[i] + y
    points(x, y, pch = 20, col = COL[1])

    # Sampled units get a filled dot plus a larger open ring.
    these <- sample.int(size, min(sampleSizePerStratum, size))
    points(x[these], y[these], pch = 20, cex = 0.8, col = sample_col)
    points(x[these], y[these], cex = 1.4, col = sample_col)
  }

  # Label every stratum just above its circle.
  text(X, Y + R,
       paste("계층", seq_len(numStrata)),
       pos = 3,
       cex = 1.3)
}

# _____ Cluster _____ #
# Draw numClusters circular clusters and highlight every unit of the
# selected clusters (one-stage cluster sampling).
#
# Args:
#   numClusters: number of clusters to draw.
#   clusterSizes: numeric vector with the number of units in each cluster.
#   selectedClusters: indices of the clusters that were sampled.
#
# Called for its side effect of drawing on the active graphics device.
build_cluster <- function(numClusters, clusterSizes, selectedClusters) {
  sample_col <- COL[4]

  plot(0, xlim = c(0, 2), ylim = c(0.01, 1.04), type = "n", axes = FALSE,
       xlab = "", ylab = "")
  box()

  # seq(..., length.out = 1) returns only the left end point (0.1), which
  # would push a lone cluster partly off the canvas; centre it instead.
  X <- if (numClusters == 1) 1 else seq(0.1, 1.9, length.out = numClusters)
  Y <- runif(numClusters, 0.2, 0.8)
  # Circle radius grows with the square root of the cluster size.
  R <- sqrt(clusterSizes / 500)
  # Labels go above clusters in the upper half and below the others.
  above <- ifelse(Y > 0.5, 1, -1)
  outline <- seq(0, 2 * pi, length.out = 99)

  for (i in seq_len(numClusters)) {
    is_selected <- i %in% selectedClusters

    ring_x <- X[i] + (R[i] + 0.02) * cos(outline)
    ring_y <- Y[i] + (R[i] + 0.02) * sin(outline)
    polygon(ring_x, ring_y, border = COL[5, 4])
    if (is_selected) {
      # Dashed overlay marks a sampled cluster.
      polygon(ring_x, ring_y, border = COL[4], lty = 2, lwd = 1.5)
    }

    size <- clusterSizes[i]
    x <- rep(NA_real_, size)
    y <- rep(NA_real_, size)
    # Rejection sampling: draw from the bounding square until the point falls
    # inside the circle. seq_len() skips the loop for an empty cluster, where
    # R[i] is 0 and no draw could ever be accepted.
    for (j in seq_len(size)) {
      repeat {
        xx <- runif(1, -R[i], R[i])
        yy <- runif(1, -R[i], R[i])
        if (sqrt(xx^2 + yy^2) < R[i]) {
          break
        }
      }
      x[j] <- xx
      y[j] <- yy
    }
    x <- X[i] + x
    y <- Y[i] + y
    points(x, y, pch = 20, col = COL[1])

    if (is_selected) {
      # Every unit of a sampled cluster is part of the sample.
      points(x, y, pch = 20, cex = 0.8, col = sample_col)
      points(x, y, cex = 1.4, col = sample_col)
    }
  }

  text(X, Y + above * (R + 0.01),
       paste("군집", seq_len(numClusters)),
       pos = 2 + above,
       cex = 1.3)
}

# _____ Multistage Sampling _____ #
# Draw numClusters circular clusters and highlight a random sample of units
# taken inside every cluster.
#
# Args:
#   numClusters: number of clusters to draw.
#   sampleSizePerCluster: units to sample in each cluster (capped at the
#     cluster's size).
#   clusterSizes: numeric vector with the number of units in each cluster.
#
# Called for its side effect of drawing on the active graphics device.
build_multistage <- function(numClusters, sampleSizePerCluster, clusterSizes) {
  sample_col <- COL[4]

  plot(0, xlim = c(0, 2), ylim = c(0.01, 1.04), type = "n", axes = FALSE,
       xlab = "", ylab = "")
  box()

  # seq(..., length.out = 1) returns only the left end point (0.1), which
  # would push a lone cluster partly off the canvas; centre it instead.
  X <- if (numClusters == 1) 1 else seq(0.1, 1.9, length.out = numClusters)
  Y <- runif(numClusters, 0.2, 0.8)
  # Circle radius grows with the square root of the cluster size.
  R <- sqrt(clusterSizes / 500)
  # Labels go above clusters in the upper half and below the others.
  above <- ifelse(Y > 0.5, 1, -1)
  outline <- seq(0, 2 * pi, length.out = 99)

  for (i in seq_len(numClusters)) {
    polygon(X[i] + (R[i] + 0.02) * cos(outline),
            Y[i] + (R[i] + 0.02) * sin(outline),
            border = COL[5, 4])

    size <- clusterSizes[i]
    x <- rep(NA_real_, size)
    y <- rep(NA_real_, size)
    # Rejection sampling: draw from the bounding square until the point falls
    # inside the circle. seq_len() skips the loop for an empty cluster, where
    # R[i] is 0 and no draw could ever be accepted.
    for (j in seq_len(size)) {
      repeat {
        xx <- runif(1, -R[i], R[i])
        yy <- runif(1, -R[i], R[i])
        if (sqrt(xx^2 + yy^2) < R[i]) {
          break
        }
      }
      x[j] <- xx
      y[j] <- yy
    }
    x <- X[i] + x
    y <- Y[i] + y
    points(x, y, pch = 20, col = COL[1])

    # Sampled units get a filled dot plus a larger open ring.
    these <- sample.int(size, min(sampleSizePerCluster, size))
    points(x[these], y[these], pch = 20, cex = 0.8, col = sample_col)
    points(x[these], y[these], cex = 1.4, col = sample_col)
  }

  text(X, Y + above * (R + 0.01),
       paste("군집", seq_len(numClusters)),
       pos = 2 + above, cex = 1.3)
}

# ui -----
# Page layout: a sidebar with the method selector and the sliders that apply
# to the chosen method, next to a main panel holding the plot.
ui <- fluidPage(
  titlePanel(title = "표본 추출 방법 시각화"),
  sidebarLayout(
    sidebarPanel = sidebarPanel(
      radioButtons(
        inputId = "method",
        label = "표본 추출 방법 선택:",
        choices = c(
          "단순 무작위 표본 추출" = "srs",
          "층화 표본 추출" = "stratified",
          "군집 표본 추출" = "cluster",
          "다단계 표본 추출" = "multistage"
        ),
        inline = TRUE
      ),
      # Each conditional panel is shown only while its method is selected.
      conditionalPanel(
        condition = "input.method == 'srs'",
        sliderInput(
          inputId = "sampleSize", label = "표본 크기:",
          min = 1, max = 100, value = 10
        )
      ),
      conditionalPanel(
        condition = "input.method == 'stratified'",
        sliderInput(
          inputId = "numStrata", label = "층(Stratum) 수:",
          min = 1, max = 5, value = 3
        ),
        sliderInput(
          inputId = "sampleSizePerStratum", label = "층 내부 표본 크기:",
          min = 1, max = 10, value = 3
        )
      ),
      conditionalPanel(
        condition = "input.method == 'cluster'",
        sliderInput(
          inputId = "numClusters", label = "군집 수:",
          min = 1, max = 9, value = 3
        )
      ),
      conditionalPanel(
        condition = "input.method == 'multistage'",
        sliderInput(
          inputId = "numClustersMultistage", label = "군집 수:",
          min = 1, max = 9, value = 3
        ),
        sliderInput(
          inputId = "sampleSizePerClusterMultistage",
          label = "군집 내부 표본 크기:",
          min = 1, max = 10, value = 3
        )
      )
    ),
    mainPanel = mainPanel(
      plotOutput(outputId = "samplingPlot")
    )
  )
)

# Server logic: redraw the plot whenever the selected method or one of its
# sliders changes, delegating the drawing to the matching build_* helper.
server <- function(input, output) {
  # Draw k random cluster sizes from a randomly chosen range: the lower bound
  # lies in 5..30 and the upper bound between (lower + 5) and 50.
  random_cluster_sizes <- function(k) {
    size_min <- sample(5:30, 1)
    size_max <- sample((size_min + 5):50, 1)
    sample(size_min:size_max, k, replace = TRUE)
  }

  output$samplingPlot <- renderPlot({
    switch(input$method,
      srs = build_srs(n = input$sampleSize, N = 100),
      stratified = build_stratified(
        N = 100,
        numStrata = input$numStrata,
        sampleSizePerStratum = input$sampleSizePerStratum
      ),
      cluster = {
        k <- input$numClusters
        # Select about a third of the clusters, but never zero:
        # round(1 / 3) is 0, which used to leave the plot with no sample.
        selected <- sample.int(k, max(1, round(k / 3)))
        build_cluster(
          numClusters = k,
          clusterSizes = random_cluster_sizes(k),
          selectedClusters = selected
        )
      },
      multistage = build_multistage(
        numClusters = input$numClustersMultistage,
        sampleSizePerCluster = input$sampleSizePerClusterMultistage,
        clusterSizes = random_cluster_sizes(input$numClustersMultistage)
      )
    )
  })
}

# Build the Shiny app object from the ui and server defined above.
shinyApp(ui, server)

2 코딩

라이센스

CC BY-SA-NC & GPL-3