Misinterpretation of Error Bars: Error bars are frequently misunderstood in terms of what they signify about statistical significance, particularly when comparing the overlap of error bars from different data sets.
Confusing SD with SEM: Standard deviation (SD) and standard error of the mean (SEM) are often mixed up. SD error bars describe the variability of the individual observations, whereas SEM error bars describe the uncertainty of the estimated mean and therefore shrink as the sample size grows (SEM = SD / sqrt(n)).
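Misconception about Confidence Intervals: A common error is the belief that a computed 95% confidence interval (CI) has a 95% probability of containing the true population mean. This misreads how confidence intervals work: once computed, a given interval either contains the true mean or it does not. The 95% describes the procedure: if the sampling were repeated many times, about 95% of the intervals constructed this way would contain the true mean.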
library(tidyverse)
# Simulate 100 ratings from a normal distribution (mean 3, SD 0.5); the fixed
# seed makes the draw reproducible.
set.seed(123)
ratings <- tibble(rating = rnorm(100, mean = 3, sd = 0.5))

# One-row summary: the sample mean plus the half-width of each kind of error
# bar. The CI half-widths use Student's t quantiles with n - 1 degrees of
# freedom because the SD is estimated from the sample; normal quantiles would
# make the intervals slightly too narrow.
stats <- summarise(
  ratings,
  mean_rating = mean(rating),
  sd_rating = sd(rating),
  se_rating = sd_rating / sqrt(n()),
  ci80 = qt(1 - 0.10 / 2, df = n() - 1) * se_rating,
  ci95 = qt(1 - 0.05 / 2, df = n() - 1) * se_rating,
  ci99 = qt(1 - 0.01 / 2, df = n() - 1) * se_rating
)
# Long-format plotting table. Position 1 holds the raw observations (no error
# bar, hence NA bounds); the following positions hold one error bar each, all
# centred on the sample mean.
data <- tibble(
  x = 1, # length-1 values are recycled to the number of observations
  y = ratings$rating,
  ymin = NA_real_,
  ymax = NA_real_,
  group = "Jitter"
)

# Half-width of every error bar, keyed by its group label. Building the vector
# once keeps ymin and ymax in sync and removes the hard-coded bar count.
half_widths <- c(
  SD = stats$sd_rating,
  SE = stats$se_rating,
  CI80 = stats$ci80,
  CI95 = stats$ci95,
  CI99 = stats$ci99
)

stats_data <- tibble(
  x = seq_along(half_widths) + 1L, # positions 2, 3, ... after the raw data
  y = stats$mean_rating,
  ymin = stats$mean_rating - unname(half_widths),
  ymax = stats$mean_rating + unname(half_widths),
  group = names(half_widths)
)

data <- bind_rows(data, stats_data)
head(data)
x | y | ymin | ymax | group |
---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <chr> |
1 | 2.719762 | NA | NA | Jitter |
1 | 2.884911 | NA | NA | Jitter |
1 | 3.779354 | NA | NA | Jitter |
1 | 3.035254 | NA | NA | Jitter |
1 | 3.064644 | NA | NA | Jitter |
1 | 3.857532 | NA | NA | Jitter |
# Fixed colour per series so the raw data and each error-bar type are easy to
# tell apart.
colors <- c(
  "Jitter" = "grey60", "SD" = "blue", "SE" = "red",
  "CI80" = "green", "CI95" = "purple", "CI99" = "orange"
)

# Raw observations at the first x position, followed by one error bar per
# x position.
p <- ggplot(
  data,
  aes(x = as.factor(x), y = y, ymin = ymin, ymax = ymax, color = group)
) +
  # height = 0 spreads the points horizontally only; without it the default
  # jitter would also perturb the plotted y values.
  geom_jitter(
    data = filter(data, group == "Jitter"),
    width = 0.05,
    height = 0
  ) +
  # Each x position carries a single bar, so no dodging is needed.
  geom_errorbar(data = filter(data, group != "Jitter"), width = 0.1) +
  scale_color_manual(values = colors) +
  scale_x_discrete(labels = c("Sample", "+/- SD", "+/- SE", "80% CI", "95% CI", "99% CI")) +
  # The x-axis title set here is hidden by axis.title.x = element_blank() below.
  labs(title = "Different Error Bars for Simulated Data", x = "Simulated Data", y = "") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.title = element_blank()
  )
print(p)
library(ggplot2)
# Two samples of equal size n whose means differ by one unit.
mean1 <- 5
mean2 <- 4
n <- 10
alpha <- 0.05

# Two-sided critical t value at level alpha with 2 * (n - 1) degrees of freedom.
t_critical <- qt(1 - alpha / 2, df = (n - 1) * 2)

# Common SD for which |mean1 - mean2| / (SD * sqrt(2 / n)) equals t_critical,
# i.e. the difference between the means sits exactly on the alpha threshold.
std_dev_estimated <- abs(mean1 - mean2) / t_critical * sqrt(n / 2)

# Same n and same SD in both samples, so SEM and CI half-width are shared.
# NOTE(review): the CI reuses the 2 * (n - 1) df critical value; a CI computed
# from one sample alone would use n - 1 df -- confirm which is intended.
sem1 <- sem2 <- std_dev_estimated / sqrt(n)
ci_half_width1 <- ci_half_width2 <- t_critical * sem1
# One row per (error type, sample) pair: the bar height plus the lower and
# upper end of its error bar. Built inside local() so the helper vectors do not
# leak into the workspace.
data <- local({
  centre <- c(mean1, mean2, mean1, mean2, mean1, mean2)
  half_width <- c(
    std_dev_estimated, std_dev_estimated,
    sem1, sem2,
    ci_half_width1, ci_half_width2
  )
  data.frame(
    type = rep(c("STD", "SEM", "CI"), each = 2),
    sample = factor(rep(c("Sample 1", "Sample 2"), 3)),
    mean = centre,
    ymin = centre - half_width,
    ymax = centre + half_width
  )
})
# Side-by-side bars of the two sample means for each error-bar type, with the
# matching error bar drawn over every bar.
p <- ggplot(data, aes(x = type, y = mean, fill = sample))

# Bars and error bars share the same dodge width so they stay aligned.
p <- p + geom_col(position = position_dodge(width = 0.8), width = 0.7)
p <- p + geom_errorbar(
  aes(ymin = ymin, ymax = ymax),
  position = position_dodge(width = 0.8),
  width = 0.25
)

p <- p +
  theme_minimal() +
  labs(title = "Grouped by Error Types with P-value = 0.05", y = "Mean Value") +
  scale_fill_brewer(palette = "Pastel1", name = "Sample")
print(p)