workshops

Correlation and Causation

if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
if (!requireNamespace("corrplot", quietly = TRUE)) install.packages("corrplot")

# Load packages
library(ggplot2)
library(corrplot)
corrplot 0.92 loaded

Visualize Correlation with Synthetic Data

# Generate data with correlation
set.seed(123)
n <- 100
x <- rnorm(n)
y <- x + rnorm(n)

correlation_coeff <- cor(x, y)

# ggplot
library(ggplot2)
df <- data.frame(x = x, y = y)
ggplot(df, aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Scatterplot showing correlation", x = "X", y = "Y") +
  annotate("text", x = 1, y = 4, label = paste("Correlation coefficient:", round(correlation_coeff, 2)))

`geom_smooth()` using formula = 'y ~ x'

svg

Visualize Correlations in MTCARS data

data("mtcars")
head(mtcars)
A data.frame: 6 × 11
mpgcyldisphpdratwtqsecvsamgearcarb
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
Mazda RX421.061601103.902.62016.460144
Mazda RX4 Wag21.061601103.902.87517.020144
Datsun 71022.84108 933.852.32018.611141
Hornet 4 Drive21.462581103.083.21519.441031
Hornet Sportabout18.783601753.153.44017.020032
Valiant18.162251052.763.46020.221031
cor_matrix <- cor(mtcars)
corrplot(cor_matrix, method = "circle")

svg

cor_matrix <- cor(mtcars)

corrplot(cor_matrix, method = "circle", type = "upper", order = "hclust",
         addCoef.col = "black", # Color of the correlation coefficients
         tl.col = "black", tl.srt = 45, # Color and rotation of the labels
         title = "Correlation Matrix of Mtcars Dataset")

svg

cor_matrix <- cor(mtcars, method="spearman")
corrplot(cor_matrix, method = "circle", type = "upper", order = "hclust",
         addCoef.col = "black",
         tl.col = "black", tl.srt = 45,
         title = "Correlation Matrix of Mtcars Dataset")

svg

Choosing Between Spearman’s and Pearson’s Correlation

Pearson’s Correlation Coefficient

Spearman’s Rank Correlation Coefficient

Summary

# Scatter plot of mpg vs. wt
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  ggtitle("MPG vs. Weight") +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

svg

How Does Sample Size Effect Correlation

# Simulate data and calculate correlation
simulate_correlation <- function(n, rho = 0.5) {
  x <- rnorm(n)
  y <- rho * x + sqrt(1 - rho^2) * rnorm(n)
  cor(x, y)
}

# Different sample sizes
sample_sizes <- seq(10, 1000, by = 10)

# Calculate correlation for each sample size
set.seed(123)
correlations <- sapply(sample_sizes, simulate_correlation)

# Create a dataframe
data_for_plot <- data.frame(sample_size = sample_sizes, correlation = correlations)

# Plotting
ggplot(data_for_plot, aes(x = sample_size, y = correlation)) +
  geom_line() +
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
  labs(title = "Effect of Sample Size on Correlation",
       x = "Sample Size",
       y = "Estimated Correlation Coefficient") +
  theme_minimal() +
  annotate("text", x = 500, y = 0.5, label = "True Correlation (0.5)", hjust = 0, color = "red")

svg

Causation Analysis Guide

library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ tibble  3.2.1     ✔ dplyr   1.1.1
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 0.5.2
✔ purrr   1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Hypothetical Scenario:

Imagine we have a dataset from examining the effect of a new educational program on student performance, where:

set.seed(123)
n <- 100
edu_data <- data.frame(
  treatment = sample(c(0, 1), size = n, replace = TRUE, prob = c(0.5, 0.5)),
  pre_test = rnorm(n, mean = 75, sd = 10),
  post_test = NA
)

# Assuming the treatment has a positive effect
edu_data$post_test[edu_data$treatment == 1] <- edu_data$pre_test[edu_data$treatment == 1] + rnorm(sum(edu_data$treatment == 1), mean = 5, sd = 5)
edu_data$post_test[edu_data$treatment == 0] <- edu_data$pre_test[edu_data$treatment == 0] + rnorm(sum(edu_data$treatment == 0), mean = 0, sd = 5)

head(edu_data)

A data.frame: 6 × 3
treatmentpre_testpost_test
<dbl><dbl><dbl>
1177.5331986.47188
2074.7145377.43050
3174.5713083.41651
4088.6860286.61432
5072.7422970.36106
6190.1647196.82572
# Calculate average improvement by group
avg_improvement <- edu_data %>%
  mutate(improvement = post_test - pre_test) %>%
  group_by(treatment) %>%
  summarize(mean_improvement = mean(improvement))

print(avg_improvement)

# A tibble: 2 × 2
  treatment mean_improvement
      <dbl>            <dbl>
1         0           -0.391
2         1            5.49
ggplot(edu_data, aes(x = factor(treatment), y = post_test - pre_test, fill = factor(treatment))) +
  geom_boxplot() +
  labs(x = "Treatment Group", y = "Improvement in Test Scores", fill = "Group") +
  theme_minimal() +
  ggtitle("Effect of Educational Program on Test Score Improvement")

svg

t.test(post_test ~ treatment, data = edu_data)

	Welch Two Sample t-test

data:  post_test by treatment
t = -4.0852, df = 97.985, p-value = 9.011e-05
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -13.50354  -4.67358
sample estimates:
mean in group 0 mean in group 1
       72.37132        81.45988