How To Wrangle a Bunch of Data

Preliminary setup and finding the data

Our whole goal here is to make some beautiful plots, so we may as well dive into the ggplot universe.

library(ggplot2)
library(ggridges)

Let's go ahead and find the files that we are working with.

# Gather the path of every file in the working directory whose name ends in
# ".csv" (full.names = TRUE keeps the directory prefix on each entry)
file_list <- list.files("./", pattern = "\\.csv$", full.names = TRUE)

# Start with an empty list; one data frame per usable file is appended later
all_data <- vector("list", length = 0L)

Reading the data

Let's write a loop that will work its way through each file.

# Read each CSV and keep only its 'haploid' column, tagged with the name of
# the file it came from (directory and extension stripped). Files that lack
# the column are reported on the console and skipped.
for (file in file_list) {
  df <- read.csv(file)
  if (!"haploid" %in% names(df)) {
    cat("Missing 'haploid' column in:", basename(file), "\n")
    next
  }
  file_name <- tools::file_path_sans_ext(basename(file))
  # Repeat the label once per row explicitly: data.frame() will not recycle a
  # length-1 value against a zero-length column, so a header-only CSV would
  # otherwise stop the whole loop with a "differing number of rows" error.
  temp_df <- data.frame(
    file = rep(file_name, nrow(df)),
    haploid = df[["haploid"]]
  )
  all_data[[length(all_data) + 1]] <- temp_df
}

Data Munging

Now we need to get our data in order.

# Stack the per-file data frames into one long table.
final_df <- do.call(rbind, all_data)
# rbind over an empty list yields NULL; stop here with a clear message instead
# of failing later with an obscure subsetting error.
if (is.null(final_df)) {
  stop("No CSV file with a 'haploid' column was found.", call. = FALSE)
}
# Convert the column to numeric once and reuse the result. A factor column is
# converted via its labels first, because as.numeric() on a factor returns the
# internal level codes rather than the printed values. Entries that are not
# numbers become NA, which is what raises the coercion warning shown below.
haploid_raw <- final_df$haploid
if (is.factor(haploid_raw)) {
  haploid_raw <- as.character(haploid_raw)
}
haploid_num <- as.numeric(haploid_raw)

## Warning: NAs introduced by coercion

# Despite its name, `drop` is TRUE for the rows that are KEPT: those whose
# haploid value could be read as a number.
drop <- !is.na(haploid_num)
final_df <- final_df[drop, ]
final_df$haploid <- haploid_num[drop]

Plotting

These are all bad examples, but you could use them as starting points.

# 1) Boxplot + jitter (good for comparing distributions across files)
#    Files are ordered by their median haploid value. Boxplot outliers are
#    hidden because every observation is already drawn by the jitter layer.
ggplot(final_df, aes(x = reorder(file, haploid, median, na.rm = TRUE), y = haploid)) +
  geom_boxplot(outlier.shape = NA) +
  # height = 0 keeps each point at its true haploid value; when only `width`
  # is given, geom_jitter() still adds its default noise along the value axis.
  geom_jitter(width = 0.15, height = 0, alpha = 0.5, size = 1) +
  coord_flip() +
  labs(x = "file", y = "haploid", title = "Boxplot + jitter by file") +
  theme_minimal()

# 2) Violin + jitter (shows shape with individual points)
#    Files are ordered by their median haploid value. trim = FALSE lets the
#    violin tails extend past the observed range of the data.
ggplot(final_df, aes(x = reorder(file, haploid, median, na.rm = TRUE), y = haploid)) +
  geom_violin(trim = FALSE) +
  # height = 0 keeps each point at its true haploid value; when only `width`
  # is given, geom_jitter() still adds its default noise along the value axis.
  geom_jitter(width = 0.15, height = 0, alpha = 0.5, size = 1) +
  coord_flip() +
  labs(x = "file", y = "haploid", title = "Violin + jitter by file") +
  theme_minimal()

# 3) Ridgeline plot: one density curve per file, stacked along the y axis in
#    order of each file's median haploid value
ggplot(
  data = final_df,
  mapping = aes(
    y = reorder(file, haploid, FUN = median, na.rm = TRUE),
    x = haploid,
    height = after_stat(density)
  )
) +
  geom_density_ridges(alpha = 0.8, rel_min_height = 0.01, scale = 1) +
  theme_minimal() +
  ggtitle("Ridgeline density by file") +
  xlab("haploid") +
  ylab("file")

## Picking joint bandwidth of 0.803

# 4) One histogram panel per file. Each panel gets its own count axis
#    (scales = "free_y"), so files with few rows remain readable next to
#    files with many.
ggplot(data = final_df, mapping = aes(x = haploid)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ file, scales = "free_y") +
  theme_minimal() +
  ggtitle("Histogram per file (faceted)") +
  xlab("haploid") +
  ylab("count")

Things to consider

Phylogeny? Ordering of data? Transformations? Colors? Cartoons? What else?