Reading files into R

Let's say you have a bunch of data files that you want to read into one data frame. The first step is to list the files and create an empty list to hold their contents.

# Empty container that will collect one data frame per input file
all_data <- vector("list", length = 0L)

# Paths of every file in the current directory whose name ends in ".csv"
file_list <- list.files(
  path = "./",
  pattern = "\\.csv$",
  full.names = TRUE
)

Reading the data

Let's write a loop that works its way through each file.

# Visit each file in turn. A file that has a 'haploid' column contributes a
# two-column data frame (source file name without extension, haploid values);
# a file without that column is reported on the console and contributes
# nothing. The surviving data frames are appended to all_data in file order.
all_data <- c(
  all_data,
  Filter(
    Negate(is.null),
    lapply(unname(file_list), function(path) {
      contents <- read.csv(path)
      if ("haploid" %in% names(contents)) {
        label <- tools::file_path_sans_ext(basename(path))
        return(data.frame(file = label, haploid = contents$haploid))
      }
      cat("Missing 'haploid' column in:", basename(path), "\n")
      NULL
    })
  )
)

Data Munging

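Now we need to get our data in order: combine the per-file data frames into a single data frame and keep only the rows where `haploid` is a number.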

# Stack the per-file data frames into one long data frame. If no file
# contributed anything, rbind() over an empty list would give NULL and the
# row filter below would fail, so fall back to an empty frame with the same
# two columns instead.
if (length(all_data) > 0) {
  final_df <- do.call(rbind, all_data)
} else {
  final_df <- data.frame(file = character(), haploid = numeric())
}

# Convert 'haploid' to numeric once. Entries that cannot be read as numbers
# become NA (R warns about the coercion); those rows are then removed.
final_df$haploid <- as.numeric(final_df$haploid)
## Warning: NAs introduced by coercion
final_df <- final_df[!is.na(final_df$haploid), ]

Plotting

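These are all bad examples, but you could use them as starting points. They assume the ggplot2 package has already been loaded (for example with `library(ggplot2)`), and the ridge plot also needs the ggridges package to be installed.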

# 1) Boxplot + jitter (good for comparing distributions across files)
# Files are ordered by their median haploid value. The boxplot's own outlier
# markers are switched off because the jittered points already show every
# observation.
ggplot(
  data = final_df,
  mapping = aes(
    x = reorder(file, haploid, FUN = median, na.rm = TRUE),
    y = haploid
  )
) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(size = 1, alpha = 0.5, width = 0.15) +
  labs(title = "Boxplot + jitter by file", x = "file", y = "haploid") +
  coord_flip() +
  theme_minimal()
## `geom_boxplot()` removes missing values
plot of chunk unnamed-chunk-4

plot of chunk unnamed-chunk-4

# 2) Density plot (good for looking at distribution shape)
# One semi-transparent density curve per file, all drawn on the same axes.
ggplot(
  data = final_df,
  mapping = aes(x = haploid, fill = file)
) +
  geom_density(alpha = 0.6) +
  labs(
    title = "Density plot by file",
    x = "haploid",
    y = "Density",
    fill = "File"
  ) +
  theme_minimal()
## Warning: Groups with fewer than two observations have been dropped
plot of chunk unnamed-chunk-4

plot of chunk unnamed-chunk-4

# 3) Violin plot (like a box plot but shows distribution)
# Files are ordered by their median haploid value, with the individual
# observations jittered on top of each violin.
ggplot(
  data = final_df,
  mapping = aes(
    x = reorder(file, haploid, FUN = median, na.rm = TRUE),
    y = haploid
  )
) +
  geom_violin() +
  geom_jitter(size = 1, alpha = 0.5, width = 0.1) +
  labs(title = "Violin plot by file", x = "file", y = "haploid") +
  coord_flip() +
  theme_minimal()
## `geom_violin()` removes missing values
plot of chunk unnamed-chunk-4

plot of chunk unnamed-chunk-4

# 4) Histogram faceted by file
# The bar color is a fixed value, so it is set directly on the geom. Putting
# a constant string inside aes() would map it as a data category: the bars
# would be drawn in ggplot2's default palette color (not "#FF6B6B") and a
# legend would be generated for it.
ggplot(final_df, aes(x = haploid)) +
  geom_histogram(bins = 20, fill = "#FF6B6B") +
  facet_wrap(~ file) +
  labs(x = "haploid", y = "Count", title = "Histogram by file") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot of chunk unnamed-chunk-4

plot of chunk unnamed-chunk-4

# 5) Cumulative distribution
# One empirical cumulative distribution curve per file, colored by file.
ggplot(
  data = final_df,
  mapping = aes(x = haploid, color = file)
) +
  stat_ecdf() +
  labs(
    title = "Cumulative distribution by file",
    x = "haploid",
    y = "Cumulative probability",
    color = "File"
  ) +
  theme_minimal()
plot of chunk unnamed-chunk-4

plot of chunk unnamed-chunk-4

# 6) Ridge plot using ggridges (good for looking at distributions across groups)
# One density ridge per file; the fill gradient follows the x position
# (the haploid value) along each ridge.
ggplot(
  data = final_df,
  mapping = aes(x = haploid, y = file, fill = after_stat(x))
) +
  ggridges::geom_density_ridges_gradient() +
  labs(title = "Ridge plot by file", x = "haploid", fill = "haploid") +
  theme_minimal() +
  theme(axis.title.y = element_blank())
plot of chunk unnamed-chunk-4

plot of chunk unnamed-chunk-4

Visualizations

Here is a summary table of the data

file min max mean median sd
chr10L 0.02 0.99 0.52 0.52 0.28
chr10R 0.02 0.99 0.52 0.52 0.28
chr2L 0.01 1.00 0.48 0.47 0.30
chr2R 0.01 1.00 0.50 0.51 0.29
chr3L 0.01 1.00 0.48 0.48 0.30
chr3R 0.01 1.00 0.49 0.50 0.29
chrX 0.02 0.99 0.48 0.47 0.30

Final thoughts

The key here is that you can write a loop to do things to multiple files. The structure I showed above is useful for:

  • Reading in a bunch of files
  • Filtering data
  • Merging data
  • Preprocessing

These are foundational skills for data wrangling in R.