Applying GI Detection Methods on Simulated Data

Delta Log Fold Change (dLFC) Application

Here, we show how we may apply dLFC [1] to calculate GI on simulated data, as an example to analyze the simulation output.

knitr::opts_knit$set(root.dir = getwd())
# read example simulated data
sample_name = "example_data"
pseudo_counts = 5e-07
repA_path <- here::here("data", paste0(sample_name, "_repA.csv"))
repB_path <- here::here("data", paste0(sample_name, "_repB.csv"))

# each simulation is containing two replicates A and B, and we take average of the LFC for analysis
cell_lib_guide2_A = read.csv(repA_path) %>%
  dplyr::select(-X)

cell_lib_guide2_B = read.csv(repB_path) %>%
  dplyr::select(-X) %>%
  dplyr::select(guide_id, counts_guide_t0,
                counts_guide_t1, counts_guide_t2,
                rel_freq_guide_t0, rel_freq_guide_t2, LFC)

cell_lib_guide2 = merge(cell_lib_guide2_A, cell_lib_guide2_B, by = "guide_id") %>%
  dplyr::rename(counts_guide_t0_1 = counts_guide_t0.x,
                counts_guide_t0_2 = counts_guide_t0.y,
                counts_guide_t1_1 = counts_guide_t1.x,
                counts_guide_t1_2 = counts_guide_t1.y,
                counts_guide_t2_1 = counts_guide_t2.x,
                counts_guide_t2_2 = counts_guide_t2.y,
                rel_freq_guide_t0_1 = rel_freq_guide_t0.x,
                rel_freq_guide_t0_2 = rel_freq_guide_t0.y,
                rel_freq_guide_t2_1 = rel_freq_guide_t2.x,
                rel_freq_guide_t2_2 = rel_freq_guide_t2.y,
                LFC_t2_1 = LFC.x,
                LFC_t2_2 = LFC.y) %>%
  mutate(guide1_type = ifelse(guide1_type == 1, "high", "low"),
         guide2_type = ifelse(guide2_type == 1, "high", "low"),
         construct_type = ifelse(is.na(gene2_behavior), gene1_behavior,
                                 paste0(gene1_behavior, " + ", gene2_behavior)),
         LFC_t2_1 = log2(((rel_freq_guide_t2_1 + pseudo_counts) /
                            (rel_freq_guide_t0_1 + pseudo_counts))),
         LFC_t2_2 = log2(((rel_freq_guide_t2_2 + pseudo_counts) /
                            (rel_freq_guide_t0_2 + pseudo_counts))))%>%
  mutate(LFC = (LFC_t2_1 + LFC_t2_2)/2) # aggregate the LFC between 2 replicates by averaging


# read the simulated data and prepare sample name
sim_data = cell_lib_guide2 %>%
  mutate(construct_type = ifelse(KO_type == "SKO", "SKO",
                                 ifelse(gene1_behavior == "Non-Targeting Control"
                                        | gene2_behavior == "Non-Targeting Control", "SKO", "DKO" )))

nc_gene = unique(as.character(filter(sim_data, gene1_behavior == "Non-Targeting Control")$gene1))

# calculate single mutant/knockout fitness
gene_SMF = sim_data %>%
  filter(construct_type == "SKO", gene1_behavior != "Non-Targeting Control") %>%
  group_by(gene1) %>%
  dplyr::summarise(SMF = mean(LFC))
# calculate mean LFC for gene pairs that don't contain control
gene_LFC = sim_data %>%
  filter(construct_type == "DKO") %>%
  group_by(gene_pair_id) %>%
  dplyr::summarise(LFC_mean = mean(LFC))
# calculate dLFC = LFC_mean - (SMF1 + SMF2)
sim_data_dLFC = left_join(sim_data, gene_SMF, by = "gene1") %>%
  dplyr::rename(gene1_SMF = SMF) %>%
  left_join(gene_SMF, by = c("gene2"="gene1")) %>%
  dplyr::rename(gene2_SMF = SMF) %>%
  left_join(gene_LFC, by = "gene_pair_id") %>%
  mutate(gene1_SMF = ifelse(is.na(gene1_SMF), 0, gene1_SMF),
         gene2_SMF = ifelse(is.na(gene2_SMF), 0, gene2_SMF),
         LFC_mean = ifelse(construct_type == "SKO",
                           ifelse(gene1_SMF == 0, gene2_SMF, gene1_SMF), LFC_mean)) %>%
  mutate(dLFC = LFC_mean - (gene1_SMF+gene2_SMF))

# add data frame for the non-control group
control = sim_data_dLFC %>% filter(gene1 %in% nc_gene | gene2 %in% nc_gene)
sim_dLFC_noncontrol = sim_data_dLFC %>% anti_join(control, by = "gene1_gene2_id")
# add data frame for the non-interacting group
control0 = sim_data_dLFC %>% filter(interaction_gene == 0)
sim_dLFC_noncontrol0 = sim_data_dLFC %>% anti_join(control0, by = "gene1_gene2_id")

# compute correlation and p-values
cor_test_all <- cor.test(sim_data_dLFC$interaction_gene,
                         sim_data_dLFC$dLFC, method = "pearson")
cor_test_noncontrol <- cor.test(sim_dLFC_noncontrol$interaction_gene,
                                sim_dLFC_noncontrol$dLFC, method = "pearson")
cor_test_noncontrol0 <- cor.test(sim_dLFC_noncontrol0$interaction_gene,
                                 sim_dLFC_noncontrol0$dLFC, method = "pearson")
# show correlation value
print(cor_test_all)
print(cor_test_noncontrol)
print(cor_test_noncontrol0)

Visualizations

Then you may visualize the calculated GI from dLFC on scatterplots. An example code for plotting might be constructed as follows:

library(ggplot2)
library(ggtext)

# define a function to format p-values if p-values are extremely small
format_p <- function(pval) {
  if (is.na(pval)) return("NA")
  if (pval < 1e-16) {
    return("< 1e-16")
  } else {
    return(formatC(pval, format = "e", digits = 2))
  }
}

# plot the scatterplot
xlim = c(-8,8)
ylim = c(-20,20)

theme_text = theme(
  plot.title = element_markdown(size = 8, face = "bold"),   # Title size
  axis.title.x = element_text(size = 10),  # X-axis title
  axis.title.y = element_text(size = 10),  # Y-axis title
  axis.text.x = element_text(size = 10),   # X-axis text
  axis.text.y = element_text(size = 10),   # Y-axis text
  legend.title = element_text(size = 10),  # Legend title
  legend.text = element_text(size = 10),   # Legend text
  strip.text = element_text(size = 15)     # Facet labels
)
scatterplot_dlfc = ggplot(sim_data_dLFC, aes(x = interaction_gene, y = dLFC)) +
  geom_point(alpha = 0.7, color = "black") +
  geom_point(data = control0, aes(x = interaction_gene, y = dLFC),
             size = 3, shape = 17, color = "blue", fill = "yellow", stroke = 1.5) +
  geom_point(data = control, aes(x = interaction_gene, y = dLFC),
             size = 3, shape = 17, color = "#f76d84", fill = "yellow", stroke = 1.5) +
  # Regression line for all data
  geom_smooth(method = "lm", col = "black", se = FALSE, linetype = "solid") +
  # Regression line for non-control data
  geom_smooth(data = sim_dLFC_noncontrol, aes(x = interaction_gene, y = dLFC),
              method = "lm", col = "#f76d84", se = FALSE, linetype = "dashed") +
  # Regression line for non-zero genetic interactions data
  geom_smooth(data = sim_dLFC_noncontrol0, aes(x = interaction_gene, y = dLFC),
              method = "lm", col = "blue", se = FALSE, linetype = "dashed") +
  ggtitle(paste0(
  "Scatter Plot for dLFC vs. Simulated GI:<br>",
  "Correlation r = ", round(cor_test_all$estimate, 3),
  ", *p* ", format_p(cor_test_all$p.value), "<br>",
  "Correlation r (w/o non-targeting controls) = <span style='color:red;'>", round(cor_test_noncontrol$estimate, 3),
  "</span>, *p* ", format_p(cor_test_noncontrol$p.value), "<br>",
  "Correlation r (w/o non-interacting gene pairs) = <span style='color:#0404a0;'>", round(cor_test_noncontrol0$estimate, 3),
  "</span>, *p* ", format_p(cor_test_noncontrol0$p.value))) +
  labs(color = "Gene-gene Interaction Type") +
  guides(color = guide_legend(override.aes = list(size = 5))) +
  ylim(ylim[1],ylim[2])+
  xlim(xlim[1],xlim[2])+
  theme_minimal() +
  xlab("Simulated Genetic Interaction") +
  ylab("Delta Log Fold Change (dLFC)") +
  # Add annotation label for control points
  annotate("text", x = xlim[1],
           y = ylim[2]-3,
           label = "PINK: Data w/ gene pairs containing Non-Targeting Control",
           color = "#f85470", size = 3, hjust = 0, fontface = "bold") +
  # Add annotation label for non-interacting points
  annotate("text", x = xlim[1],
           y = ylim[2]-1.5,
           label = "BLUE: Data w/ non-interacting gene pairs",
           color = "blue", size = 3, hjust = 0, fontface = "bold")  +
  theme_text

# show the plot
options(repr.plot.width = 8, repr.plot.height = 10)
scatterplot_dlfc

The example scatterplot is shown below:

Example dLFC scatterplot

Check relevant details in the pre-built DKOsimR vignettes (PDF) Section 6.

References

[1] Dede, M., McLaughlin, M., Kim, E. et al. Multiplex enCas12a screens detect functional buffering among paralogs otherwise masked in monogenic Cas9 knockout screens. Genome Biol 21, 262 (2020). https://doi.org/10.1186/s13059-020-02173-2.