Case 1 pleiotropy
This RMarkdown document demonstrates how key elements from the notebook for case study 1 in the EpiGraphDB paper can be achieved using the R package. For detailed explanations of the case study please refer to the paper or the case study notebook.
Context
Mendelian randomization (MR), a technique to evaluate the causal role of modifiable exposures on a health outcome using a set of genetic instruments, tacitly assumes that a genetic variant (e.g. SNP) is only related to an outcome of interest through an exposure (i.e. the “exclusion restriction criterion”). Horizontal pleiotropy, where a SNP is associated with multiple phenotypes independently of the exposure of interest, potentially violates this assumption delivering misleading conclusions. In contrast, vertical pleiotropy, where a SNP is associated with multiple phenotypes on the same biological pathway, does not violate this assumption.
Here, we use external evidence on biological pathways and protein-protein interactions to assess the pleiotropic profile for a genetic variant based on the genes (and proteins) with which that variant is associated. In a graph representation, we will show that:
- We have evidence of potential vertical pleiotropy when the proteins associated with a variant belong to one connected group,
- In contrast, if the proteins associated with a single variant are isolated from each other, this suggests evidence of horizontal pleiotropy.
This case study goes as follows:
- For a list of genes that we wish to assess their pleiotropic profiles (against a variant of interests), we retrieve from EpiGraphDB their mapped protein products.
- Then respectively from the evidence on shared pathway network and protein-protein interaction network, we can extract the proteins groups formed in these networks as connected communities to assess the pleiotropic profiles of the genes.
library("magrittr")
library("dplyr")
library("purrr")
library("glue")
library("igraph")
library("epigraphdb")
Here we configure the parameters used in the case study example. In this example we will look at the variant rs12720356, a SNP located in chromosome 19 that has been associated with Crohn’s disease and psoriasis.**
SNP <- "rs12720356"
GENELIST <- c("ZGLP1", "FDX1L", "MRPL4", "ICAM5", "TYK2", "GRAP2", "KRI1", "TMED1", "ICAM1")
PPI_N_INTERMEDIATE_PROTEINS <- 1
Mapping genes to proteins
The first step of the analysis is to map each of these genes to their protein product.
get_gene_protein <- function(genelist) {
endpoint <- "/mappings/gene-to-protein"
params <- list(
gene_name_list = genelist %>% I()
)
r <- query_epigraphdb(
route = endpoint,
params = params,
mode = "table",
method = "POST"
)
protein_df <- r
if (nrow(protein_df) > 0) {
res_df <- protein_df %>%
select(gene_name = `gene.name`, uniprot_id = `protein.uniprot_id`)
} else {
res_df <- tibble() %>% set_names(c("gene_name", "uniprot_id"))
}
res_df
}
gene_protein_df <- get_gene_protein(genelist = GENELIST)
gene_protein_df
#> # A tibble: 9 × 2
#> gene_name uniprot_id
#> <chr> <chr>
#> 1 ZGLP1 P0C6A0
#> 2 FDX1L Q6P4F2
#> 3 MRPL4 Q9BYD3
#> 4 ICAM5 Q9UMF0
#> 5 TYK2 P29597
#> 6 GRAP2 O75791
#> 7 KRI1 Q8N9T8
#> 8 TMED1 Q13445
#> 9 ICAM1 P05362
Identifying the involvement in the same biological pathways
For each protein we retrieve the pathways they are involved in.
get_protein_pathway <- function(gene_protein_df) {
endpoint <- "/protein/in-pathway"
params <- list(
uniprot_id_list = gene_protein_df %>% pull(`uniprot_id`) %>% I()
)
df <- query_epigraphdb(route = endpoint, params = params, mode = "table", method = "POST")
if (nrow(df) > 0) {
res_df <- gene_protein_df %>%
select(`uniprot_id`) %>%
left_join(df, by = c("uniprot_id"))
} else {
res_df <- gene_protein_df %>%
select(`uniprot_id`) %>%
mutate(pathway_count = NA_integer_, pathway_reactome_id = NA_character_)
}
res_df <- res_df %>%
mutate(
pathway_count = ifelse(is.na(pathway_count), 0L, as.integer(pathway_count)),
pathway_reactome_id = ifelse(is.na(pathway_reactome_id), c(), pathway_reactome_id)
)
res_df
}
pathway_df <- get_protein_pathway(gene_protein_df = gene_protein_df)
pathway_df
#> # A tibble: 9 × 3
#> uniprot_id pathway_count pathway_reactome_id
#> <chr> <int> <list>
#> 1 P0C6A0 0 <NULL>
#> 2 Q6P4F2 15 <chr [15]>
#> 3 Q9BYD3 6 <chr [6]>
#> 4 Q9UMF0 5 <chr [5]>
#> 5 P29597 28 <chr [28]>
#> 6 O75791 15 <chr [15]>
#> 7 Q8N9T8 0 <NULL>
#> 8 Q13445 0 <NULL>
#> 9 P05362 11 <chr [11]>
Now for each pair of proteins we match the pathways they have in common.
get_shared_pathway <- function(pathway_df) {
# For the protein-pathway data
# Get protein-protein permutations where they share pathways
per_permutation <- function(pathway_df, permutation) {
df <- pathway_df %>% filter(uniprot_id %in% permutation)
primary_pathway <- pathway_df %>%
filter(uniprot_id == permutation[1]) %>%
pull(pathway_reactome_id) %>%
unlist()
assoc_pathway <- pathway_df %>%
filter(uniprot_id == permutation[2]) %>%
pull(pathway_reactome_id) %>%
unlist()
intersect(primary_pathway, assoc_pathway)
}
pairwise_permutations <- pathway_df %>%
pull(`uniprot_id`) %>%
gtools::permutations(n = length(.), r = 2, v = .)
shared_pathway_df <- tibble(
protein = pairwise_permutations[, 1],
assoc_protein = pairwise_permutations[, 2]
) %>%
mutate(
shared_pathway = map2(`protein`, `assoc_protein`, function(x, y) {
per_permutation(pathway_df = pathway_df, permutation = c(x, y))
}),
combination = map2_chr(`protein`, `assoc_protein`, function(x, y) {
comb <- sort(c(x, y))
paste(comb, collapse = ",")
}),
count = map_int(`shared_pathway`, function(x) na.omit(x) %>% length()),
connected = count > 0
)
shared_pathway_df
}
shared_pathway_df <- get_shared_pathway(pathway_df)
n_pairs <- length(shared_pathway_df %>% filter(count > 0))
print(glue::glue("Num. shared_pathway pairs: {n_pairs}"))
#> Num. shared_pathway pairs: 6
shared_pathway_df %>% arrange(desc(count))
#> # A tibble: 72 × 6
#> protein assoc_protein shared_pathway combination count connected
#> <chr> <chr> <list> <chr> <int> <lgl>
#> 1 P05362 P29597 <chr [6]> P05362,P29597 6 TRUE
#> 2 P29597 P05362 <chr [6]> P05362,P29597 6 TRUE
#> 3 P05362 Q9UMF0 <chr [5]> P05362,Q9UMF0 5 TRUE
#> 4 Q9UMF0 P05362 <chr [5]> P05362,Q9UMF0 5 TRUE
#> 5 O75791 P05362 <chr [2]> O75791,P05362 2 TRUE
#> 6 O75791 P29597 <chr [2]> O75791,P29597 2 TRUE
#> 7 O75791 Q9UMF0 <chr [2]> O75791,Q9UMF0 2 TRUE
#> 8 P05362 O75791 <chr [2]> O75791,P05362 2 TRUE
#> 9 P29597 O75791 <chr [2]> O75791,P29597 2 TRUE
#> 10 Q9UMF0 O75791 <chr [2]> O75791,Q9UMF0 2 TRUE
#> # … with 62 more rows
We can further query EpiGraphDB regarding the detailed pathway
information using GET /meta/nodes/Pathway/search
.
get_pathway_info <- function(reactome_id) {
endpoint <- "/meta/nodes/Pathway/search"
params <- list(id = reactome_id)
df <- query_epigraphdb(route = endpoint, params = params, mode = "table")
df
}
pathway <- shared_pathway_df %>%
pull(shared_pathway) %>%
unlist() %>%
unique()
pathway_info <- pathway %>% map_df(get_pathway_info)
pathway_info %>% print()
#> # A tibble: 12 × 6
#> node._name node.name node._source node.id node._id node.url
#> <chr> <chr> <list> <chr> <chr> <chr>
#> 1 Immune System Immune System <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 2 Adaptive Immune … Adaptive Immune… <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 3 Signal Transduct… Signal Transduc… <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 4 Interferon Signa… Interferon Sign… <chr [1]> R-HSA-… R-HSA-9… https://rea…
#> 5 Interleukin-4 an… Interleukin-4 a… <chr [1]> R-HSA-… R-HSA-6… https://rea…
#> 6 Interleukin-10 s… Interleukin-10 … <chr [1]> R-HSA-… R-HSA-6… https://rea…
#> 7 Signaling by Int… Signaling by In… <chr [1]> R-HSA-… R-HSA-4… https://rea…
#> 8 Cytokine Signali… Cytokine Signal… <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 9 Integrin cell su… Integrin cell s… <chr [1]> R-HSA-… R-HSA-2… https://rea…
#> 10 Immunoregulatory… Immunoregulator… <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 11 Extracellular ma… Extracellular m… <chr [1]> R-HSA-… R-HSA-1… https://rea…
#> 12 Disease Disease <chr [1]> R-HSA-… R-HSA-1… https://rea…
In order to extract protein groups from the shared pathways, the last step for this query is to convert the shared pathway data into a graph where
- Proteins are represented as nodes.
- And shared pathways are represented as edges.
- Then the protein groups (via shared pathways) are the connected communities in the graph.
We then count the number of nodes in each connected community and plot the graph.
protein_df_to_graph <- function(df) {
df_connected <- df %>%
filter(connected) %>%
distinct(`combination`, .keep_all = TRUE)
nodes <- df %>%
pull(protein) %>%
unique()
graph <- igraph::graph_from_data_frame(
df_connected,
directed = FALSE, vertices = nodes
)
graph$layout <- igraph::layout_with_kk
graph
}
graph_to_protein_groups <- function(graph) {
graph %>%
igraph::components() %>%
igraph::groups() %>%
tibble(group_member = .) %>%
mutate(group_size = map_int(`group_member`, length)) %>%
arrange(desc(group_size))
}
pathway_protein_graph <- shared_pathway_df %>% protein_df_to_graph()
pathway_protein_groups <- pathway_protein_graph %>% graph_to_protein_groups()
pathway_protein_groups %>% str()
#> tibble [5 × 2] (S3: tbl_df/tbl/data.frame)
#> $ group_member:List of 5
#> ..$ 1: chr [1:5] "O75791" "P05362" "P29597" "Q6P4F2" ...
#> ..$ 2: chr "P0C6A0"
#> ..$ 3: chr "Q13445"
#> ..$ 4: chr "Q8N9T8"
#> ..$ 5: chr "Q9BYD3"
#> ..- attr(*, "dim")= int 5
#> ..- attr(*, "dimnames")=List of 1
#> .. ..$ : chr [1:5] "1" "2" "3" "4" ...
#> $ group_size : Named int [1:5] 5 1 1 1 1
#> ..- attr(*, "names")= chr [1:5] "1" "2" "3" "4" ...
png(file = "figs/case1-pathway_protein_graph.png")
plot(pathway_protein_graph)
dev.off()
#> png
#> 2
Retrieving shared protein-protein interactions
We can complement the information given by pathway ontologies with data of interactions between proteins. For each pair of proteins, we retrieve shared protein-protein interactions, either a direct interaction or an interaction where there’s one mediator protein in the middle (see parameters section).
get_ppi <- function(gene_protein_df, n_intermediate_proteins = 0) {
endpoint <- "/protein/ppi/pairwise"
params <- list(
uniprot_id_list = gene_protein_df %>% pull("uniprot_id") %>% I(),
n_intermediate_proteins = n_intermediate_proteins
)
df <- query_epigraphdb(
route = endpoint,
params = params,
mode = "table",
method = "POST"
)
if (nrow(df) > 0) {
res_df <- gene_protein_df %>%
select(protein = uniprot_id) %>%
left_join(df, by = c("protein"))
} else {
res_df <- gene_protein_df %>%
select(protein = uniprot_id) %>%
mutate(assoc_protein = NA_character_, path_size = NA_integer_)
}
res_df
}
ppi_df <- get_ppi(gene_protein_df, n_intermediate_proteins = PPI_N_INTERMEDIATE_PROTEINS)
ppi_df
#> # A tibble: 1,686 × 3
#> protein assoc_protein path_size
#> <chr> <chr> <int>
#> 1 P0C6A0 P0C6A0 2
#> 2 P0C6A0 P0C6A0 2
#> 3 Q6P4F2 Q6P4F2 2
#> 4 Q6P4F2 Q6P4F2 2
#> 5 Q6P4F2 Q6P4F2 2
#> 6 Q6P4F2 Q6P4F2 2
#> 7 Q6P4F2 Q6P4F2 2
#> 8 Q6P4F2 Q6P4F2 2
#> 9 Q6P4F2 Q6P4F2 2
#> 10 Q6P4F2 Q6P4F2 2
#> # … with 1,676 more rows
Like in the query for pathway data, we convert and analyse these results as a graph.
get_shared_ppi <- function(ppi_df) {
per_row <- function(ppi_df, x, y) {
res <- ppi_df %>%
filter(
`protein` == x,
`assoc_protein` == y
) %>%
nrow() %>%
`>`(0)
res
}
pairwise_permutations <- ppi_df %>%
pull(`protein`) %>%
unique() %>%
gtools::permutations(n = length(.), r = 2, v = .)
shared_ppi_df <- tibble(
protein = pairwise_permutations[, 1],
assoc_protein = pairwise_permutations[, 2]
) %>%
mutate(
shared_ppi = map2_lgl(`protein`, `assoc_protein`, function(x, y) {
per_row(ppi_df = ppi_df, x = x, y = y)
}),
combination = map2_chr(`protein`, `assoc_protein`, function(x, y) {
comb <- sort(c(x, y))
paste(comb, collapse = ",")
}),
connected = shared_ppi
)
shared_ppi_df
}
shared_ppi_df <- get_shared_ppi(ppi_df)
n_ppi_pairs <- shared_ppi_df %>%
filter(shared_ppi) %>%
nrow()
print(glue::glue("Num. shared_ppi pairs: {n_ppi_pairs}"))
#> Num. shared_ppi pairs: 10
shared_ppi_df
#> # A tibble: 72 × 5
#> protein assoc_protein shared_ppi combination connected
#> <chr> <chr> <lgl> <chr> <lgl>
#> 1 O75791 P05362 TRUE O75791,P05362 TRUE
#> 2 O75791 P0C6A0 FALSE O75791,P0C6A0 FALSE
#> 3 O75791 P29597 TRUE O75791,P29597 TRUE
#> 4 O75791 Q13445 FALSE O75791,Q13445 FALSE
#> 5 O75791 Q6P4F2 FALSE O75791,Q6P4F2 FALSE
#> 6 O75791 Q8N9T8 FALSE O75791,Q8N9T8 FALSE
#> 7 O75791 Q9BYD3 FALSE O75791,Q9BYD3 FALSE
#> 8 O75791 Q9UMF0 FALSE O75791,Q9UMF0 FALSE
#> 9 P05362 O75791 TRUE O75791,P05362 TRUE
#> 10 P05362 P0C6A0 FALSE P05362,P0C6A0 FALSE
#> # … with 62 more rows
ppi_protein_graph <- protein_df_to_graph(shared_ppi_df)
ppi_protein_groups <- graph_to_protein_groups(ppi_protein_graph)
ppi_protein_groups %>% str()
#> tibble [5 × 2] (S3: tbl_df/tbl/data.frame)
#> $ group_member:List of 5
#> ..$ 1: chr [1:4] "O75791" "P05362" "P29597" "Q9UMF0"
#> ..$ 5: chr [1:2] "Q8N9T8" "Q9BYD3"
#> ..$ 2: chr "P0C6A0"
#> ..$ 3: chr "Q13445"
#> ..$ 4: chr "Q6P4F2"
#> ..- attr(*, "dim")= int 5
#> ..- attr(*, "dimnames")=List of 1
#> .. ..$ : chr [1:5] "1" "5" "2" "3" ...
#> $ group_size : Named int [1:5] 4 2 1 1 1
#> ..- attr(*, "names")= chr [1:5] "1" "5" "2" "3" ...
png(file = "figs/case1-ppi_protein_graph.png")
plot(ppi_protein_graph)
dev.off()
#> png
#> 2
sessionInfo
sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.3 LTS
#>
#> Matrix products: default
#> BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] epigraphdb_0.2.3 igraph_1.2.11 glue_1.6.0 purrr_0.3.4
#> [5] dplyr_1.0.7 magrittr_2.0.1
#>
#> loaded via a namespace (and not attached):
#> [1] knitr_1.37 tidyselect_1.1.1 R6_2.5.1 rlang_0.4.12
#> [5] fastmap_1.1.0 fansi_0.5.0 httr_1.4.2 stringr_1.4.0
#> [9] tools_4.1.2 xfun_0.29 utf8_1.2.2 cli_3.1.0
#> [13] DBI_1.1.2 gtools_3.9.2 htmltools_0.5.2 ellipsis_0.3.2
#> [17] assertthat_0.2.1 yaml_2.2.1 digest_0.6.29 tibble_3.1.6
#> [21] lifecycle_1.0.1 crayon_1.4.2 vctrs_0.3.8 curl_4.3.2
#> [25] evaluate_0.14 rmarkdown_2.11 stringi_1.7.6 compiler_4.1.2
#> [29] pillar_1.6.4 generics_0.1.1 jsonlite_1.7.2 pkgconfig_2.0.3