# microbiomics package lives in github (https://github.com/tvatanen/microbiomics/)
# install using install_github("tvatanen/microbiomics") (requires devtools)
library(microbiomics)

# Maaslin package lives in bitbucket (https://bitbucket.org/biobakery/maaslin/)
# install using install_bitbucket("biobakery/maaslin", ref = "default")
# (requires devtools)
# see also http://huttenhower.sph.harvard.edu/maaslin
library(Maaslin)

# Q value threshold for all results
q_value_threshold <- 0.1

# Load the metadata. The code below expects this file to provide a data frame
# named `metadata` that has a `gid_wgs` sample identifier column.
load("diabimmune_karelia_metadata.rdata")

# Read the MetaPhlAn table (bacteria only, lvl = 7)
metaphlan2_species <- read_metaphlan_table(
  "diabimmune_karelia_metaphlan_table.txt",
  kingdom = "k__Bacteria",
  lvl = 7
)

# Select and order the metadata rows according to the MetaPhlAn samples
metadata_row_order <- match(rownames(metaphlan2_species), metadata$gid_wgs)
# A sample without a metadata entry would otherwise silently become an
# all-NA metadata row
stopifnot(!anyNA(metadata_row_order))
metadata <- metadata[metadata_row_order, ]

# Abort unless the abundance table and the metadata are in the same order.
# (A bare all(...) expression is not printed when the script is run through
# source(), so a failed check would go unnoticed.)
stopifnot(all(rownames(metaphlan2_species) == metadata$gid_wgs))

# There are some subjects with no diet and/or breastfeeding information;
# exclude their samples from the association analysis
subjects_no_diet_data <- c(
  "P000675", "P000756", "P001307", "P003173", "P004230", "P004946", "P007550",
  "P007649", "P008119", "P008579", "P008633", "P008723", "P009219", "P011108",
  "P020604", "P020631", "P021343", "P021772", "P022384", "P026562", "P003191",
  "P004113", "P005558", "P009291", "P010313", "P011207", "P013823", "P015163"
)

# Filter with a logical keep-mask rather than negative indices: x[-which(...), ]
# drops EVERY row when which() returns integer(0), i.e. when none of the listed
# subjects is present in the metadata.
has_diet_data <- !(metadata$subjectID %in% subjects_no_diet_data)
# Row indices of the excluded samples (kept for reference / downstream use)
rows_no_feeding <- which(!has_diet_data)
# The same mask is applied to both tables so that they stay row-aligned
metadata <- metadata[has_diet_data, , drop = FALSE]
metaphlan2_species <- metaphlan2_species[has_diet_data, , drop = FALSE]

# Filter rare species out of the MetaPhlAn table: keep a species only when its
# abundance is greater than 0.001 in more than ten samples.
# NOTE(review): the cutoff applied by the code is 0.001 (not 0.0001) -- confirm
# the intended threshold and the abundance units of the table.
species_prevalence <- colSums(metaphlan2_species > 0.001)
metaphlan2_species <- metaphlan2_species[ , species_prevalence > 10 ]

# Recode the outlier HLA risk class 1 (N=1, one subject) as class 2
is_risk_class_one <- metadata$hla_risk_class == 1
metadata$hla_risk_class[is_risk_class_one] <- 2
# Recode outlier autoantibody counts above 2 (N=4, 4 subjects) as 2
above_two_aabs <- metadata$num_aabs > 2
metadata$num_aabs[above_two_aabs] <- 2

# Tidy the species table column names: replace every literal "." with ";",
# then remove every occurrence of "k__Bacteria;".
# gsub() is vectorised, so it is applied to all names at once.
species_names <- gsub(".", ";", colnames(metaphlan2_species), fixed = TRUE)
colnames(metaphlan2_species) <- gsub("k__Bacteria;", "", species_names, fixed = TRUE)

####################
### All samples 
####################

# Association testing between metadata and taxa on all samples
# (the samples with no diet metadata have been excluded)

variables_to_test <- c(
  "Breast_feeding_end", "Any_baby_formula", "Rice", "Wheat", "Oat", "Barley",
  "Cereal", "Root_vegetables", "Vegetables", "Eggs", "Soy", "Milk", "Meat",
  "Fruits_and_berries", "Corn", "Fish", "bf_length", "age_at_collection",
  "country", "gender", "delivery", "after_abx", "hla_risk_class",
  "seroconverted", "num_preceeding_abx", "mgx_reads_filtered"
)

# Seed the random number generator so the results can be replicated exactly
set.seed(859263)
res_all <- Maaslin.wrapper(
  metaphlan2_species,
  metadata,
  strOutputDIR = "maaslin_wgs_all",
  variables = variables_to_test,
  strRandomCovariates = c("subjectID", "mgx_pool")
)
# there are error messages on computationally singular systems for some taxa
# these are due to sparse / zero inflated taxa which will be skipped in testing 
# they do not affect the results

# output directory "maaslin_wgs_all" contains visualizations of associations (data-<variable>.pdf) 
# and full output tables (data-<variable>.txt) per metadata variable 
# (this applies for all analyses below)

# Gather, from every element of res_all, the rows whose Q value is below the
# threshold, stack them in order, and write one tab-separated table
res_all_combined <- Reduce(
  function(combined, variable) {
    result_table <- res_all[[variable]]
    significant <- which(result_table[, "Q.value"] < q_value_threshold)
    rbind(combined, result_table[significant, ])
  },
  names(res_all),
  data.frame()
)
write.table(
  res_all_combined,
  file = "diabimmune_karelia_wgs_associations_alldata.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

####################
### Samples collected during the first year of life 
####################

# Test associations between metadata and taxa using samples collected during the first year of life
# This will enable detection of associations that are prominent in this time window but dissipate later in life

# N.B. Maaslin package is not very efficient in garbage collection. This means it will fill up the memory 
# and you may have to restart your R in order to free up enough memory for another Maaslin run.
# (depending on the amount of memory, you may have to do this after each Maaslin run in this R script)

# Fewer diet variables are corrected for here than with all data, because some
# dietary factors (fruits and berries, corn, fish) have not been introduced
# during the first year
variables_to_test_firstYear <- c(
  "Breast_feeding_end", "Any_baby_formula", "Rice", "Wheat", "Oat", "Barley",
  "Cereal", "Root_vegetables", "Vegetables", "Eggs", "Soy", "Milk", "Meat",
  "age_at_collection", "country", "gender", "delivery", "after_abx",
  "hla_risk_class", "seroconverted", "num_aabs", "mgx_reads_filtered"
)

# Samples with age_at_collection below 360 form the first-year subset; the
# same mask is used for both tables so they stay row-aligned
is_first_year <- metadata$age_at_collection < 360

set.seed(859263)
res_first <- Maaslin.wrapper(
  metaphlan2_species[ is_first_year , ],
  metadata[ is_first_year , ],
  strOutputDIR = "maaslin_wgs_first",
  variables = variables_to_test_firstYear,
  strRandomCovariates = c("subjectID", "mgx_pool")
)

# Gather, from every element of res_first, the rows whose Q value is below the
# threshold, stack them in order, and write one combined tab-separated table
res_first_combined <- Reduce(
  function(combined, variable) {
    result_table <- res_first[[variable]]
    significant <- which(result_table[, "Q.value"] < q_value_threshold)
    rbind(combined, result_table[significant, ])
  },
  names(res_first),
  data.frame()
)
write.table(
  res_first_combined,
  file = "diabimmune_karelia_wgs_associations_firstyear.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

####################
### Samples collected after the first year of life 
####################

# Test associations between metadata and taxa using samples collected after the first year of life
# This will enable detection of associations that occur later in development
# Use same set of variables that was used for analysing all the samples

# Samples with age_at_collection of 360 or more form the late subset; the same
# mask is used for both tables so they stay row-aligned
is_late <- metadata$age_at_collection >= 360

set.seed(859263)
res_late <- Maaslin.wrapper(
  metaphlan2_species[ is_late , ],
  metadata[ is_late , ],
  strOutputDIR = "maaslin_wgs_late",
  variables = variables_to_test,
  strRandomCovariates = c("subjectID", "mgx_pool")
)

# Gather, from every element of res_late, the rows whose Q value is below the
# threshold, stack them in order, and write one tab-separated table
res_late_combined <- Reduce(
  function(combined, variable) {
    result_table <- res_late[[variable]]
    significant <- which(result_table[, "Q.value"] < q_value_threshold)
    rbind(combined, result_table[significant, ])
  },
  names(res_late),
  data.frame()
)
write.table(
  res_late_combined,
  file = "diabimmune_karelia_wgs_associations_late.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# Conduct the same three modeling runs (see above) with the subset of samples that have data on
# allergic sensitization and serum total IgE measurements

####################
### All samples with allergy metadata
####################

# Select the rows which have allergy metadata, i.e. no NA in any of the
# columns listed here
allergy_columns <- c(
  "totalige_log", "allergy_milk", "allergy_egg", "allergy_peanut",
  "allergy_dustmite", "totalige_high", "allergy_cat", "allergy_dog",
  "allergy_birch", "allergy_timothy"
)
# complete.cases() tests the rows directly, avoiding apply() coercing the
# data frame to a matrix first
rows_with_allergy_data <- which(complete.cases(metadata[, allergy_columns]))
metadata_allergy <- metadata[ rows_with_allergy_data , ]
# Pick the matching abundance rows by row name. as.character() guards against
# gid_wgs being a factor: a factor used as an index is treated as its integer
# level codes, which would silently select the wrong rows.
metaphlan2_species_allergy <- metaphlan2_species[ as.character(metadata_allergy$gid_wgs) , ]

# Variables tested for the samples with allergy metadata
variables_to_test_allergy <- c(
  "Breast_feeding_end", "Any_baby_formula", "Rice", "Wheat", "Oat", "Barley",
  "Cereal", "Root_vegetables", "Vegetables", "Eggs", "Soy", "Milk", "Meat",
  "Fruits_and_berries", "Corn", "Fish", "bf_length", "age_at_collection",
  "country", "gender", "delivery", "after_abx", "hla_risk_class",
  "seroconverted", "num_preceeding_abx", "totalige_log", "allergy_milk",
  "allergy_egg", "allergy_peanut", "allergy_dustmite", "allergy_cat",
  "allergy_dog", "allergy_birch", "allergy_timothy", "mgx_reads_filtered"
)

set.seed(859263)
res_allergies <- Maaslin.wrapper(
  metaphlan2_species_allergy,
  metadata_allergy,
  strOutputDIR = "maaslin_wgs_allergies",
  variables = variables_to_test_allergy,
  strRandomCovariates = c("subjectID", "mgx_pool")
)

# Gather, from every element of res_allergies, the rows whose Q value is below
# the threshold, stack them in order, and write one tab-separated table
res_allergies_combined <- Reduce(
  function(combined, variable) {
    result_table <- res_allergies[[variable]]
    significant <- which(result_table[, "Q.value"] < q_value_threshold)
    rbind(combined, result_table[significant, ])
  },
  names(res_allergies),
  data.frame()
)
write.table(
  res_allergies_combined,
  file = "diabimmune_karelia_wgs_associations_allergies.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

####################
### Samples collected during the first year of life with allergy metadata
####################

# Variables tested for the first-year samples with allergy metadata
variables_to_test_allergy_firstYear <- c(
  "Breast_feeding_end", "Any_baby_formula", "Rice", "Wheat", "Oat", "Barley",
  "Cereal", "Root_vegetables", "Vegetables", "Eggs", "Soy", "Milk", "Meat",
  "age_at_collection", "country", "gender", "delivery", "after_abx",
  "hla_risk_class", "seroconverted", "num_aabs", "totalige_log",
  "allergy_milk", "allergy_egg", "allergy_peanut", "allergy_dustmite",
  "allergy_cat", "allergy_dog", "allergy_birch", "allergy_timothy",
  "mgx_reads_filtered"
)

# Samples with age_at_collection below 360 form the first-year subset; the
# same mask is used for both tables so they stay row-aligned
is_first_year_allergy <- metadata_allergy$age_at_collection < 360

set.seed(859263)
res_allergies_first <- Maaslin.wrapper(
  metaphlan2_species_allergy[ is_first_year_allergy , ],
  metadata_allergy[ is_first_year_allergy , ],
  strOutputDIR = "maaslin_wgs_allergies_first",
  variables = variables_to_test_allergy_firstYear,
  strRandomCovariates = c("subjectID", "mgx_pool")
)

# Gather, from every element of res_allergies_first, the rows whose Q value is
# below the threshold, stack them in order, and write one tab-separated table
res_allergies_first_combined <- Reduce(
  function(combined, variable) {
    result_table <- res_allergies_first[[variable]]
    significant <- which(result_table[, "Q.value"] < q_value_threshold)
    rbind(combined, result_table[significant, ])
  },
  names(res_allergies_first),
  data.frame()
)
write.table(
  res_allergies_first_combined,
  file = "diabimmune_karelia_wgs_associations_allergies_firstyear.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

####################
### Samples collected after the first year of life with allergy metadata
####################

# The variable set is the same one used for the run with all allergy samples;
# it is assigned again here
variables_to_test_allergy <- c(
  "Breast_feeding_end", "Any_baby_formula", "Rice", "Wheat", "Oat", "Barley",
  "Cereal", "Root_vegetables", "Vegetables", "Eggs", "Soy", "Milk", "Meat",
  "Fruits_and_berries", "Corn", "Fish", "bf_length", "age_at_collection",
  "country", "gender", "delivery", "after_abx", "hla_risk_class",
  "seroconverted", "num_preceeding_abx", "totalige_log", "allergy_milk",
  "allergy_egg", "allergy_peanut", "allergy_dustmite", "allergy_cat",
  "allergy_dog", "allergy_birch", "allergy_timothy", "mgx_reads_filtered"
)

# Samples with age_at_collection of 360 or more form the late subset; the same
# mask is used for both tables so they stay row-aligned
is_late_allergy <- metadata_allergy$age_at_collection >= 360

set.seed(859263)
res_allergies_late <- Maaslin.wrapper(
  metaphlan2_species_allergy[ is_late_allergy , ],
  metadata_allergy[ is_late_allergy , ],
  strOutputDIR = "maaslin_wgs_allergies_late",
  variables = variables_to_test_allergy,
  strRandomCovariates = c("subjectID", "mgx_pool")
)

# Gather, from every element of res_allergies_late, the rows whose Q value is
# below the threshold, stack them in order, and write one tab-separated table
res_allergies_late_combined <- Reduce(
  function(combined, variable) {
    result_table <- res_allergies_late[[variable]]
    significant <- which(result_table[, "Q.value"] < q_value_threshold)
    rbind(combined, result_table[significant, ])
  },
  names(res_allergies_late),
  data.frame()
)
write.table(
  res_allergies_late_combined,
  file = "diabimmune_karelia_wgs_associations_allergies_late.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

## Optional
# Bash command to combine all WGS results in one file with comparison type as the first column:
# for f in $(ls diabimmune_karelia_wgs_associations*.txt|cut -d. -f1); do cat "$f".txt|awk -v file="$f" 'BEGIN {OFS="\t"}; {print file,$0}' >> diabimmune_karelia_wgs_associations_combined.txt; done