# last updated 2024-09-19 by GideonE
#______ Summary Information for 'Decoding the Amazon' ________#
#%%%%%%%% Install then load required packages
library(dplyr)
library(ggplot2)
library(plyr)
library(reshape2)
library(readr)
library(tidyr)
#%%%%%%%% Import all related metadata data
barcodes <- read.csv("lab_final.csv",header = TRUE) # sample sheet from the molecular laboratory
taxa <- read.csv("taxonomy_final.csv",header = TRUE) # taxonomy data from BOLD system
collection <- read.csv("collection_data_final.csv", header = TRUE) # collection data fro BOLD system
#%%%%%%%% Merge data frames together, 2 at a time, and then perform some basic data clean-up
alldata <- merge(barcodes,collection, by="Sample.ID")
alldata <- merge(alldata,taxa, by="Sample.ID")
colnames(alldata) # show column names of combined data
# "Green Lab" site was stored in the "Extra.info" column, move this to institution variable
table(alldata$Extra.Info) # view all unique values in 'Extra.info'
table(alldata$Institution) # view all unique values in 'Institution'
alldata[alldata$Extra.Info=="GreenLab",]$Institution<-"Green Lab"
table(alldata$Institution) # check 'Institution' again for 'Green Lab'
## remove variables that are not wanted for data summaries
names(alldata)
patterns <- c("Trace", "Accession", "Email", "Stage", "Image","Contam","Tribe","Depth","GPS","Event","Coord") # define search patterns for columns to remove
regex_pattern <- paste(patterns, collapse = "|") # insert patterns into a list of "OR" regex expressions
cols_to_remove <- grep(regex_pattern, names(alldata), value = TRUE) # create vector of column names that match any of the patterns
alldata2 <- alldata %>% select(-one_of(cols_to_remove)) # remove columns that match any of the names in the vector
nrow(alldata2)
#remove all unwanted rows, mainly samples from the coast, to keep data set to Amazonian secies.
# Define the Sample.ID values to be removed, all that are nonAmazonian
ids_to_remove <- c("TUNKI-0138", "TUNKI-0139", "TUNKI-0140", "TUNKI-233", "TUNKI-0012",
"TUNKI-0079", "TUNKI-0080", "TUNKI-0081", "TUNKI-0082", "TUNKI-0083",
"TUNKI-0094", "TUNKI-0095", "TUNKI-0264", "TUNKI-0016", "TUNKI-0233")
alldata2 <- alldata2 %>% filter(!Sample.ID %in% ids_to_remove) # remove target samples
nrow(alldata2) #count number of rows remaining in data set
## remove all values of "0[n]", replace with zeros
# Function to replace "0[n]" with blank
replace_zero_n_brackets <- function(x) {
gsub("0\\[n\\]", "0", x)
}
# Apply the function to all columns of the data frame
alldata2 <- alldata2 %>% mutate_all(~ replace_zero_n_brackets(.))
#%%%%%%%% Data Summaries
nrow(alldata2) # counter records in data frame
names(alldata2) # show all variable names in data frame
## correctly identified (field identification versus genetic reference identity)
alldata2$IDmatching<-ifelse(alldata2$Identification == alldata2$Species,1,0)
length(alldata2$IDmatching)
table(alldata2$IDmatching)
write.csv(alldata2[alldata2$IDmatching==0,],"mismatches.csv",row.names = F) # export mismatches to CVS file
## sequences by state department
write.csv(ddply(alldata2,.(State.Province),nrow),"state.department.csv")
## sequences by order
orders<-ddply(alldata2,.(Order),nrow) # count number of sequences per Order
names(orders)[2]<-"count" # correct variable name
write.csv(ddply(alldata2,.(Order),nrow),"order.csv") # export sequence count by Order
## pie charts of sequences by Order for each taxonomic Class
class.orders<-ddply(alldata2,.(Class,Order),nrow) # count sequences by Order by Class
names(class.orders)[3]<-"count" # correct variable name
# create list of data frames for each Class
class.orders_l<-split(class.orders,class.orders$Class)
# function to generate pie chart
generate_pie_chart <- function(df) {
df <- df %>%
arrange(desc(count)) %>%
mutate(
prop = count / sum(count) * 100,
Order = paste(Order, "(", round(prop, 1), "%)", sep = "")
)
# Determine number of legend columns
legend_cols <- ifelse(nrow(df) > 6, 2, 1)
ggplot(df, aes(x = "", y = prop, fill = Order)) +
geom_bar(stat = "identity", width = 1, color = "white") +
coord_polar("y", start = 0) +
theme_void() +
scale_fill_viridis_d() +
labs(title = paste(unique(df$Class)), fill = "Order") +
guides(fill = guide_legend(ncol = legend_cols))
}
# run function across all objects in the list of data frames
plots <- lapply(class.orders_l, generate_pie_chart)
# export each plot in turn
ggsave(filename = "plot1.png",plot = plots[[1]], device = 'png',width = 800, height = 700, dpi = 150, units = "px", bg="white")
ggsave(filename = "plot2.png",plot = plots[[2]], device = 'png',width = 800, height = 700, dpi = 150, units = "px", bg="white")
ggsave(filename = "plot3.png",plot = plots[[3]], device = 'png',width = 800, height = 700, dpi = 150, units = "px", bg="white")
ggsave(filename = "plot4.png",plot = plots[[4]], device = 'png',width = 800, height = 700, dpi = 150, units = "px", bg="white")
ggsave(filename = "plot5.png",plot = plots[[5]], device = 'png',width = 800, height = 700, dpi = 150, units = "px", bg="white")
# display all plots in single figure
grid.arrange(grobs = plots, ncol = 2)
## sequences by family
write.csv(ddply(alldata2,.(Family),nrow),"family.csv")
## sequences by subfamily
nrow(ddply(alldata2,.(Subfamily),nrow))
## sequences by genus
nrow(ddply(alldata2,.(Genus),nrow))
## sequences by species
nrow(ddply(alldata2,.(Species),nrow))
## sequences by subspecies
nrow(ddply(alldata2,.(Subspecies),nrow))
## Table 4: summary table per gene marker
names(alldata2) # show variable names of full data set
ddply(alldata2,.(Class),nrow) # number of individuals per Class
names(alldata2)[7:12]<-c("RBCL","18S","CO1","MATK","CYTB","TRNH-PSBA") # correct gene marker names
unique(alldata2$Class) # display all unique Class values
alldata3<-alldata2[c(1,39,44,7:12)] # make filtered, working copy of data set
names(alldata3) # show varible names of filtered dataset
# convert all gene markeres to binary 1 (present) 0 (absent)
alldata3[4:9]<-alldata3[4:9] %>% mutate_all(~ifelse(is.na(.), NA, ifelse(. != 0, 1, 0)))
alldata3.melt<-melt(alldata3,id.vars = c("Sample.ID","Class","Species")) # transform filtered data to long format
alldata3.melt<-alldata3.melt[alldata3.melt$value>0,] # remove all records with a sequence
alldata3_class<-ddply(alldata3.melt,.(Class),nrow);alldata3_seq # count number of sequences per Class
alldata3_gene_seq<-ddply(alldata3.melt,.(Class,variable),nrow);alldata3_seq # count number of sequences per gene per Class
alldata3_class<-ddply(alldata3.melt,.(Class,Species),nrow);alldata3_class # count species per Class
alldata3_sum1<-ddply(alldata3.melt,.(Class,Species,variable),summarize,seq_num=sum(value));alldata3_sum1 # count sequences per species per gene per Class
alldata3_sum2<-ddply(alldata3_sum1,.(Class,variable),nrow);alldata3_sum2 # species per gene per class
#%%%%%%%% complimenting the Mammal gap analysis (Pacheco, V. et al. Disproportion between the Peruvian Amazonian megadiverse mammalian fauna and the available molecular information. Zoologia 41, e23110 (2024).)
gap_mammals<-read.csv('Gaps_mammals.csv') # import data table from publication
names(gap_mammals) # show variable names
gap_mammals<-gap_mammals[-9] # filter unwanted variable
names(gap_mammals)[5:8]<-c("peru_12S","peru_CO1","peru_CYTB","peru_MIT");names(gap_mammals) # rename gene marker variables with 'peru' prefix
gap_mammals<-gap_mammals %>% mutate_at(vars(names(gap_mammals)[5:8]), ~replace(., . == "N", "0")) # convert gene markers to binary variables
gap_mammals<-gap_mammals %>% mutate_at(vars(names(gap_mammals)[5:8]), as.integer) # ensure gene markers variable is an integer
gap_mammals$peru_prior_seq <- rowSums(gap_mammals[5:8]) # sum across all available gene markers per species and store in new column
gap_mammals <- gap_mammals %>% mutate(Species = gsub('"', '', Species)) # remove quotation marks
gap_mammals[c(5:9)] <- gap_mammals[c(5:9)] %>% mutate_all(~ifelse(. != "0", "1", "0")) # convert data to binary again
gap_mammals[c(5:9)] <- gap_mammals[c(5:9)] %>% mutate_all(as.integer) # ensure binary gene marker values are integers
colSums(gap_mammals[c(5:9)]) # count sequence records for each column (last column indicates any sequence record for a species)
names(alldata2) # show variable names of new data set
alldata2m<-alldata2[alldata2$Class=="Mammalia",c(1,39,43:44,7:12)];names(alldata2m) # filter to only mammals and variables of interest, and make new working data frame
names(alldata2m)[5:10]<-c("RBCL","g18S","CO1","MATK","CYTB","TRNH");names(alldata2m) # correct gene marker names
alldata2m[5:10] <- alldata2m[5:10] %>% mutate_all(~ifelse(. != "0", "1", "0")) # make gene marker values binary
alldata2m[5:10] <- alldata2m[5:10] %>% mutate_all(as.integer) # ensure binary gene marker values are integers
# count number of sequences per gene marker by species
alldata2m<-ddply(alldata2m,.(Class,Genus,Species),summarize,
us_RBCL=sum(RBCL,na.rm=T),
us_18S=sum(g18S,na.rm=T),
us_CO1=sum(CO1,na.rm=T),
us_MATK=sum(MATK,na.rm=T),
us_CYTB=sum(CYTB,na.rm=T),
us_TRNH=sum(TRNH,na.rm=T))
names(alldata2m) # check variable names
alldata2m[4:9]<-alldata2m[4:9] %>% mutate_all(~ifelse(. >= 1, 1, .)) # convert gene marker variabales to binary values
names(alldata2m) # check variable names
alldata2m$peru_seq_new<-rowSums(alldata2m[4:9]) # create new variable of gene markers generated per species
mmerge2<-merge(alldata2m,gap_mammals,by = "Species", all=T) # merge new and prior data sets
colSums(mmerge2[18],na.rm = T)
table(mmerge2$Species) # count number of all species in merged data
mmerge2<-mmerge2[mmerge2$Species!="",] # remove empty rows
# following command will count how many mammals of the prior data list were included in this study
table(mmerge2$Class, useNA = "ifany")
names(mmerge2) # show variable names of merged data
mmerge2[c(4:10,14:18)] <- mmerge2[c(4:10,14:18)] %>% mutate_all(~replace_na(., 0)) # replace NA values with 0s
mmerge2[c(4:10,14:18)] <- mmerge2[c(4:10,14:18)] %>% mutate_all(~ifelse(. > 1, 1, .)) # make all values binary
## calculate how many new sequences per species per gene
names(mmerge2) # show variable names of merged data
#how many species that had no prior genetic record
nrow(gap_mammals[gap_mammals$peru_prior_seq==0,]) # 146 total
# calculate several new values per species
mmerge2 <- mmerge2 %>%
mutate(
CO1_diff=ifelse(us_CO1 == 1 & peru_CO1 == 0,1,0), # value of 1 is we generated a COI record that did not previously exist
CYTB_diff=ifelse(us_CYTB == 1 & peru_CYTB == 0,1,0), # value of 1 if we generated a cytB record that did not previously exit
new_species_ref=ifelse(peru_seq_new ==1 & peru_prior_seq == 0,1,0) # value of 1 if we generated a sequence record for a species that had none
)
names(mmerge2)
# sum values for all variables to count total number of new gene and species records
colSums(mmerge2[c(4:10,14:21)])
new_mammals<-mmerge2[mmerge2$new_species_ref==1,] # subset to just mammals in which a first gene record was generated by this study
colSums(new_mammals[c(4:10,14:21)]) # sum values for all variables
#%%%%%%%% Complimenting the Bird gap analysis (Arana, A. et al. Lack of local genetic representation in one of the regions with the highest bird species richness, the Peruvian Amazonia. PLoS One 19, e0296305 (2024).)
gap_bird<-read.csv('Gaps_aves.csv') # import data table from publication
gap_bird2<-gap_bird # create working data frame
names(gap_bird2) # show variable names
names(gap_bird2)[5:10]<-c("Amazonian_birds","CO1_peru","CO1_world","CYTB_world","18S_world","12S_world");names(gap_bird2) # rename gene marker variables with 'peru' prefix
gap_bird2<-gap_bird2[!is.na(gap_bird2$Amazonian_birds),] # filter to only Amazonian birds
table(gap_bird2$CO1_world) # show table of records in 'CO1_workd" variable
gap_bird2$CO1_world[is.na(gap_bird2$CO1_world)]<-"N" # insert "N" for all records that are NA
gap_bird2$CO1_world[gap_bird2$CO1_world %in% c("public","private")]<-"K" # replace public and private values with "K"
gap_bird2<-gap_bird2 %>% mutate_at(vars("CO1_peru","CO1_world","CYTB_world","18S_world","12S_world"), ~ifelse(is.na(.) | . != "K", 0, 1)) # make all gene marker variables binary values
names(gap_bird2) # show variable names
colSums(gap_bird2[6:10]) # sum values for all gene marker variables in prior data set
names(alldata2) # show variable names of new data set
alldata2b<-alldata2[alldata2$Class=="Aves",c(1,39,44,7:12)];names(alldata2b) # filter to only birds and variables of interest, and make new working data frame
names(alldata2b)[4:9]<-c("RBCL","g18S","CO1","MATK","CYTB","TRNH");names(alldata2b) # correct gene marker names
alldata2b[4:9] <- alldata2b[4:9] %>% mutate_all(~ifelse(. != "0", "1", "0")) # make gene marker values binary
alldata2b[4:9] <- alldata2b[4:9] %>% mutate_all(as.integer) # ensure binary gene marker values are integers
# count number of sequences per gene marker by species
alldata2b<-ddply(alldata2b,.(Class,Species),summarize,
us_RBCL=sum(RBCL,na.rm=T),
us_18S=sum(g18S,na.rm=T),
us_CO1=sum(CO1,na.rm=T),
us_MATK=sum(MATK,na.rm=T),
us_CYTB=sum(CYTB,na.rm=T),
us_TRNH=sum(TRNH,na.rm=T))
alldata2b[3:8]<-alldata2b[3:8] %>% mutate_all(~ifelse(. >= 1, 1, .)) # convert gene marker variabales to binary values
names(alldata2b) # check variable names
alldata2b$peru_new_seq<-rowSums(alldata2b[3:8]) # create new variable of gene markers generated per species
bmerge<-merge(alldata2b, gap_bird2, by.x="Species", by.y="Scientific_name", all=T) # merge new and prior data sets
bmerge<-bmerge[bmerge$Species!="",] #remove blank records after merge
# how many birds and mammals did we cover of the existing species list
table(bmerge$Class, useNA = "ifany")
names(bmerge) # check variable names
bmerge[c(3:9,14:18)] <- bmerge[c(3:9,14:18)] %>% mutate_all(~replace_na(., 0)) # replace NA values with 0s
bmerge$world_prior_seq <- rowSums(bmerge[14:18]);names(bmerge) # sum values across prior sequence records per species
bmerge[c(3:9,14:19)] <- bmerge[c(3:9,14:19)] %>% mutate_all(~ifelse(. > 1, 1, .)) # convert all numerical values to binary
names(bmerge) # check variable names
# calculate several new values per species
bmerge <- bmerge %>%
mutate(
CO1peru_diff=ifelse(us_CO1 == 1 & CO1_peru == 0,1,0), # value of 1 if new COI record from Peruvian specimen generated by this study
CO1world_diff=ifelse(us_CO1 == 1 & CO1_world == 0,1,0), # value of 1 if new CO1 record for each species was generated by this study
CYTBworld_diff=ifelse(us_CYTB == 1 & CYTB_world == 0,1,0), # value of 1 if new cytB record for each species was generated by this study
x18Sworld_diff=ifelse(us_18S == 1 & '18S_world' == 0,1,0), # value of 1 if new 18S record for each species was generated by this study
new_species_ref=ifelse(peru_new_seq == 1 & world_prior_seq == 0,1,0) # value of 1 if first gene marker reference for each species was generated by this study
)
names(bmerge)
# sum values for all variables to count total number of new gene and species records
colSums(bmerge[c(3:9,14:24)])
# subset to just mammals in which a first gene record was generated by this study
new_birds<-bmerge[bmerge$new_species_ref==1,]
# sum values for all variables
colSums(new_birds[c(3:9,14:24)])