# Script to rarefy OTU tables and calculate Raup-Crick and bNTI
# Modified from https://github.com/stegen/Stegen_etal_ISME_2013/blob/master/bNTI_Local_Machine.r; Stegen et al, 2013
# Modified from https://github.com/stegen/Stegen_etal_ISME_2013/blob/master/Raup_Crick_Abundance.r; Stegen et al, 2013
# RED 2017; [email protected]
local = 1 # 1 = running on the local machine, 0 = running on the server
micron = 1 # Whether to include the 0.1um fraction (1 = include, 0 = exclude)
library(vegan)   # vegdist, rrarefy
library(picante) # comdistnt, match.phylo.data, taxaShuffle; loads ape (read.tree)
library(doMC)    # Parallel backend for foreach/%dopar%
is.wholenumber <- function(x, tol = .Machine$double.eps^0.5) abs(x - round(x)) < tol # Needed because averaging replicates can produce half reads, which cannot exist as real counts
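# For example, is.wholenumber(c(2, 2.5, 3)) returns TRUE FALSE TRUE, flagging the half-reads produced by averaging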
rc.reps = 9999 # Number of null randomizations for the Raup-Crick calculation
registerDoMC(cores = 1) # Number of parallel workers used by foreach/%dopar%
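# Note (suggestion, not part of the original run): to actually run the null model in parallel,
# register more workers here, e.g. registerDoMC(cores = parallel::detectCores() - 1)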
###################################
#### Data Loading and cleaning ####
###################################
if(local==1){
setwd("/Users/danczak.6/Documents/Wilkins Lab/Ohio EPA Project/Sequencing Data/16S Data/") # Working directory for 16S comm on my local computer
data = read.delim("Raw_EPA_no_pynast.txt") # Importing the raw OTU data
tree = read.tree("rep_set.tre")
factors = read.csv("OTU_table_factors.csv") # Importing factors to make the averaging of replicates possible
} else {
setwd("/home/rdanczak/EPA_16S_Analysis/No_Pynast_bNTI") # Working directory for 16S comm on the server
data = read.delim("Raw_EPA_no_pynast.txt") # Importing the raw OTU data
tree = read.tree("Raw_EPA_OTU_Tree.tre") # Importing the raw OTU tree
factors = read.csv("OTU_table_factors.csv") # Importing factors to make the averaging of replicates possible
}
row.names(data) = data[,1] # Setting the first column as the row names and then removing it
data = data[,-1]
data = data[,-which(colnames(data) %in% c("YB04","BL01","BL02"))] # Removing YB04 because it was a failed sequencing run, along with BL01 and BL02
# Averaging the replicates for our sequencing endeavors
row.names(factors) = factors[,1] # This and the next two lines reformat the factors table into a more useful form
factors = factors[,-1]
factors = factors[-which(row.names(factors) %in% c("YB04","BL01","BL02")),]
########################################
#### Averaging my duplicate results ####
########################################
x = as.character(unique(factors[,1])) # Storing the unique factor levels in a variable
data.avg = matrix(nrow = length(data[,1]), ncol = length(x)) # Creating an empty matrix to hold the averaged data
for(i in 1:length(x)){
w = which(factors[,1] %in% x[i])
if(length(w)==1){
data.avg[,i] = data[,w]
} else {
data.avg[,i] = apply(data[,w], 1, mean)
}
} # This loop averages the replicate columns sharing each factor level (single columns are copied as-is)
colnames(data.avg) = x
row.names(data.avg) = row.names(data)
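# Optional sanity check (illustrative suggestion, not part of the original script): each
# averaged column should equal the mean of its replicate columns, e.g.
# all.equal(unname(data.avg[, x[1]]), unname(rowMeans(data[, which(factors[,1] %in% x[1]), drop = FALSE])))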
############################
#### Running Raup-Crick ####
############################
# Removing the 0.1um filters from the dataset
if(micron==0){
data.avg = data.avg[,-grep("0.1", colnames(data.avg))]
zero.members = apply(data.avg, 1, max) # Finding the maximum number of sequences per OTU
data.avg = data.avg[-which(zero.members==0),]
}
# Setting up the data to mesh with the Stegen et al. code
spXsite = t(data.avg)
spXsite[which(is.wholenumber(spXsite) == FALSE)] = spXsite[which(is.wholenumber(spXsite) == FALSE)]+0.5 # Rounding half-reads up to whole reads - a fractional count still means the OTU was present at least once
# Count number of sites and total species richness across all plots (gamma)
n_sites = nrow(spXsite)
gamma = ncol(spXsite)
# Build a site by site matrix for the results, with the names of the sites in the row and col names:
results = matrix(data=NA, nrow=n_sites, ncol=n_sites, dimnames=list(row.names(spXsite), row.names(spXsite)))
# Convert the spXsite matrix into a new presence/absence matrix:
spXsite.inc = ceiling(spXsite/max(spXsite))
# Create an occurrence vector, used to give more weight to widely distributed species in the null model
occur = apply(spXsite.inc, MARGIN=2, FUN=sum)
# Create an abundance vector, used to give more weight to abundant species in the second step of the null model
abundance = apply(spXsite, MARGIN=2, FUN=sum)
# Loops through every pairwise comparison, generating null results
for(null.one in 1:(nrow(spXsite)-1)){
for(null.two in (null.one+1):nrow(spXsite)){
null_bray_curtis<-NULL
null_bray_curtis = foreach(i=1:rc.reps, .packages = c("vegan","picante")) %dopar% {
# Generates two empty communities of size gamma
com1<-rep(0,gamma)
com2<-rep(0,gamma)
# Add observed number of species to com1, weighting by species occurrence frequencies
com1[sample(1:gamma, sum(spXsite.inc[null.one,]), replace=FALSE, prob=occur)]<-1
com1.samp.sp = sample(which(com1>0), (sum(spXsite[null.one,])-sum(com1)), replace=TRUE, prob=abundance[which(com1>0)]);
com1.samp.sp = cbind(com1.samp.sp,1);
com1.sp.counts = as.data.frame(tapply(com1.samp.sp[,2], com1.samp.sp[,1], FUN=sum)); colnames(com1.sp.counts) = 'counts'; # head(com1.sp.counts);
com1.sp.counts$sp = as.numeric(rownames(com1.sp.counts));
com1[com1.sp.counts$sp] = com1[com1.sp.counts$sp] + com1.sp.counts$counts;
#sum(com1) - sum(spXsite[null.one,]); # This should be zero if everything worked properly
rm('com1.samp.sp','com1.sp.counts');
# Again for com2
com2[sample(1:gamma, sum(spXsite.inc[null.two,]), replace=FALSE, prob=occur)]<-1
com2.samp.sp = sample(which(com2>0), (sum(spXsite[null.two,])-sum(com2)), replace=TRUE, prob=abundance[which(com2>0)]);
com2.samp.sp = cbind(com2.samp.sp,1);
com2.sp.counts = as.data.frame(tapply(com2.samp.sp[,2], com2.samp.sp[,1], FUN=sum)); colnames(com2.sp.counts) = 'counts'; # head(com2.sp.counts);
com2.sp.counts$sp = as.numeric(rownames(com2.sp.counts));
com2[com2.sp.counts$sp] = com2[com2.sp.counts$sp] + com2.sp.counts$counts;
# sum(com2) - sum(spXsite[null.two,]); # This should be zero if everything worked properly
rm('com2.samp.sp','com2.sp.counts');
null.spXsite = rbind(com1,com2); # Combining the two null communities into a null site-by-species matrix
# Calculates the null Bray-Curtis; this value is returned as the result of the foreach iteration
vegdist(null.spXsite, method='bray');
}; # End of the null loop
# Collapsing the list returned by foreach into a numeric vector
null_bray_curtis = unlist(null_bray_curtis)
# Calculates the observed Bray-Curtis
obs.bray = vegdist(spXsite[c(null.one,null.two),], method='bray');
# How many null observations is the observed value tied with?
num_exact_matching_in_null = sum(null_bray_curtis==obs.bray);
# How many null values are smaller than the observed *dissimilarity*?
num_less_than_in_null = sum(null_bray_curtis<obs.bray);
rc = ((num_less_than_in_null + (num_exact_matching_in_null)/2)/rc.reps) # This variation of rc splits ties
rc = (rc-.5)*2 # Adjusts the range of the Raup-Crick calculation to -1 to 1
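# Worked example: with 7,000 of the 9,999 null values below the observed dissimilarity and 100 ties,
# rc = (7000 + 100/2)/9999 = about 0.705, which rescales to (0.705 - 0.5)*2 = about 0.41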
results[null.two,null.one] = round(rc,digits=2); # Stores rc into the results matrix
print(c(null.one,null.two,date())); # Keeps track of position
}; # End of inner loop
}; # End of outer loop
rc.results = as.dist(results) # Converts results into a distance matrix
write.csv(as.matrix(rc.results), "RC_BC_MC_Results.csv", quote = F)
rm('spXsite.inc')
####################################
#### Beginning the bNTI Process ####
####################################
# Transposing data because the rrarefy command doesn't like samples in columns
# alt.data = as.data.frame(t(data.avg))
# alt.data[which(is.wholenumber(alt.data) == FALSE)] = alt.data[which(is.wholenumber(alt.data) == FALSE)]+0.5
# Rarefying the data because the bNTI calculation does not seem to tolerate much more than 20,000 OTUs; specifically, the error occurs in cophenetic
data.rare = as.data.frame(rrarefy(spXsite, 8000)) # Rarefying each sample to 8,000 sequences, the depth of the sample with the lowest sequence count
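# Note (suggestion, not part of the original script): rrarefy() subsamples randomly, so calling
# set.seed() before this step (e.g. set.seed(123)) makes the rarefied table reproducible across runs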
# Determining OTUs which have zero abundance
zero.members = apply(data.rare, 2, max) # Finding the maximum number of sequences per OTU
data.rare = data.rare[,-which(zero.members==0)] # Removing those OTUs which have 0 sequences after rarefaction
# Matching the tree to the newly rarefied OTU dataset
phylo = match.phylo.data(tree,t(data.rare))
# Calculating bMNTD for my samples
bMNTD = as.matrix(comdistnt(t(phylo$data), cophenetic(phylo$phy), abundance.weighted = T))
# Calculating the bMNTD for 999 random distributions
bMNTD.rand = array(c(-999),dim=c(ncol(phylo$data),ncol(phylo$data),999)) # Creating 999 'dummy' matrices to put random results into
for(i in 1:999){
bMNTD.rand[,,i] = as.matrix(comdistnt(t(phylo$data),taxaShuffle(cophenetic(phylo$phy)),abundance.weighted = T,exclude.conspecifics = F))
print(c(date(),i)) # Tracking how far the loop has progressed
} # Performing the bMNTD calculation using the OTU table but with randomized taxonomic affiliations
# Calculating bNTI by comparing the empirical bMNTD values to the randomized distributions
bNTI = matrix(c(NA),nrow=ncol(phylo$data),ncol=ncol(phylo$data))
for(i in 1:(ncol(phylo$data)-1)){
for(j in (i+1):ncol(phylo$data)){
m = bMNTD.rand[j,i,] # Collecting all 999 randomized bMNTD values for this pair of samples
bNTI[j,i] = ((bMNTD[j,i]-mean(m))/sd(m)) # The bNTI calculation
}
}
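# Interpretation note (not part of the original script): following Stegen et al. 2013 (cited above),
# |bNTI| > 2 is typically read as phylogenetic turnover that deviates significantly from the null
# expectation (deterministic selection), while |bNTI| < 2 is consistent with stochastic processes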
rownames(bNTI) = colnames(phylo$data)
colnames(bNTI) = colnames(phylo$data)
write.csv(bNTI, "MC_EPA_bNTI.csv", quote = F)