##--------------------------------------------------------------------------------------------
##	
##		R code for simulations and analysis presented in:  
##			--> Galipaud et al. Ecologists overestimate the importance of 
##				predictor variables in model averaging: a plea for cautious 
##				interpretations. Methods in Ecology and Evolution
##		Author contact : matthias.galipaud@u-bourgogne.fr
##		R version : 3.0.2
##		file last modified: 19:20 04/08/2014
##
##--------------------------------------------------------------------------------------------
 
###################################################################################
###
###		install and load the required packages
###
###################################################################################
 
### MuMIn package for model averaging :
# install.packages('MuMIn')
library(MuMIn)
options(na.action = "na.fail")  # make model fitting fail on missing data, so all candidate models are fitted to the same observations
 
### Package for the Cholesky decomposition (used to simulate data with a controlled 
###	correlation structure)
# install.packages("mvtnorm") 
library(mvtnorm)
 
###################################################################################
###
###	 simulation of a dataset with a controlled correlation structure
###	 with some summary statistics and control tests
###
###################################################################################
 
###
###	function for the simulation of the dataset
###
 
data.simulation <- function(sample.size = 100, corr_y_x1 = 0.7, corr_y_x2 = 0.2, corr_y_x3 = 0.1, precision = 0.01) {	
	# sample.size controls the number of observations in the dataset
	# corr_y_x1 = expected correlation coefficient between y and x1
	# corr_y_x2 = expected correlation coefficient between y and x2
	# corr_y_x3 = expected correlation coefficient between y and x3
	corr_xi_xj <- 0  # expected correlation coefficient between predictor variables
	# target correlation matrix; rows/columns ordered as (y, x1, x2, x3)
	mat <- matrix(c(1,         corr_y_x1,  corr_y_x2,  corr_y_x3,
					corr_y_x1, 1,          corr_xi_xj, corr_xi_xj,
					corr_y_x2, corr_xi_xj, 1,          corr_xi_xj,
					corr_y_x3, corr_xi_xj, corr_xi_xj, 1), nrow = 4)
	cor1 <- 999
	cor2 <- 999
	cor3 <- 999
	# loop repeating data simulations until the requested correlations are reached
	while(	abs(cor1 - corr_y_x1) > precision |  abs(cor2 - corr_y_x2) > precision | abs(cor3 - corr_y_x3) > precision ){
		dat <- rmvnorm(n=sample.size, mean=c(0,0,0,0), sigma=mat, method="chol")
		dat <- as.data.frame(dat); names(dat) <- c("y", "x1", "x2", "x3")
		y <- dat$y + 5
		x1 <- dat$x1 + 10
		x2 <- dat$x2 + 10
		x3 <- dat$x3 + 10
		cor1 <- cor.test(y,x1)$estimate
		cor2 <- cor.test(y,x2)$estimate
		cor3 <- cor.test(y,x3)$estimate
		# print(c(cor1, cor2, cor3))
	}
	x4 <- rnorm(sample.size, 10, 1)  # x4: a predictor uncorrelated with y by construction
	dat <- data.frame(y, x1, x2, x3, x4)
	return(dat)
}
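 
###
###	reproducibility note (an addition to the original script): setting a seed
###	before calling data.simulation() makes the rejection-sampling loop return
###	the same dataset on every run
###
 
# set.seed(123)  # 123 is an arbitrary choice of seed
# dat <- data.simulation(sample.size = 100)
# round(cor(dat)["y", c("x1", "x2", "x3")], 2)  # should be close to 0.7, 0.2 and 0.1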
 
###
### simulate or import a data set
###
 
dat <- data.simulation(sample.size = 1000)
 
###
### some control statistics about the simulated dataset
###
 
summary(dat)
pairs(dat)
cor(dat)
cor.test(dat$y, dat$x1)
cor.test(dat$y, dat$x2)
cor.test(dat$y, dat$x3)
cor.test(dat$y, dat$x4)
 
### 
###	assessment of the normality of the distributions
###
 
# install.packages('fitdistrplus')
library(fitdistrplus)
## for the response
plot(fitdist(dat$y, "norm"))
shapiro.test(dat$y)
## for x1
plot(fitdist(dat$x1, "norm"))
shapiro.test(dat$x1)
## for x2
plot(fitdist(dat$x2, "norm"))
shapiro.test(dat$x2)
## for x3
plot(fitdist(dat$x3, "norm"))
shapiro.test(dat$x3)
## for x4
plot(fitdist(dat$x4, "norm"))
shapiro.test(dat$x4)
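## the same checks for all variables in one loop (an equivalent, more compact
## alternative, not in the original script):
# for (v in names(dat)) {
# 	plot(fitdist(dat[[v]], "norm"))
# 	print(shapiro.test(dat[[v]]))
# }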
 
### 
###	assessment of collinearity among the covariates, as it may affect the results
###
 
mat_collinearity <- model.matrix(~ x1 + x2 + x3 + x4, data = dat)
# A correlation matrix among the explanatory variables indicates whether any
# given pair of right-hand-side variables is likely to create multicollinearity
# problems. Off-diagonal values of at least 0.4 are sometimes interpreted as
# indicating a multicollinearity problem.
cor(mat_collinearity[,-1])
# Since the determinant of a matrix is the product of its eigenvalues, the
# presence of one or more small eigenvalues indicates collinearity.
eigen(cov(mat_collinearity[,-1]))$values
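# a complementary diagnostic (an addition to the original script): the condition
# number, i.e. sqrt(largest eigenvalue / smallest eigenvalue); values above
# about 30 are commonly taken to signal serious collinearity
eig_values <- eigen(cov(mat_collinearity[,-1]))$values
sqrt(max(eig_values) / min(eig_values))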
# variance inflation factor: VIF values of 5 to 10 and above are commonly taken to indicate a multicollinearity problem (but see O'Brien 2007)
# install.packages('HH')
library(HH)
vif(dat[,-1])
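# the same VIFs with base R only (a sketch, in case HH is unavailable):
# VIF_j = 1 / (1 - R2_j), where R2_j comes from regressing predictor j on
# all remaining predictors
predictors <- c("x1", "x2", "x3", "x4")
sapply(predictors, function(v) {
	other <- setdiff(predictors, v)
	r2 <- summary(lm(reformulate(other, response = v), data = dat))$r.squared
	1 / (1 - r2)
})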
 
###################################################################################
###
###	 model averaging with a single simulated dataset (--> Tab. 2)
###
###################################################################################
 
###
###	model averaging (using the MuMIn package)
###		
 
dat <- data.simulation(sample.size = 1000)
 
reg <- lm(y ~ x1 + x2 + x3 + x4, data = dat)
# reg <- lm(y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x2:x4 + x3:x4, data = dat)
summary(reg)
ms1 <- dredge(reg, extra = "adjR^2", rank = "AICc")
ms1 # as presented in Tab. 2
confset <- get.models(ms1, subset = TRUE) # keep all models of the candidate set
avgmod <- model.avg(confset)
summary(avgmod) # averaged estimates and parameters' sum of weights (SW)
confint(avgmod) # confidence intervals of the averaged estimates
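# the per-predictor sum of weights can also be extracted directly; in MuMIn
# versions of this era the accessor is importance() (renamed sw() in later
# releases):
importance(avgmod)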
 
### 
###	computation of the baseline sum of weights by permutation
### based on Burnham & Anderson (2002, p. 345-347, second method)
###
 
permutation_nb <- 1000 # number of permutations (at least 100; a thousand or more is better, but slower)
perm_weight_x1 <- numeric(permutation_nb)
perm_weight_x2 <- numeric(permutation_nb)
perm_weight_x3 <- numeric(permutation_nb)
perm_weight_x4 <- numeric(permutation_nb)
for (i in 1:permutation_nb ) {
	# permutation of y -------------------------------------
	dat$y2 <- sample(dat$y)
	# model averaging --------------------------------------
	reg <- lm(y2 ~ x1 + x2 + x3 + x4, data = dat)
	# reg <- lm(y2 ~ x1 + x2 + x3 + x4 + 
					# x1:x2 + x1:x3 + x1:x4 + x2:x3 + x2:x4 + x3:x4, data = dat)
	ms2 <- dredge(reg)
	confset2 <- get.models(ms2, cumsum(weight) <= 1)
	avgmod2 <- model.avg(confset2)
	perm_weight_x1[i] <- summary(avgmod2)$importance["x1"]
	perm_weight_x2[i] <- summary(avgmod2)$importance["x2"]
	perm_weight_x3[i] <- summary(avgmod2)$importance["x3"]
	perm_weight_x4[i] <- summary(avgmod2)$importance["x4"]
	### computation progress (this can be very slow) -------
	cat(round(100*i/permutation_nb,2), "% \n")
	flush.console()  
}
# estimated baseline SW value for each single predictor variable:
mean(perm_weight_x1)
mean(perm_weight_x2)
mean(perm_weight_x3)
mean(perm_weight_x4)
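## a possible extension (not in the original script): compare each observed SW
## (from avgmod fitted above) with its permutation distribution; the proportion
## of permuted values at or above the observed SW acts like a permutation p-value
obs_sw <- summary(avgmod)$importance
mean(perm_weight_x1 >= obs_sw["x1"])
mean(perm_weight_x2 >= obs_sw["x2"])
mean(perm_weight_x3 >= obs_sw["x3"])
mean(perm_weight_x4 >= obs_sw["x4"])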
## plot of the distributions
xmin <- 0
close.screen(all = TRUE)
split.screen(c(2,2))
screen(1)
hist(perm_weight_x1, breaks = seq(xmin, 1, 0.05),
		xlab = expression(paste("permuted sum of weights ", (x[1]))),
		main = paste("baseline sum of weights =", round(mean(perm_weight_x1), 2)))
abline(v = mean(perm_weight_x1), lwd = 2, col = "red")
screen(2)
hist(perm_weight_x2, breaks = seq(xmin, 1, 0.05),
		xlab = expression(paste("permuted sum of weights ", (x[2]))),
		main = paste("baseline sum of weights =", round(mean(perm_weight_x2), 2)))
abline(v = mean(perm_weight_x2), lwd = 2, col = "red")
screen(3)
hist(perm_weight_x3, breaks = seq(xmin, 1, 0.05),
		xlab = expression(paste("permuted sum of weights ", (x[3]))),
		main = paste("baseline sum of weights =", round(mean(perm_weight_x3), 2)))
abline(v = mean(perm_weight_x3), lwd = 2, col = "red")
screen(4)
hist(perm_weight_x4, breaks = seq(xmin, 1, 0.05),
		xlab = expression(paste("permuted sum of weights ", (x[4]))),
		main = paste("baseline sum of weights =", round(mean(perm_weight_x4), 2)))
abline(v = mean(perm_weight_x4), lwd = 2, col = "red")
 
###################################################################################
###
###	    distribution of the sum of weights (--> Fig. 1). The basic idea is to repeat
###		the previous experiment with a large number of independent simulated datasets. 
###     For each dataset, we compute the SW of each predictor.
###
###################################################################################
 
nb.simul <- 100 # number of simulations (at least 100, but the larger, the better)
sample.size <- 100 # sample size = number of observations in the data set
weight_x1 <- numeric(nb.simul)
weight_x2 <- numeric(nb.simul)
weight_x3 <- numeric(nb.simul)
weight_x4 <- numeric(nb.simul)
for( i in 1:nb.simul) {
	### dataset simulation -------------------------------
	dat <- data.simulation(sample.size)
	### model averaging -------------------------------
	reg <- lm(y ~ x1 + x2 + x3 + x4, data = dat)
	ms1 <- dredge(reg, rank = "AICc")
	confset <- get.models(ms1, subset = TRUE)
	avgmod <- model.avg(confset)
	# each predictor's sum of weights
	weight_x1[i] <- summary(avgmod)$importance["x1"]
	weight_x2[i] <- summary(avgmod)$importance["x2"]
	weight_x3[i] <- summary(avgmod)$importance["x3"]
	weight_x4[i] <- summary(avgmod)$importance["x4"]
	### computation progress (this can be very slow) -------
	cat(100*i/nb.simul, "% \n")
	flush.console()  	
}
xmin <- 0
close.screen(all = TRUE)
split.screen(c(2,2))
screen(1)
hist(weight_x1, xlim = c(0,1), breaks = seq(xmin, 1,0.05),
		main = "", xlab = expression(paste("sum of weights ", (x[1]))), las = 1)
abline(v = mean(weight_x1, na.rm = T), col = "black", lty = 3, lwd = 2)
screen(2)
hist(weight_x2, xlim = c(0,1), breaks = seq(xmin, 1,0.05),
		main = "", xlab = expression(paste("sum of weights ", (x[2]))), las = 1)
abline(v = mean(weight_x2, na.rm = T), col = "black", lty = 3, lwd = 2)
screen(3)
hist(weight_x3,  xlim = c(0,1), breaks = seq(xmin, 1,0.05),
		main = "", xlab = expression(paste("sum of weights ", (x[3]))), las = 1)
abline(v = mean(weight_x3, na.rm = T), col = "black", lty = 3, lwd = 2)
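# proportion of simulations in which the SW of x3 exceeds 0.5 and 0.75, respectively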
length(weight_x3[weight_x3>0.5])/nb.simul
length(weight_x3[weight_x3>0.75])/nb.simul
screen(4)
hist(weight_x4,  xlim = c(0,1), breaks = seq(xmin, 1,0.05),
		main = "", xlab = expression(paste("sum of weights ", (x[4]))), las = 1)
abline(v = mean(weight_x4, na.rm = T), col = "black", lty = 3, lwd = 2)
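# proportion of simulations in which the SW of x4 exceeds 0.5 and 0.75, respectively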
length(weight_x4[weight_x4>0.5])/nb.simul
length(weight_x4[weight_x4>0.75])/nb.simul