## HEADER ######################
## R CODE TO Validate SDMs
## Using the PresenceAbsence package
## T.A.Groen (T.A.Groen@UTwente.com)
## May 2014
setwd("D:mod12day-9_model validationscript")
## LOAD THE REQUIRED PACKAGES #######
library("PresenceAbsence")
## Load the Data
d<-read.csv("dataBase.csv",header=TRUE, sep=";")
## ORGANIZE AND SPLIT THE DATA
## To split the data randomly, we're going to assign a
## random variable with a uniform distribution to the
## database, called "random", using the runif() function.
## runif() stands for r[ANDOM]unif[ORM] and returns
## a vector of random numbers between 0 and 1. Other minima
## and maxima can be set using additional arguments.
## Check ?runif if you want to know more about this.
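## A quick illustration (the numbers will differ on every run):
runif(5)                  ## five random numbers between 0 and 1
runif(5, min=0, max=10)   ## five random numbers between 0 and 10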
d$random<-runif(length(d$Observation))
train<-d[d$random<0.70,] ## the training database (70%)
evalu<-d[d$random>=0.70,] ## the evaluation database (30%)
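## OPTIONAL: check the realized split; it will only be approximately
## 70/30 because runif() is random.
nrow(train)/nrow(d)   ## proportion of rows in the training set
nrow(evalu)/nrow(d)   ## proportion of rows in the evaluation set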
## FIT THE MODELS (Make sure all packages and functions are loaded)
## 1) LOGISTIC
source("brt.functions.R")
model.glm0 <- glm(Observation ~ tmean+tmed_ja+tr+south+west+slope+alt+p_tot+p_win+p_sum,
                  family="binomial",data=train)
model.glm <- step(model.glm0)
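## OPTIONAL: inspect the model that step() selected; summary() shows
## which predictors were retained and their coefficients.
summary(model.glm)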
## 2) REGRESSION TREE
model.tree<-rpart(Observation ~ tmean+tmed_ja+tr+south+west+slope+alt+p_tot+p_win+p_sum,
                  data=train,method="class")
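## OPTIONAL: visualize the fitted tree with the base rpart plotting
## functions; use.n=TRUE adds the number of observations per node.
plot(model.tree)
text(model.tree, use.n=TRUE)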
## 3) BOOSTED REGRESSION TREE
source("brt.functions.R")
model.brt <- gbm.step(data=train,
                      gbm.x = c(10,9,7,11,5,12,36,21,20,22),
                      gbm.y = 1,
                      family = "bernoulli",
                      tree.complexity = 5,
                      learning.rate = 0.01,
                      bag.fraction = 0.5)
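## OPTIONAL: summary() on the fitted gbm object prints (and plots)
## the relative influence of each predictor.
summary(model.brt)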
## CALCULATE THE PROBABILITIES FOR THE EVALUATION SET
## we will now "predict" for the points that were not
## used in fitting (the 30% randomly selected) and calculate
## how accurately the models can predict for these
## points.
evalu$p.logistic<-predict(model.glm,type="response",newdata=evalu)
evalu$p.tree<-predict(model.tree,newdata=evalu)[,2]
evalu$p.brt<-gbm.predict.grids(model.brt,evalu, want.grids = F)
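## OPTIONAL: a quick sanity check; all three new columns should hold
## probabilities between 0 and 1.
summary(evalu[,c("p.logistic","p.tree","p.brt")])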
## ASSESS THE VARIOUS EVALUATION CRITERIA (ROC, AUC, KAPPA ETC)
## THE "PresenceAbsence" PACKAGE HAS A NUMBER
## OF FUNCTIONS TO SHOW THESE. THESE FUNCTIONS MOST OF THE
## TIME REQUIRE THE DATA TO BE OFFERED AS A DATAFRAME WITH
## THE FOLLOWING STRUCTURE:
## DATA[,1]  plot ID                                   (text)
## DATA[,2]  observed values                           (zero-one values)
## DATA[,3]  predicted probabilities from first model  (numeric, between 0 and 1)
## DATA[,4]  predicted probabilities from second model, etc...
## (SEE ALSO THE HELP INFORMATION FOR THE PresenceAbsence PACKAGE)
## SO LET'S FIRST CREATE SUCH A DATAFRAME
pa.validate<-data.frame(ID=1:length(evalu$Observation),
                        PA=evalu$Observation,
                        logistic=evalu$p.logistic,
                        tree=evalu$p.tree,
                        brt=evalu$p.brt)
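## OPTIONAL: INSPECT THE FIRST ROWS TO VERIFY THE STRUCTURE
## (ID, OBSERVED 0/1, THEN ONE PROBABILITY COLUMN PER MODEL)
head(pa.validate)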
## QUESTION: Do you understand the code provided above to create a new
## dataset called "pa.validate"? How many variables should this dataset
## contain?
## NOW WE CAN START EVALUATING THE MODELS BY USING THE FUNCTIONS
## THAT THE PresenceAbsence PACKAGE PROVIDES US WITH
presence.absence.accuracy(pa.validate, threshold=0.5)
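## THE RESULT IS A DATA FRAME, SO YOU CAN ALSO STORE IT AND LOOK AT
## A SUBSET OF IT, E.G. JUST THE AUC PER MODEL (SEE
## ?presence.absence.accuracy FOR THE COLUMN NAMES):
acc<-presence.absence.accuracy(pa.validate, threshold=0.5)
acc[,c("model","AUC")]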
## BY SETTING A THRESHOLD, YOU CAN ASK THIS COMMAND TO CALCULATE
## A WHOLE RANGE OF EVALUATION PARAMETERS AT ONCE
## THERE ARE ALSO QUALITY PARAMETERS THAT DETERMINE THEIR
## OWN THRESHOLD (FOR EXAMPLE MAXIMUM KAPPA)
## YOU CAN USE THE FOLLOWING COMMAND FOR THAT:
optimal.thresholds(pa.validate)
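## YOU CAN ALSO RESTRICT THE OUTPUT TO SPECIFIC CRITERIA VIA THE
## opt.methods ARGUMENT (SEE ?optimal.thresholds FOR ALL METHOD NAMES):
optimal.thresholds(pa.validate, opt.methods=c("MaxKappa","MaxSens+Spec"))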
## We can also have a look at relevant plots
error.threshold.plot(pa.validate,which.model=1)
error.threshold.plot(pa.validate,which.model=2)
error.threshold.plot(pa.validate,which.model=3)
## the "which.model" argument specifies which
## of the models in the dataset should be plotted
auc.roc.plot(pa.validate)
## this function can plot ROC curves for all models
## at once, but you can also use the which.model argument
## when you want to select just one model.
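## for example, a single ROC curve for the BRT model only:
auc.roc.plot(pa.validate, which.model=3)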
save.image()
## ASSIGNMENT 1
## Fit models for at least three of the techniques
## you have learned so far
## on a random subset of your data,
## using a 50-50 split (see the sketch below). Look at the accuracy
## indicators of these models when you use predictions based on
## the training data and also when you use the predictions
## based on the evaluation data.
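## A minimal sketch of the 50-50 split asked for above, assuming your
## data are in a data frame called "d" as earlier in this script:
d$random<-runif(nrow(d))
train50<-d[d$random<0.50,]    ## 50% training
evalu50<-d[d$random>=0.50,]   ## 50% evaluation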
## QUESTION 1
## Which of the chosen modelling techniques performs best?
## QUESTION 2
## Which dataset (training or validation)
## gives the highest accuracy results?
## Which dataset of these gives the most honest
## assessment of the model quality?
setwd("D:mod12day-13_casestudyassignmentsscript")
d<-read.csv("viper_database.csv",header=TRUE, sep=",")
train<-d[d$random<0.70,]
evalu<-d[d$random>=0.70,
my.glm=glm(SPECIES~alt+geology+ecoclim+rad_w+temp_med_jun,data=d,family="binomial")
model.brt <- gbm.step(data=train,
                      gbm.x = c(10,9,7,11,5,12,36,21,20,22),
                      gbm.y = 1,
                      family = "bernoulli",
                      tree.complexity = 5,
                      learning.rate = 0.01,
                      bag.fraction = 0.5)
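## A minimal sketch of how these case-study models could be evaluated,
## following the same steps as above (this assumes SPECIES is the
## observed 0/1 variable in viper_database.csv):
evalu$p.glm<-predict(my.glm,type="response",newdata=evalu)
evalu$p.brt<-gbm.predict.grids(model.brt,evalu, want.grids = F)
pa.viper<-data.frame(ID=1:nrow(evalu),
                     PA=evalu$SPECIES,
                     glm=evalu$p.glm,
                     brt=evalu$p.brt)
presence.absence.accuracy(pa.viper, threshold=0.5)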
