############################################################################################################################### ### R Code accompanying ### ### When to Refrain from Using Likelihood Surface Methods for Geographic Offender Profiling: An Ex Ante Test of Assumptions ### ### M. Vere Van Koppen, Henk Elffers, and Stijn Ruiter ### ### Journal of Investigative Psychology and Offender Profiling ### ### July 04, 2011 ### ############################################################################################################################### ####################################################################################### ### !!! DO NOT CHANGE CODE (unless you want to specify your own decay function) !!! ### ####################################################################################### ########################## ### !!! START CODE !!! ### ########################## SDD<-function(datafile,n.sim=10000,seed=123456789,plot.results=FALSE,counter=FALSE){ require(foreign) # for SPSS data file import data<-read.spss(file=datafile,to.data.frame=T,use.value.labels=F) dataset<-paste(datafile) require(gdata) # for upperTriangle function ### Set arbitrary Dmax (arbitrary because after standardization Dmax drops out of equation; see article) km<-10 ### Set number of Monte Carlo simulations for determining CIs n.sample<-n.sim ### Set seed for random number generator in order to obtain similar results each time set.seed(seed) ### Just a simple function to calculate number of interpoint distances samples<-function(n){n*(n-1)/2} ### Just a simple function to plot circles circle <- function(x, y, r, ...) { ang <- seq(0, 2*pi, length = 100) xx <- x + r * cos(ang) yy <- y + r * sin(ang) polygon(xx, yy, ...) } tab <- data.frame(idnr=names(table(data$idnr)), fre=as.vector(table(data$idnr))) df.new <- merge(data, tab, by="idnr") cases<-sort(unique(df.new$fre)) indices<-matrix(data=NA,nrow=length(tab$idnr),ncol=4) marauder<-matrix(data=NA,nrow=length(tab$idnr),ncol=1) indices.temp<-0 ### Loop over all sets of cases with specific number of offenses in real data if(plot.results==1){par(mfrow=c(2,1),mar=c(4,4,3,20))} for (i in 1:length(cases)){ n<-cases[i] print(paste("Series length: ",n,sep="")) identifiers<-matrix(data=NA,nrow=(dim(as.matrix(unique(subset(df.new, fre==cases[i])$idnr)))[1]),ncol=1) for (i in 1:(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1])){ identifier<-as.matrix(unique(subset(df.new, fre==n)$idnr))[i] print(paste("Offender ID: ",identifier,sep="")) identifiers[i]<-identifier } column<-rep(1:n,each=n) row<-rep(1:n,times=n) lookuptable<-cbind(column,row) ################## ### SIMULATION ### ################## file <- paste("CIs",n.sample,".",n,".csv" ,sep="") ######################################### ### Linear decay circle distributions ### ######################################### direction.decay<-matrix(data=NA,nrow=n.sample,ncol=n) distance.from.origin.decay<-matrix(data=NA,nrow=n.sample,ncol=n) for (i in 1:n.sample){ direction.decay[i,]<-runif(n,min=0,max=2*pi) distance.from.origin.decay[i,]<-km-km*sqrt(runif(n,min=.0001,max=1)) } ### Convert simulated angles + simulated distances from origin to xy-coordinates on circle y.decay<-sin(direction.decay)*distance.from.origin.decay x.decay<-cos(direction.decay)*distance.from.origin.decay if(file.exists(file)){ CIs<-read.csv(file) print(paste("Load CI file: ",file,sep="")) } else{ if(counter==1){ print(paste("The necessary CI for series length ",n," has not been simulated yet.",sep="")) print("Simulation will now be done. The CI will be saved for future use.") } #################################################################################### ### Compute all distances between crime sites within one simulated set of crimes ### #################################################################################### ### Start with calculating difference between sets of x-coordinates and sets of y-coordinates x.dif.decay<-matrix(data=NA,nrow=n*n,ncol=n.sample) y.dif.decay<-matrix(data=NA,nrow=n*n,ncol=n.sample) temp<-0 for (i in 1:n){ for (j in 1:n){ x.dif.decay[temp+j,]<-abs(x.decay[,i]-x.decay[,j]) y.dif.decay[temp+j,]<-abs(y.decay[,i]-y.decay[,j]) } temp<-temp+j } ### Determine distances distance.decay<-matrix(data=NA,nrow=n.sample,ncol=samples(n)) max.distance.decay<-matrix(data=NA,nrow=n.sample,ncol=1) rel.distance.decay<-matrix(data=NA,nrow=n.sample,ncol=samples(n)) for (i in 1:n.sample){ distance.decay[i,]<-sqrt((upperTriangle(matrix(data=(x.dif.decay[,i]),ncol=n,nrow=n))^2)+(upperTriangle(matrix(data=(y.dif.decay[,i]),ncol=n,nrow=n))^2)) max.distance.decay[i]<-max(distance.decay[i,]) rel.distance.decay[i,]<-distance.decay[i,]/max.distance.decay[i] } ### Compute confidence intervals for empirical cumulative distribution function of relative distances rel.distribution.decay<-matrix(data=NA,nrow=length(0:(km*10))*n.sample,ncol=1) temp<-0 if(counter==1){ ### Create progress bar pb<-txtProgressBar(min=0,max=n.sample,style=3)} for (i in 1:n.sample){ for (j in 0:(km*10)){ rel.distribution.decay[temp+j+1]<-ecdf(rel.distance.decay[i,])(j/100) } temp<-temp+j+1 if(counter==1){ setTxtProgressBar(pb,i) }} rel.percent2.5.decay<-matrix(data=NA,nrow=length(0:(km*10)),ncol=1) rel.percent97.5.decay<-matrix(data=NA,nrow=length(0:(km*10)),ncol=1) rel.percent0.5.decay<-matrix(data=NA,nrow=length(0:(km*10)),ncol=1) rel.percent99.5.decay<-matrix(data=NA,nrow=length(0:(km*10)),ncol=1) for (i in 1:(km*10+1)){ rel.percent2.5.decay[i]<-quantile(matrix(data=rel.distribution.decay,ncol=n.sample)[i,],probs=seq(0,1,.025),type=3)[2] rel.percent97.5.decay[i]<-quantile(matrix(data=rel.distribution.decay,ncol=n.sample)[i,],probs=seq(0,1,.025),type=3)[40] rel.percent0.5.decay[i]<-quantile(matrix(data=rel.distribution.decay,ncol=n.sample)[i,],probs=seq(0,1,.005),type=3)[2] rel.percent99.5.decay[i]<-quantile(matrix(data=rel.distribution.decay,ncol=n.sample)[i,],probs=seq(0,1,.005),type=3)[200] } CIs<-as.data.frame(cbind(rel.percent2.5.decay,rel.percent97.5.decay,rel.percent0.5.decay,rel.percent99.5.decay)) colnames(CIs)<-c("decay2.5","decay97.5","decay0.5","decay99.5") write.csv(CIs,file=file) } ################# ### Real data ### ################# x.real.orig<-matrix(data=NA,nrow=(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]),ncol=n) y.real.orig<-matrix(data=NA,nrow=(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]),ncol=n) for (i in 1:(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1])){ x.real.orig[i,]<-subset(df.new,idnr==unique(subset(df.new, fre==n)$idnr)[i])$Xpd y.real.orig[i,]<-subset(df.new,idnr==unique(subset(df.new, fre==n)$idnr)[i])$Ypd } ### Compute all distances between crime sites within one sample x.real.orig.dif<-matrix(data=NA,nrow=n*n,ncol=(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1])) y.real.orig.dif<-matrix(data=NA,nrow=n*n,ncol=(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1])) temp<-0 for (i in 1:n){ for (j in 1:n){ x.real.orig.dif[temp+j,]<-abs(x.real.orig[,i]-x.real.orig[,j]) y.real.orig.dif[temp+j,]<-abs(y.real.orig[,i]-y.real.orig[,j]) } temp<-temp+j } distance.orig.real<-matrix(data=NA,nrow=(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]),ncol=(n*(n-1)/2)) max.distance.orig.real<-matrix(data=NA,nrow=(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]),ncol=1) rel.distance.orig.real<-matrix(data=NA,nrow=(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]),ncol=(n*(n-1)/2)) whichmaxdistance.orig<-matrix(data=NA,nrow=(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]),ncol=1) whichmaxdistance.orig.col<-matrix(data=NA,nrow=(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]),ncol=1) whichmaxdistance.orig.row<-matrix(data=NA,nrow=(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]),ncol=1) for (i in 1:(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1])){ distance.orig.real[i,]<-sqrt((upperTriangle(matrix(data=(x.real.orig.dif[,i]),ncol=n,nrow=n))^2)+(upperTriangle(matrix(data=(y.real.orig.dif[,i]),ncol=n,nrow=n))^2)) max.distance.orig.real[i]<-max(distance.orig.real[i,]) rel.distance.orig.real[i,]<-distance.orig.real[i,]/max.distance.orig.real[i] whichmaxdistance.orig[i]<-min(which(max(sqrt((matrix(data=(x.real.orig.dif[,i]),ncol=n,nrow=n))^2+(matrix(data=(y.real.orig.dif[,i]),ncol=n,nrow=n))^2))==sqrt((matrix(data=(x.real.orig.dif[,i]),ncol=n,nrow=n))^2+(matrix(data=(y.real.orig.dif[,i]),ncol=n,nrow=n))^2))) whichmaxdistance.orig.row[i]<-lookuptable[min(which(max(sqrt((matrix(data=(x.real.orig.dif[,i]),ncol=n,nrow=n))^2+(matrix(data=(y.real.orig.dif[,i]),ncol=n,nrow=n))^2))==sqrt((matrix(data=(x.real.orig.dif[,i]),ncol=n,nrow=n))^2+(matrix(data=(y.real.orig.dif[,i]),ncol=n,nrow=n))^2))),2] whichmaxdistance.orig.col[i]<-lookuptable[min(which(max(sqrt((matrix(data=(x.real.orig.dif[,i]),ncol=n,nrow=n))^2+(matrix(data=(y.real.orig.dif[,i]),ncol=n,nrow=n))^2))==sqrt((matrix(data=(x.real.orig.dif[,i]),ncol=n,nrow=n))^2+(matrix(data=(y.real.orig.dif[,i]),ncol=n,nrow=n))^2))),1] } identifiers<-matrix(data=NA,nrow=(dim(as.matrix(unique(subset(df.new, fre==cases[i])$idnr)))[1]),ncol=1) real.distance.to.origin<-matrix(data=NA,nrow=n,ncol=1) for (i in 1:(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1])){ identifier<-as.matrix(unique(subset(df.new, fre==n)$idnr))[i] identifiers[i]<-identifier title <- bquote(bold(paste("Spatial pattern of offender no. ", .(identifier), " (n = ", .(n), ")"))) id<-subset(df.new,idnr==unique(subset(df.new, fre==n)$idnr)[i]) real.distance.to.origin<-sqrt(id$Xpd^2+id$Ypd^2) if(plot.results==1){ plot(id$Xpd,id$Ypd,main=title,cex.main=.8,xlab="",ylab="",col=i+4,pch=19,cex=.6,xlim=c(-max(real.distance.to.origin),max(real.distance.to.origin)),ylim=c(-max(real.distance.to.origin),max(real.distance.to.origin))) points(0,0,col=3,pch=19) circle((x.real.orig[i,whichmaxdistance.orig.row[i]]+x.real.orig[i,whichmaxdistance.orig.col[i]])/2,(y.real.orig[i,whichmaxdistance.orig.row[i]]+y.real.orig[i,whichmaxdistance.orig.col[i]])/2,max.distance.orig.real[i]/2,border=2) } marauder[i+indices.temp]<-sqrt(((x.real.orig[i,whichmaxdistance.orig.row[i]]+x.real.orig[i,whichmaxdistance.orig.col[i]])/2)^2+((y.real.orig[i,whichmaxdistance.orig.row[i]]+y.real.orig[i,whichmaxdistance.orig.col[i]])/2)^2)<=max.distance.orig.real[i]/2 } if(plot.results==1){ ### Plot confidence intervals on empirical cumulative distribution functions plot(1:(km*10+1)/5/(km*2),CIs$decay0.5,col=2,type="l",lwd=2,xlim=c(0,1),main="99% confidence interval for\nempirical cumulative distribution functions\nbelonging to linear distance decay distribution",xlab="",ylab="",cex.main=.8,cex.lab=.8) lines(1:(km*10+1)/5/(km*2),CIs$decay99.5,col=2,lwd=2) ### Plot real data on top of empirical cumulative distribution function for (i in 1:(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1])){ plot.ecdf(rel.distance.orig.real[i,],add=T,col=i+4) } lines(1:(km*10+1)/5/(km*2),CIs$decay0.5,col=2,lwd=2) lines(1:(km*10+1)/5/(km*2),CIs$decay99.5,col=2,lwd=2) plot.file<-paste("plot.",dataset,".",n.sample,".",n,".pdf" ,sep="") dev.copy2pdf(file=plot.file) } ### Determine whether empirical point pattern falls inside or outside simulated distribution below2.5.decay<-matrix(data=NA,nrow=length(1:((km*10)+1)),ncol=dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]) above97.5.decay<-matrix(data=NA,nrow=length(1:((km*10)+1)),ncol=dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]) outside.decay<-matrix(data=NA,nrow=length(1:((km*10)+1)),ncol=dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]) inside.any<-matrix(data=NA,nrow=length(1:((km*10)+1)),ncol=dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]) below0.5.decay<-matrix(data=NA,nrow=length(1:((km*10)+1)),ncol=dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]) above99.5.decay<-matrix(data=NA,nrow=length(1:((km*10)+1)),ncol=dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]) outside99.decay<-matrix(data=NA,nrow=length(1:((km*10)+1)),ncol=dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]) inside99.any<-matrix(data=NA,nrow=length(1:((km*10)+1)),ncol=dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1]) for (i in 1:(dim(as.matrix(unique(subset(df.new, fre==n)$idnr)))[1])){ for (k in 1:((km*10)+1)){ above97.5.decay[k,i]<-ecdf(rel.distance.orig.real[i,])((k-1)/100)>CIs$decay97.5[k] below2.5.decay[k,i]<-ecdf(rel.distance.orig.real[i,])((k-1)/100)CIs$decay99.5[k] below0.5.decay[k,i]<-ecdf(rel.distance.orig.real[i,])((k-1)/100)0 indices[indices.temp+i,2]<-sum(sum(outside.decay[,i])>0)<1 indices[indices.temp+i,3]<-sum(outside99.decay[,i])>0 indices[indices.temp+i,4]<-sum(sum(outside99.decay[,i])>0)<1 } indices.temp<-indices.temp+i } indices<-cbind(indices,marauder[,1]) colnames(indices)<-c("outside 95% linear decay distribution","inside any 95% distribution","outside 99% linear decay distribution","inside any 99%distribion","Marauder") indices indicesfile <- paste("indices.",dataset,".",n.sample,".csv" ,sep="") write.csv(indices,file=indicesfile) } ######################## ### !!! END CODE !!! ### ######################## ########################################## ### Change user-defined settings below ### ########################################## #################################################################### ### Required R packages: foreign, gdata ### ### If these packages are not installed, run the following code: ### ### install.packages("foreign") ### ### install.packages("gdata") ### #################################################################### ######################################################### ### Determine data location below ### ### Data file should contain the following variables: ### ### idnr (identifier for offender) ### ### Xpd (X coordinate for offense) ### ### Ypd (Y coordinate for offense) ### ######################################################### ### First set working directory that contains data file(s) setwd("C:/temp/") # Now, run the SDD function SDD("test.sav",n.sim=10000,seed=123456789,plot.results=TRUE,counter=TRUE)