### Reproduction script for R Journal Submission
### "PPCI: an R Package for Cluster Identification using Projection Pursuit"


run_reproduction_script <- function(){

### Preamble:

if(!"PPCI"%in%installed.packages()){

  cat('You must first install the PPCI package before continuing. Would you like to install the package? y/n \n \n')

  instl <- readline()

  if(instl=='y') install.packages("PPCI")
  else stop('You cannot continue to run the script without first installing the PPCI package. Please re-run the script')
}


library(PPCI)

### Example 1:
cat('Example 1: Clustering of two-dimensional toy data sets \n \n')
cat('In this example we show some of the types of clusters for which
    the methods in PPCI are, and are not, appropriate.
    Press [Enter] to continue \n \n')
readline('')  
cat('The first data set illustrates the capability of the methods
    to separate clusters with different scales. We apply the function
    mddc() to cluster the data.
    Press [Enter] to continue \n \n')
readline('')
set.seed(1)
means <- c(1, 0, -1, 0, 0, sqrt(2))*runif(6)
X1 <- matrix(means, 300, 2, byrow = T) + .05*rnorm(600)*(1:3)^2
sol1 <- mddc(X1, 3)
plot(X1, col = sol1$cluster, pch = sol1$cluster, xlab = '', ylab = '',
         xaxt = 'n', yaxt = 'n')

cat('Next we consider a simple case where the internal cluster variability
    dominates the between cluster separation. Classic methods like PCA may
    fail to find a projection which is useful for clustering such data.
    Here we use a single minimum density hyperplane to cluster the data,
    using the function mdh().
    Press [Enter] to continue \n \n')
readline('')
set.seed(1)
S <- matrix(c(1, .8, .8, 1), 2, 2)
X2 <- matrix(rnorm(1000), ncol= 2)%*%S + cbind(rep(c(.8,-.8),each=250), rep(c(-.8,.8),each=250))
sol2 <- mdh(X2)
plot(X2, col = sol2$cluster, pch = sol2$cluster, xlab = '', ylab = '',
         xaxt = 'n', yaxt = 'n')


cat('Finally we consider non-linear cluster structures. In the first case
    the two clusters are arranged so that they cannot be separated by
    a hyperplane. The methods in PPCI are inappropriate for such examples.
    In the second case the same clusters are rearranged so that their
    convex hulls do not overlap. In such cases the methods in PPCI are
    able to separate them. Again the function mdh() is applied.
    Press [Enter] to continue \n \n')
readline('')
set.seed(1)
th <- runif(500)*pi + rep(c(pi,0),each=250)
X3 <- cbind(cos(th)+rep(c(.5,-.5),each=250), sin(th)+rep(c(.2,-.2),each=250))
X3 <- X3 + .1*rnorm(1000)
sol3 <- mdh(X3)

par(mfrow = c(1, 2))
plot(X3, col = sol3$cluster, pch = sol3$cluster, xlab = '', ylab = '',
         xaxt = 'n', yaxt = 'n')

set.seed(1)
th <- runif(500)*pi + rep(c(pi,0),each=250)
X4 <- cbind(cos(th)+rep(c(.5,-.5),each=250), sin(th)-rep(c(.3,-.2),each=250))
X4 <- X4 + .1*rnorm(1000)
sol4 <- mdh(X4)
plot(X4, col = sol4$cluster, pch = sol4$cluster, xlab = '', ylab = '',
       xaxt = 'n', yaxt = 'n')

cat('Press Enter to continue to example 2. \n \n')
readline('')

### Example 2:
cat('Example 2: Clustering of images of handwritten digits \n \n')
cat('We begin by loading the data and then visualising the complete
data set as well as the data with the problematic edges removed.
Press [Enter] to continue \n \n')
readline('')
data(optidigits)
par(mfrow = c(1, 2))
plot(optidigits$x%*%eigen(cov(optidigits$x))$vectors, col = optidigits$c+1,
     xlab = 'PC1', ylab = 'PC2', main = 'Complete Data')
edges <- c((1:8)*8-7, (1:8)*8)
plot(optidigits$x[,-edges]%*%eigen(cov(optidigits$x[,-edges]))$vectors,
     col = optidigits$c+1, xlab = 'PC1', ylab = 'PC2',
     main = 'Data with Left and Right Edges Removed')

cat('Next we apply k-means to the complete data set,
the data with edges removed, and only the edges.
This will take a few seconds.
Press [Enter] to continue \n \n')
readline('')
set.seed(1)
cat('... applying k-means to the complete data set... \n \n')
km_sol <- suppressWarnings(kmeans(optidigits$x, 10, nstart = 10))
set.seed(1)
cat('... applying k-means to the data set with edges removed... \n \n')
km_sol_edges <- suppressWarnings(kmeans(optidigits$x[,edges], 10, nstart = 10))
set.seed(1)
cat('... applying k-means to only the edges... \n \n')
km_sol_middle <- suppressWarnings(kmeans(optidigits$x[,-edges], 10, nstart = 10))

cat('Next we visualise the cluster means from the three solutions
recast as images. You may need to re-size your plot window to
properly view these images.
Press [Enter] to view the solution from
the complete data set. This will take
a few seconds. \n \n')
readline('')
optidigits_mean_images(km_sol$cluster)
cat('Press [Enter] to view the solution from
the data set with edges removed. This will
take a few seconds. \n \n')
readline('')
optidigits_mean_images(km_sol_middle$cluster)
cat('Press [Enter] to view the solution from
only the edges of the images. This will take
a few seconds. \n \n')
readline('')
optidigits_mean_images(km_sol_edges$cluster)

cat('We now apply the divisive clustering
algorithms in PPCI to only the complete
data set. This will take about two minutes.
Press [Enter] to continue. \n \n')
readline('')
set.seed(1)
cat('... applying mddc to the complete data set... \n \n')
mddc_sol <- mddc(optidigits$x, 10)
cat('... applying ncutdc to the complete data set... \n \n')
ncutdc_sol <- ncutdc(optidigits$x, 10)
cat('... applying mcdc to the complete data set... \n \n')
mcdc_sol <- suppressWarnings(mcdc(optidigits$x, 10))


cat('We visualise the cluster means from the three solutions
as we did with the k-means solutions.
Press [Enter] to view the MDDC solution.
This will take a few seconds. \n \n')
readline('')
optidigits_mean_images(mddc_sol$cluster)
cat('Press [Enter] to view the NCutDC solution.
This will take a few seconds. \n \n')
readline('')
optidigits_mean_images(ncutdc_sol$cluster)
cat('Press [Enter] to view the MCDC solution.
This will take a few seconds. \n \n')
readline('')
optidigits_mean_images(mcdc_sol$cluster)

cat('You may have noticed that this final solution
was inferior to those obtained using MCDC and NCutDC.
This is caused by a slightly greater sensitivity of
MCDC to outliers. To mitigate this sensitivity we
can set the minimum cluster size to some appropriate
value. We set this to 50, but values between 10
and 200 will give similar solutions.
Press [Enter] to continue. \n \n')
readline('')
set.seed(1)
cat('... applying mcdc to the complete data set
with minimum cluster size set to 50... \n \n')
mcdc_sol <- suppressWarnings(mcdc(optidigits$x, 10, minsize = 50))
optidigits_mean_images(mcdc_sol$cluster)

cat('Press Enter to continue to example 3. \n \n')
readline('')

### Example 3:
cat('Example 3: Modifying and validating a clustering solution \n \n')
cat('In this example we run through the modification and validation
of a clustering solution obtained using PPCI. We again look
at the handwritten digits image data and begin with seven
clusters obtained using MDDC. The complete model will be
visualised as a binary partitioning tree.
Press [Enter] to continue. This will take a minute. \n \n')
readline('')
dev.new()
data(optidigits)
cat('... applying mddc for 7 clusters... \n \n')
sol <- mddc(optidigits$x, 7)
plot(sol)

cat('Notice the node numbered 4 in the plot. There are clearly
multiple clusters in the data at this node, but the
partition does not look optimal. We can investigate
this mode closely by considering alternative solutions
by considering more initialisations for projection
pursuit.
Press [Enter] to continue. \n \n')
readline('')
cat('... applying mdh to data at node 4 considering
3 initialisations... \n \n')
node4_x <- optidigits$x[sol$Nodes[[4]]$ixs,]
v0 <- function(X) rARPACK::eigs_sym(cov(X), 3)$vectors
node4_alt <- mdh(node4_x, v0 = v0)

cat('Once the alternative solutions have been found, we
can visualise them.
Press [Enter] to view the solution selected automatically. \n \n')
readline('')
plot(node4_alt)

cat('This solution appears visually to be superior to the
previous solution at node 4.
Press [Enter] to view the first (of two) alternatives to
this solution.')
readline('')
plot(node4_alt$alternatives[[1]])

cat('This solution is that in the previous model at node 4.
Press [Enter] to view the second alternative solution. \n \n')
readline('')
plot(node4_alt$alternatives[[2]])

cat('We replace the partition at node 4 with the best solution
found from these options. First we prune the tree at node 4 and
then split the node again using this new solution.
Press [Enter] to continue. \n \n')
readline('')
cat('... modifying solution at node 4 by first
pruning and then splitting the node... \n \n')
sol <- tree_prune(sol, 4)
v0 <- function(X) matrix(node4_alt$v, ncol = 1)
alpha <- node4_alt$params$alpha
sol <- tree_split(sol, 4, v0 = v0, alphamin = alpha)
plot(sol)

cat('The partition at node 4 looks superior at the scale of
the complete model as well.
At this stage all internal nodes appear to produce high
quality partitions, and so we turn attention to the
leaf nodes. Leaf nodes which appear to contain multiple
clusters should be further partitioned. Those which
are difficult to visualise at the scale of the complete
model can also be inspected to see if it would be
beneficial to continue splitting the data assigned
there. Node 10 is such an example.
Press [Enter] to view the node numbered 10 more closely. \n \n')
readline('')
plot(sol, node = 10)

cat('The outliers in the vertical direction have meant
that the scatter plot is less informative. The horizontal
direction is the optimal projection obtained from
the MDH method, while the vertical direction is simply the
principal component in the null space of the optimal
projection vector. Although the clustering itself only
takes place along the horizotal axis, the visualisation
using a scatter plot can be affected by these outliers.
In cases like this the univariate density along the
horizontal axis can be more instructive, and here we
see very strong bimodality suggesting the presence of
multiple clusters. We therefore add an additional
split at this node.
Press [Enter] to continue. \n \n')
readline('')
cat('... splitting node number 10... \n \n')
sol <- tree_split(sol, 9)
sol <- tree_split(sol, 10)

cat('We can inspect all leaf nodes in this way, and add binary
splits where there is strong evidence of multiple clusters. We
then iterate this procedure until no such leaf nodes with multiple
clusters remain. The projections at these leaves can also be modified
in the same manner as was done for node 4 previously, if it looks as
though a better partition could be obtained using a different projection
of the data. This interactive iterative procedure has the potential to
create very accurate clustering solutions, and can also be used to obtain
multiple alternative partitions of a data set. We will not go through
every step explicitly in this example.
Press [Enter] to allow the iterative interactive clustering to complete.
This will take a few minutes. \n \n')
readline('')

cat('... recursively splitting leaf nodes
until none show strong evidence of multiple
clusters... \n \n')
node12_x <- optidigits$x[sol$Nodes[[12]]$ixs,]
v0 <- function(X) rARPACK::eigs_sym(cov(X), 3)$vectors
node12_alt <- mdh(node12_x, v0 = v0)
v0 <- function(X) matrix(node12_alt$v, ncol = 1)
alpha <- node12_alt$params$alpha
sol <- tree_split(sol, 12, v0 = v0, alphamin = alpha)
sol <- tree_split(sol, 13)
sol <- tree_split(sol, 16)
sol <- tree_split(sol, 18)
sol <- tree_split(sol, 19)
sol <- tree_split(sol, 22)
sol <- tree_split(sol, 23)

plot(sol)

cat('The final solution is visualised as before. The solution is
valid in the sense that no internal nodes appear to induce sub-
optimal partitions, and no leaves show strong evidence of the presence
of multiple clusters.
Finally we compare the quality of this solution with the solution
assuming the number of clusters is given, and using the default
implementation of MDDC.
Press [Enter] to continue. This will take a few minutes. \n \n')
readline('')

cat('... applying mddc for the true number of (10) clusters... \n \n')
sol2 <- mddc(optidigits$x, 10)
print(cluster_performance(sol$cluster, optidigits$c))
print(cluster_performance(sol2$cluster, optidigits$c))

cat('The performance from the model obtained from interactive
modification (top) is substantially superior to the default model
(bottom). In Example 1 we also saw how the default model was already superior
to k-means by a large margin. \n \n')

cat('Press Enter to continue to example 4. \n \n')
readline('')

### Example 4:
cat('Example 4: Dimension reduction for clustering. \n \n')
cat('In addition to constructing a divisive hierarchical
clustering model using projection pursuit, we can combine
multiple univariate projections to obtain a single
multivariate projection of the data. We again look at the
handwritten images data.
Press [Enter] to continue. It will take about
five minutes to obtain all multivariate projections. \n \n')
readline('')
set.seed(1)
data(optidigits)
PCA_sol <- optidigits$x%*%eigen(cov(optidigits$x))$vectors[,1:9]
cat('... applying mddr for 9 dimensions... \n \n')
mddr_sol <- mddr(optidigits$x, 9)
cat('... applying mcdr for 9 dimensions... \n \n')
mcdr_sol <- suppressWarnings(mcdr(optidigits$x, 9))
cat('... applying ncutdr for 9 dimensions... \n \n')
ncutdr_sol <- ncutdr(optidigits$x, 9)

cat('To evaluate the performance of
the dimension reduction we look at the average silhouette
width of the data in the reduced space, with reference to
the true clusters. A high silhouette width suggests the
true clusters are well separated from one another. \n \n')
if(inst <- !"cluster"%in%installed.packages()){
  cat('To compute the silhouette index we use the
implementation in the cluster package, which is not
currently installed on your computer. If you would
like to install it now, type "y", otherwise type "n", then
press [Enter]. \n \n')
  entry <- readline('')
}
else entry <- ''
if(!inst || entry=='y'){
  if(entry=='y') install.packages('cluster')
  library(cluster)
  cat('The average silhouette index of the
clusters in the original data is \n')
  print(mean(silhouette(optidigits$c, dist(optidigits$x))[,3]))
  cat('The average silhouette index of the
clusters in the data in the principal component
space is \n')
  print(mean(silhouette(optidigits$c, dist(PCA_sol))[,3]))
  cat('The average silhouette index of the
clusters in the data projected using the methods
in PPCI is (top using MDH, middle using MCDC and
bottom using NCutH) \n')
  print(mean(silhouette(optidigits$c, dist(mddr_sol$fitted))[,3]))
  print(mean(silhouette(optidigits$c, dist(mcdr_sol$fitted))[,3]))
  print(mean(silhouette(optidigits$c, dist(ncutdr_sol$fitted))[,3]))
}
else{
  cat('Without the silhouette index we can visualise
the projected to to make a comparison on the quality
of the dimension reduction. To visualise the data
projected on their first principal components
press [Enter]. \n \n')
  readline('')
  plot(PCA_sol, col = optidigits$c)
  cat('To visualise the data projected onto a subspace
based on minimum density hyperplanes press [Enter]. \n \n')
  plot(mddr_sol, PCA = TRUE, labels = optidigits$c)
  cat('To visualise the data projected onto a subspace
based on maximum variance ratio press [Enter]. \n \n')
  plot(mcdr_sol, PCA = TRUE, labels = optidigits$c)
  cat('To visualise the data projected onto a subspace
based on minimum normalised cut hyperplanes press [Enter]. \n \n')
  plot(ncutdr_sol, PCA = TRUE, labels = optidigits$c)
}

cat('Press Enter to continue to example 5. \n \n')
readline('')

### Example 5:
cat('Example 5: Maximum margin hyperplanes for clustering. \n \n')
cat('In this example we use MDH repeatedly, increasing the bandwidth
each time to allow the solution to converge to a large margin
hyperplane. We apply this to the digits 3 and 9 from the optidigits
data set. We use only the test data, following Zhang et al. (2009)
"Maximum margin clustering made practical", ICML.
Press [Enter] to continue. This will take about
20 seconds. \n \n')
readline('')
cat('...finding large margin hyperplane using mdh... \n \n')
ids <- (3824:5620)[which(optidigits$c[3824:5620]%in%c(3, 9))]
x39 <- optidigits$x[ids,]
hp0 <- mdh(x39)
hp <- hp0
repeat{
  hp_new <- mdh(x39, v0 = hp$v, bandwidth = hp$params$h*0.9, alphamin = hp$params$alpha)
  if(hp$v%*%hp_new$v>(1-1e-10)) break
  else hp <- hp_new
}

cat('We can now visualise both the initial MDH solution, and
the final large margin solution. To view the first
MDH solution press [Enter]. \n \n')
readline('')
plot(hp0)

cat('To view the large margin solution, press
[Enter]. \n \n')
readline('')
plot(hp)

cat('The best performance report in Zhang et al. (2009)
was an error of 0.083. The performance of the large
margin solution obtained using MDH is \n')
print(1-cluster_performance(hp$cluster, optidigits$c[ids])[2])

cat('Press Enter to continue to example 6. \n \n')
readline('')

### Example 6:
cat('Example 6: Non-linear cluster separators. \n \n')
cat('In this final example we consider the separation
of clusters which cannot be separated by hyperplanes
in the input space. Press [Enter] to view a simple
example of such a data set. \n \n')
readline('')
set.seed(1)
th <- runif(500)*pi + rep(c(pi,0),each=250)
x <- cbind(cos(th) + rep(c(0.5,-0.5),each=250), sin(th)+ rep(c(0.2,-0.2),each=250))
x <- x + matrix(0.1*rnorm(1000), ncol=2)

plot(x)

cat('A hyperplane in this two-dimensional space
can be thought of as a straight line. Clearly there
is no straight line which can accurately separate
the two half-moon clusters. \n \n')

cat('We use kernel principal component analysis
(KPCA) to embed the data in a higher dimensional
space within which the clusters can be separated
by a hyperplane. \n')

if(inst <- !"kernlab"%in%installed.packages()){
  cat('The implementation of KPCA we use is in the
package "kernlab", which is not
currently installed on your computer.
If you would like to install it now, type "y",
otherwise type "n", then
press [Enter]. \n \n')
  entry <- readline('')
}
else entry <- ''
if(!inst || entry=='y'){
  if(entry=='y') install.packages('kernlab')
  cat('We apply ncuth to both the original data
and the embedded data. Press [Enter] to
continue. \n \n')
  readline('')
  library(kernlab)
  x2 <- kernlab::kpca(x, kernel = "rbfdot", kpar = list(sigma = 3))@rotated
  sol1 <- ncuth(x)
  sol2 <- ncuth(x2)
  par(mfrow = c(1, 2))
  par(mar = c(2.5, 2.5, 3, 2.5))
  plot(x, col = sol1$cluster, main = "Linear Separator")
  plot(x, col = sol2$cluster, main = "Non-linear Separator")
}
else{
  cat('Without the "kernlab" package we cannot
illustrate the separation of these clusters
using the methods in PPCI. \n \n')
}
cat('This completes all the examples in the paper.')
}

run_reproduction_script()
