% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/clustervalidation.R
\name{validation_kproto}
\alias{validation_kproto}
\title{Validating k Prototypes Clustering}
\usage{
validation_kproto(
  method = "silhouette",
  object = NULL,
  data = NULL,
  type = "huang",
  k = NULL,
  lambda = NULL,
  kp_obj = "optimal",
  verbose = FALSE,
  ...
)
}
\arguments{
\item{method}{Character specifying the validation index: \code{cindex}, \code{dunn}, \code{gamma}, \code{gplus}, \code{mcclain}, \code{ptbiserial}, \code{silhouette} (default) or \code{tau}.}

\item{object}{Object of class \code{kproto} resulting from a call with \code{kproto(..., keep.data=TRUE)}.}

\item{data}{Original data; only required if \code{object == NULL} and neglected if \code{object != NULL}.}

\item{type}{Character, to specify the distance for clustering; either \code{"huang"} or \code{"gower"}.}

\item{k}{Vector specifying the search range for the optimum number of clusters; if \code{NULL} the range will be set as \code{2:sqrt(n)}. Only required if \code{object == NULL} and neglected if \code{object != NULL}.}

\item{lambda}{Factor to trade off between Euclidean distance of numeric variables and simple matching coefficient between categorical variables.}

\item{kp_obj}{Character, either \code{"optimal"} or \code{"all"}: Output of the index-optimal clustering (\code{kp_obj == "optimal"}) or all computed cluster partitions (\code{kp_obj == "all"}); only required if \code{object == NULL}.}

\item{verbose}{Logical, whether additional information about process should be printed.}

\item{...}{Further arguments passed to \code{\link[clustMixType]{kproto}}, like:
\itemize{
  \item \code{nstart}: If > 1, \code{kproto} is run repeatedly with random initializations.
  \item \code{na.rm}: Character, either \code{"yes"} to strip \code{NA} values for complete case analysis, \code{"no"} to keep and ignore \code{NA} values, \code{"imp.internal"} to impute the \code{NAs} within the algorithm or \code{"imp.onestep"} to apply the algorithm ignoring the \code{NAs} and impute them after the partition is determined.
}}
}
\value{
For computing the optimal number of clusters based on the chosen validation index for k-Prototypes clustering, the output contains:

\item{k_opt}{optimal number of clusters (sampled in case of ambiguity)}

\item{index_opt}{index value of the index optimal clustering}

\item{indices}{calculated indices for \eqn{k=2,...,k_{max}}}

\item{kp_obj}{if \code{kp_obj == "optimal"}, the kproto object of the index-optimal clustering; if \code{kp_obj == "all"}, all kproto objects which were calculated}

For computing the index-value for a given k-Prototype clustering the output contains:

\item{index}{calculated index-value}
}
\description{
Calculating the preferred validation index for a k-Prototypes clustering with k clusters or computing the optimal number of clusters based on the chosen index for k-Prototypes clustering. Possible validation indices are: \code{cindex}, \code{dunn}, \code{gamma}, \code{gplus}, \code{mcclain}, \code{ptbiserial}, \code{silhouette} and \code{tau}.
}
\details{
More information about the implemented validation indices:
  \itemize{
    \item \code{cindex} \deqn{Cindex = \frac{S_w-S_{min}}{S_{max}-S_{min}}} \cr
For \eqn{S_{min}} and \eqn{S_{max}} it is necessary to calculate the distances between all pairs of points in the entire data set (\eqn{\frac{n(n-1)}{2}} distances in total). 
\eqn{S_{min}} is the sum of the \eqn{N_w} smallest distances and 
\eqn{S_{max}} is the sum of the \eqn{N_w} largest distances, where \eqn{N_w} is the total number of pairs of objects belonging to the same cluster. \eqn{S_w} is the sum of the within-cluster distances. \cr
The minimum value of the index is used to indicate the optimal number of clusters.

    \item \code{dunn} \deqn{Dunn = \frac{\min_{1 \leq i < j \leq q} d(C_i, C_j)}{\max_{1 \leq k \leq q} diam(C_k)}} \cr
The following applies: The dissimilarity between the two clusters \eqn{C_i} and \eqn{C_j} is defined as \eqn{d(C_i, C_j)=\min_{x \in C_i, y \in C_j} d(x,y)} and
the diameter of a cluster is defined as \eqn{diam(C_k)=\max_{x,y \in C_k} d(x,y)}. \cr
The maximum value of the index is used to indicate the optimal number of clusters.

    \item \code{gamma} \deqn{Gamma = \frac{s(+)-s(-)}{s(+)+s(-)}} \cr 
Comparisons are made between all within-cluster dissimilarities and all between-cluster dissimilarities. 
\eqn{s(+)} is the number of concordant comparisons and \eqn{s(-)} is the number of discordant comparisons.
A comparison is named concordant (resp. discordant) if a within-cluster dissimilarity is strictly less (resp. strictly greater) than a between-cluster dissimilarity.\cr
The maximum value of the index is used to indicate the optimal number of clusters.

    \item \code{gplus} \deqn{Gplus = \frac{2 \cdot s(-)}{\frac{n(n-1)}{2} \cdot (\frac{n(n-1)}{2}-1)}} \cr 
Comparisons are made between all within-cluster dissimilarities and all between-cluster dissimilarities. 
\eqn{s(-)} is the number of discordant comparisons and a comparison is named discordant if a within-cluster 
dissimilarity is strictly greater than a between-cluster dissimilarity. \cr
The minimum value of the index is used to indicate the optimal number of clusters.

    \item \code{mcclain} \deqn{McClain = \frac{\bar{S}_w}{\bar{S}_b}} \cr 
\eqn{\bar{S}_w} is the sum of within-cluster distances divided by the number of within-cluster distances and 
\eqn{\bar{S}_b} is the sum of between-cluster distances divided by the number of between-cluster distances.\cr
The minimum value of the index is used to indicate the optimal number of clusters.

    \item \code{ptbiserial} \deqn{Ptbiserial = \frac{(\bar{S}_b-\bar{S}_w) \cdot (\frac{N_w \cdot N_b}{N_t^2})^{0.5}}{s_d}} \cr 
\eqn{\bar{S}_w} is the sum of within-cluster distances divided by the number of within-cluster distances and 
\eqn{\bar{S}_b} is the sum of between-cluster distances divided by the number of between-cluster distances.\cr
\eqn{N_t} is the total number of pairs of objects in the data, \eqn{N_w} is the total number of pairs of 
objects belonging to the same cluster and \eqn{N_b} is the total number of pairs of objects belonging to different clusters.
\eqn{s_d} is the standard deviation of all distances.\cr
The maximum value of the index is used to indicate the optimal number of clusters.

    \item \code{silhouette} \deqn{Silhouette = \frac{1}{n} \sum_{i=1}^n \frac{b(i)-a(i)}{\max(a(i),b(i))}} \cr 
\eqn{a(i)} is the average dissimilarity of the i\emph{th} object to all other objects of its own cluster.
\eqn{b(i)=\min(d(i,C))}, where \eqn{d(i,C)} is the average dissimilarity of the i\emph{th} object to all objects of a cluster \eqn{C} other than its own, with the minimum taken over all such clusters.\cr
The maximum value of the index is used to indicate the optimal number of clusters.
    
    \item \code{tau} \deqn{Tau = \frac{s(+) - s(-)}{((\frac{N_t(N_t-1)}{2}-t)\frac{N_t(N_t-1)}{2})^{0.5}}} \cr 
Comparisons are made between all within-cluster dissimilarities and all between-cluster dissimilarities. 
\eqn{s(+)} is the number of concordant comparisons and \eqn{s(-)} is the number of discordant comparisons.
A comparison is named concordant (resp. discordant) if a within-cluster dissimilarity is strictly less 
(resp. strictly greater) than a between-cluster dissimilarity.\cr
\eqn{N_t} is the total number of distances \eqn{\frac{n(n-1)}{2}} and \eqn{t} is the number of comparisons 
of two pairs of objects where both pairs represent within-cluster comparisons or both pairs are between-cluster
comparisons. \cr
The maximum value of the index is used to indicate the optimal number of clusters.
   
  }
}
\examples{
\dontrun{
# generate toy data with factors and numerics
n   <- 10
prb <- 0.99
muk <- 2.5 

x1 <- sample(c("A","B"), 2*n, replace = TRUE, prob = c(prb, 1-prb))
x1 <- c(x1, sample(c("A","B"), 2*n, replace = TRUE, prob = c(1-prb, prb)))
x1 <- as.factor(x1)
x2 <- sample(c("A","B"), 2*n, replace = TRUE, prob = c(prb, 1-prb))
x2 <- c(x2, sample(c("A","B"), 2*n, replace = TRUE, prob = c(1-prb, prb)))
x2 <- as.factor(x2)
x3 <- c(rnorm(n, mean = -muk), rnorm(n, mean = muk), rnorm(n, mean = -muk), rnorm(n, mean = muk))
x4 <- c(rnorm(n, mean = -muk), rnorm(n, mean = muk), rnorm(n, mean = -muk), rnorm(n, mean = muk))
x <- data.frame(x1,x2,x3,x4)


# calculate optimal number of clusters, index values and cluster partition with Silhouette-index
val <- validation_kproto(method = "silhouette", data = x, k = 3:5, nstart = 5)


# apply k-prototypes
kpres <- kproto(x, 4, keep.data = TRUE)

# calculate cindex-value for the given cluster partition
cindex_value <- validation_kproto(method = "cindex", object = kpres)
}

}
\references{
\itemize{
    \item Aschenbruck, R., Szepannek, G. (2020): 
    Cluster Validation for Mixed-Type Data. 
    \emph{Archives of Data Science, Series A, Vol 6, Issue 1}.
    \doi{10.5445/KSP/1000098011/02}.
    
    \item Charrad, M., Ghazzali, N., Boiteau, V., Niknafs, A. (2014): 
    NbClust: An R Package for Determining the Relevant Number of Clusters in a Data Set. 
    \emph{Journal of Statistical Software, Vol 61, Issue 6}.
    \doi{10.18637/jss.v061.i06}.
  }
}
\author{
Rabea Aschenbruck
}
