## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## -----------------------------------------------------------------------------
library(autoFlagR)
library(dplyr)

## -----------------------------------------------------------------------------
set.seed(123)

# Create a synthetic dataset: three numeric features plus a categorical column
n <- 1000
data <- data.frame(
  id = 1:n,
  feature1 = rnorm(n, 100, 15),
  feature2 = rnorm(n, 50, 10),
  feature3 = rpois(n, 5),
  category = sample(c("A", "B", "C"), n, replace = TRUE)
)

# Introduce known anomalies (ground truth): 37 rows in total (3.7% of the data)
anomaly_indices <- c(1:20, 50:55, 100:110)
data$feature1[anomaly_indices] <- data$feature1[anomaly_indices] * 5  # Extreme values
data$feature2[anomaly_indices] <- data$feature2[anomaly_indices] * 3
data$feature3[anomaly_indices] <- data$feature3[anomaly_indices] * 10

# Create ground truth labels and add to data
data$is_error <- rep(FALSE, n)
data$is_error[anomaly_indices] <- TRUE

cat("Total anomalies in ground truth:", sum(data$is_error), "\n")

## -----------------------------------------------------------------------------
# Score anomalies with ground truth for benchmarking
scored_data <- score_anomaly(
  data,
  method = "iforest",
  contamination = 0.05,  # expected anomaly share; the true rate here is 3.7%
  ground_truth_col = "is_error"
)

# Confirm that benchmarking was performed (metrics are attached as an attribute)
if (!is.null(attr(scored_data, "benchmark_metrics"))) {
  cat("Benchmarking metrics available!\n")
}
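
## -----------------------------------------------------------------------------
# The metrics travel with the scored data as the "benchmark_metrics" attribute
# tested above; a quick look at its structure before extracting it formally:
str(attr(scored_data, "benchmark_metrics"), max.level = 1)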

## -----------------------------------------------------------------------------
# Extract benchmark metrics
metrics <- extract_benchmark_metrics(scored_data)

# Display metrics
cat("AUC-ROC:", round(metrics$auc_roc, 4), "\n")
cat("AUC-PR:", round(metrics$auc_pr, 4), "\n")
cat("Contamination Rate:", round(metrics$contamination_rate * 100, 2), "%\n")
cat("\nTop-K Recall:\n")
for (k_name in names(metrics$top_k_recall)) {
  k_value <- gsub("top_", "", k_name)  # names take the form "top_10"
  recall <- metrics$top_k_recall[[k_name]]
  cat("  Top", k_value, ":", round(recall * 100, 2), "%\n")
}
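
## -----------------------------------------------------------------------------
# Cross-check sketch (assumptions flagged): recompute AUC-ROC and top-k recall
# by hand in base R. `anomaly_score` below is an ASSUMED column name for the
# scores returned by score_anomaly(); adjust it to match autoFlagR's output.
scores <- scored_data$anomaly_score
truth  <- data$is_error

# AUC-ROC via the rank (Mann-Whitney) identity: the probability that a
# randomly chosen true anomaly outscores a randomly chosen normal row
n_pos <- sum(truth)
n_neg <- sum(!truth)
auc_manual <- (sum(rank(scores)[truth]) - n_pos * (n_pos + 1) / 2) /
  (n_pos * n_neg)

# Top-k recall: share of all true anomalies among the k highest-scoring rows
top_k_recall_manual <- function(scores, truth, k) {
  top_idx <- order(scores, decreasing = TRUE)[seq_len(k)]
  sum(truth[top_idx]) / sum(truth)
}

cat("Manual AUC-ROC:", round(auc_manual, 4), "\n")
cat("Manual top-10 recall:",
    round(top_k_recall_manual(scores, truth, 10) * 100, 2), "%\n")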

## -----------------------------------------------------------------------------
# Compare Isolation Forest (iforest) with Local Outlier Factor (lof)
methods <- c("iforest", "lof")
results <- list()

for (method in methods) {
  scored <- score_anomaly(
    data,
    method = method,
    contamination = 0.05,
    ground_truth_col = "is_error"
  )
  
  results[[method]] <- extract_benchmark_metrics(scored)
}

# Create comparison table
comparison <- data.frame(
  Method = c("Isolation Forest", "Local Outlier Factor"),
  AUC_ROC = c(results$iforest$auc_roc, results$lof$auc_roc),
  AUC_PR = c(results$iforest$auc_pr, results$lof$auc_pr),
  Top_10_Recall = c(results$iforest$top_k_recall$top_10, results$lof$top_k_recall$top_10)
)

comparison
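
## -----------------------------------------------------------------------------
# Visual summary of the comparison built above; all three metrics lie in
# [0, 1], and higher is better.
barplot(
  t(as.matrix(comparison[, -1])),
  beside = TRUE,
  names.arg = comparison$Method,
  legend.text = c("AUC-ROC", "AUC-PR", "Top-10 recall"),
  ylim = c(0, 1),
  main = "Isolation Forest vs. Local Outlier Factor"
)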

