## ----include = FALSE----------------------------------------------------------
# Global knitr chunk defaults for this document: figure size first, then how
# printed output is rendered (collapsed into the source block, "#>" prefix).
knitr::opts_chunk$set(
  fig.width = 7,
  fig.height = 5,
  comment = "#>",
  collapse = TRUE
)

## -----------------------------------------------------------------------------
library(autoFlagR)
library(dplyr)
library(ggplot2)

## -----------------------------------------------------------------------------
# Seed the RNG so every run simulates the same cohort.
set.seed(123)

# Build a synthetic patient table. The columns are generated in the order
# listed, so the random stream is consumed identically on each run.
n_patients <- 500
healthcare_data <- data.frame(
  patient_id = seq_len(n_patients),
  age = round(rnorm(n_patients, mean = 55, sd = 15)),
  systolic_bp = round(rnorm(n_patients, mean = 120, sd = 15)),
  diastolic_bp = round(rnorm(n_patients, mean = 80, sd = 10)),
  cholesterol = round(rnorm(n_patients, mean = 200, sd = 40)),
  glucose = round(rnorm(n_patients, mean = 100, sd = 20)),
  bmi = round(rnorm(n_patients, mean = 28, sd = 5), digits = 1),
  gender = sample(c("Male", "Female"), size = n_patients, replace = TRUE),
  diagnosis = sample(
    c("Hypertension", "Diabetes", "Normal"),
    size = n_patients,
    replace = TRUE,
    prob = c(0.3, 0.2, 0.5)
  )
)

# Plant known anomalies in the first 25 rows, one variable per group of rows.
# Rows 1-10: impossible ages
healthcare_data[1:10, "age"] <- c(
  250, 180, 200, 190, 185, 175, 170, 165, 160, 155
)
# Rows 11-15: extreme systolic blood pressure
healthcare_data[11:15, "systolic_bp"] <- c(300, 280, 290, 275, 285)
# Rows 16-20: very high cholesterol
healthcare_data[16:20, "cholesterol"] <- c(600, 580, 590, 570, 585)
# Rows 21-25: unrealistically low glucose
healthcare_data[21:25, "glucose"] <- c(5, 3, 4, 2, 6)

# Ground-truth label: TRUE for exactly the 25 rows modified above
# (patient_id runs 1..n_patients, so ids 1-25 are rows 1-25).
healthcare_data$is_anomaly_truth <- healthcare_data$patient_id <= 25

head(healthcare_data)

## -----------------------------------------------------------------------------
# Run the preprocessing step on the simulated data, naming patient_id as an
# identifier column and requesting the "mad" scaling method.
# NOTE(review): "mad" presumably means median-absolute-deviation scaling;
# confirm against the autoFlagR documentation.
prepared <- prep_for_anomaly(
  healthcare_data,
  scale_method = "mad",
  id_cols = "patient_id"
)

# Inspect the structure of the "metadata" attribute attached to the result
prep_metadata <- attr(prepared, "metadata")
str(prep_metadata)

## -----------------------------------------------------------------------------
# Compute anomaly scores with the "iforest" method on the raw simulated data.
# The ground-truth column is passed along so that benchmark metrics can be
# read back from the result later in this script.
scored_data <- score_anomaly(
  healthcare_data,
  id_cols = "patient_id",
  ground_truth_col = "is_anomaly_truth",
  method = "iforest",
  contamination = 0.05
)

# Five-number summary (plus mean) of the resulting scores
summary(scored_data[["anomaly_score"]])

## -----------------------------------------------------------------------------
# Mark the highest-scoring records as anomalies, using the same contamination
# rate (0.05) that was passed to the scoring step.
flagged_data <- flag_top_anomalies(scored_data, contamination = 0.05)

# Report how many records were flagged and what share of the data that is
anomaly_flags <- flagged_data$is_anomaly
cat("Total anomalies flagged:", sum(anomaly_flags), "\n")
cat("Anomaly rate:", mean(anomaly_flags) * 100, "%\n")

## -----------------------------------------------------------------------------
# Histogram of anomaly scores with the flagging cutoff drawn as a dashed red
# line. The cutoff is read from the "anomaly_threshold" attribute of
# flagged_data.
score_cutoff <- attr(flagged_data, "anomaly_threshold")

score_histogram <- ggplot(flagged_data, aes(x = anomaly_score)) +
  geom_histogram(
    bins = 50,
    fill = "steelblue",
    color = "black",
    alpha = 0.7
  ) +
  geom_vline(
    xintercept = score_cutoff,
    color = "red",
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Anomaly Scores",
    x = "Anomaly Score",
    y = "Frequency"
  ) +
  theme_minimal()

# Bare top-level reference so the plot is displayed the same way an
# unassigned ggplot() expression would be.
score_histogram

## -----------------------------------------------------------------------------
# Pull the 10 top-ranked records from the flagged data
top_anomalies <- get_top_anomalies(flagged_data, n = 10)

# Show only the identifier, the variables that had anomalies injected, and
# the score/flag columns
review_cols <- c(
  "patient_id", "age", "systolic_bp", "cholesterol",
  "glucose", "anomaly_score", "is_anomaly"
)
top_anomalies[, review_cols]

## -----------------------------------------------------------------------------
# Print benchmark metrics, but only when the scored data carries a
# "benchmark_metrics" attribute; otherwise this section prints nothing.
benchmark_attr <- attr(scored_data, "benchmark_metrics")
if (!is.null(benchmark_attr)) {
  metrics <- extract_benchmark_metrics(scored_data)
  recall_at_k <- metrics$top_k_recall

  # Ranking quality over the whole score range
  cat("AUC-ROC:", metrics$auc_roc, "\n")
  cat("AUC-PR:", metrics$auc_pr, "\n")

  # Recall entries stored under top_10 and top_50
  cat("Top-10 Recall:", recall_at_k$top_10, "\n")
  cat("Top-50 Recall:", recall_at_k$top_50, "\n")
}

## ----eval=FALSE---------------------------------------------------------------
# # Generate PDF audit report (saves to tempdir() by default)
# generate_audit_report(
#   healthcare_data,
#   filename = "healthcare_audit_report",
#   output_dir = tempdir(),
#   output_format = "pdf",
#   method = "iforest",
#   contamination = 0.05,
#   ground_truth_col = "is_anomaly_truth",
#   id_cols = "patient_id"
# )

