## ----setup, include = FALSE---------------------------------------------------
# Default knitr chunk options for every chunk in this document: `collapse`
# and `comment = "#>"` control how printed output is rendered next to the
# code, and fig.width / fig.height set the default figure size.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
# Fix the RNG seed so the sample()/rnorm()/rpois() draws used to build the
# example data in later chunks are repeatable from run to run.
set.seed(123)

## ----install, eval=FALSE------------------------------------------------------
# # Install from CRAN (when available)
# install.packages("tidydp")
# 
# # Or install development version from GitHub
# devtools::install_github("ttarler/tidydp")

## ----library------------------------------------------------------------------
library(tidydp)

## ----basic_noise--------------------------------------------------------------
# Toy employee table: five people, each with an age and a salary
# (salaries are written in thousands and scaled up)
employee_data <- data.frame(
  name = c("Alice", "Bob", "Charlie", "Diana", "Eve"),
  age = c(28, 35, 42, 31, 38),
  salary = 1000 * c(65, 75, 85, 70, 80)
)

# Show the raw, non-private rows
head(employee_data, n = 6L)

# Add differential privacy noise to the two numeric columns, declaring a
# lower and upper bound for each column by name
private_data <- dp_add_noise(
  employee_data,
  columns = c("age", "salary"),
  epsilon = 0.5,
  lower = c(age = 22, salary = 50000),
  upper = c(age = 65, salary = 150000)
)

# Show the privatized rows
head(private_data, n = 6L)

## ----counting-----------------------------------------------------------------
# Simulated records: 150 / 120 / 80 rows for the three cities, each row
# tagged with a category drawn at random (with replacement) from A, B, C
city_data <- data.frame(
  city = rep(
    c("New York", "Los Angeles", "Chicago"),
    times = c(150, 120, 80)
  ),
  category = sample(LETTERS[1:3], size = 350, replace = TRUE)
)

# Private count over the whole table
overall_count <- dp_count(city_data, epsilon = 0.1)
print(overall_count)

# Private count within each city
city_counts <- dp_count(city_data, epsilon = 0.1, group_by = "city")
print(city_counts)

# Private count within each (city, category) combination
city_category_counts <- dp_count(
  city_data,
  epsilon = 0.1,
  group_by = c("city", "category")
)
head(city_category_counts, n = 6L)

## ----mean---------------------------------------------------------------------
# Simulated incomes: 100 normal draws per region, each region with its own
# mean and standard deviation. The draws are made region by region in the
# order North, South, East, West.
income_data <- data.frame(
  region = rep(c("North", "South", "East", "West"), each = 100),
  income = unlist(Map(
    function(mu, sigma) rnorm(100, mean = mu, sd = sigma),
    c(60000, 55000, 65000, 58000),
    c(15000, 12000, 18000, 14000)
  ))
)

# Private mean of income over every row, with declared bounds of
# 20000 and 150000
avg_income <- dp_mean(
  income_data,
  "income",
  epsilon = 0.2,
  lower = 20000,
  upper = 150000
)
print(avg_income)

# Same query, computed separately for each region
regional_avg <- dp_mean(
  income_data,
  "income",
  epsilon = 0.2,
  lower = 20000,
  upper = 150000,
  group_by = "region"
)
print(regional_avg)

## ----sum----------------------------------------------------------------------
# Simulated sales: 50 Poisson draws per store, with a different rate for
# each store. The draws are made store by store in the order A, B, C.
sales_data <- data.frame(
  store = rep(paste("Store", c("A", "B", "C")), each = 50),
  sales = unlist(lapply(
    c(1000, 1200, 900),
    function(rate) rpois(50, lambda = rate)
  ))
)

# Private sum of sales for each store, with declared bounds of 0 and 5000
store_totals <- dp_sum(
  sales_data,
  "sales",
  epsilon = 0.3,
  lower = 0,
  upper = 5000,
  group_by = "store"
)
print(store_totals)

## ----budget-------------------------------------------------------------------
# Create a privacy budget with a total epsilon of 1.0 and a total delta
# of 1e-5
budget <- new_privacy_budget(
  epsilon_total = 1.0,
  delta_total = 1e-5
)

# Show the budget before any query has been charged to it
print(budget)

# Perform first query, requesting epsilon = 0.3 against the budget
# NOTE(review): `budget` is never reassigned in this chunk, so the printouts
# and checks below only reflect spending if dp_count() updates the object
# passed as `.budget` in place -- confirm against the package implementation.
result1 <- city_data %>%
  dp_count(epsilon = 0.3, .budget = budget)

print(budget)

# Perform second query, requesting a further epsilon = 0.4
result2 <- city_data %>%
  dp_count(epsilon = 0.4, group_by = "city", .budget = budget)

print(budget)

# Check if we have enough budget for another query.
# 0.3 + 0.4 = 0.7 of the 1.0 total has been requested so far, which would
# leave 0.3, so a query needing epsilon = 0.5 should not fit...
can_query <- check_privacy_budget(budget, epsilon_required = 0.5)
print(paste("Can perform query with epsilon=0.5?", can_query))

# ...while a query needing only epsilon = 0.2 should.
can_query <- check_privacy_budget(budget, epsilon_required = 0.2)
print(paste("Can perform query with epsilon=0.2?", can_query))

## ----bounds_comparison--------------------------------------------------------
# How the declared bounds affect utility: the same five ages are privatized
# twice with the same epsilon, changing only the declared range
test_data <- data.frame(age = seq(25, 45, by = 5))

# Narrow declared range (the more accurate case)
tight_bounds <- dp_add_noise(
  test_data,
  columns = "age",
  epsilon = 0.5,
  lower = c(age = 20),
  upper = c(age = 50)
)

# Wide declared range (the less accurate case)
loose_bounds <- dp_add_noise(
  test_data,
  columns = "age",
  epsilon = 0.5,
  lower = c(age = 0),
  upper = c(age = 100)
)

# Original values next to both noisy versions, rounded to one decimal
data.frame(
  Original = test_data[["age"]],
  Tight_Bounds = round(tight_bounds[["age"]], digits = 1),
  Loose_Bounds = round(loose_bounds[["age"]], digits = 1)
)

## ----mechanism_comparison-----------------------------------------------------
# Privatize the same five values with each mechanism, using the same
# epsilon and bounds, and show the results next to each other
test_values <- data.frame(value = seq(100, 500, by = 100))

# Laplace mechanism
laplace_result <- dp_add_noise(
  test_values,
  columns = "value",
  epsilon = 0.5,
  lower = c(value = 0),
  upper = c(value = 1000),
  mechanism = "laplace"
)

# Gaussian mechanism (this call also supplies a delta)
gaussian_result <- dp_add_noise(
  test_values,
  columns = "value",
  epsilon = 0.5,
  delta = 1e-5,
  lower = c(value = 0),
  upper = c(value = 1000),
  mechanism = "gaussian"
)

# Original values next to both noisy versions, rounded to one decimal
data.frame(
  Original = test_values[["value"]],
  Laplace = round(laplace_result[["value"]], digits = 1),
  Gaussian = round(gaussian_result[["value"]], digits = 1)
)

## ----complete_example---------------------------------------------------------
# Simulated employees: 25 per department. Salaries are normal draws with a
# department-specific mean and standard deviation; years of experience are
# Poisson draws with a department-specific rate. All salary draws are made
# before any experience draw, department by department.
employees <- data.frame(
  department = rep(c("Engineering", "Sales", "Marketing", "HR"), each = 25),
  salary = unlist(Map(
    function(mu, sigma) rnorm(25, mu, sigma),
    c(85000, 70000, 65000, 60000),  # Engineering, Sales, Marketing, HR
    c(15000, 12000, 10000, 8000)
  )),
  years_experience = unlist(lapply(
    c(5, 4, 3, 4),                  # Engineering, Sales, Marketing, HR
    function(rate) rpois(25, rate)
  ))
)

# Clamp both columns to a realistic range: salary to [40000, 150000] and
# experience to [0, 20]
employees[["salary"]] <- pmin(pmax(employees[["salary"]], 40000), 150000)
employees[["years_experience"]] <- pmin(
  pmax(employees[["years_experience"]], 0),
  20
)

# Initialize privacy budget
# The three queries below request 0.5 + 0.8 + 0.4 = 1.7 epsilon in total,
# which is within this 2.0 allowance.
analysis_budget <- new_privacy_budget(epsilon_total = 2.0)

# Query 1: Count by department (epsilon = 0.5)
dept_counts <- employees %>%
  dp_count(
    epsilon = 0.5,
    group_by = "department",
    .budget = analysis_budget
  )

cat("\nEmployee counts by department:\n")
print(dept_counts)

# Query 2: Average salary by department (epsilon = 0.8)
# The bounds match the range the salary column was clamped to when the
# data set was built.
dept_salaries <- employees %>%
  dp_mean(
    "salary",
    epsilon = 0.8,
    lower = 40000,
    upper = 150000,
    group_by = "department",
    .budget = analysis_budget
  )

cat("\nAverage salaries by department:\n")
print(dept_salaries)

# Query 3: Average experience (epsilon = 0.4)
# Ungrouped, so this is a single mean over all employees; the bounds match
# the range the years_experience column was clamped to.
avg_experience <- employees %>%
  dp_mean(
    "years_experience",
    epsilon = 0.4,
    lower = 0,
    upper = 20,
    .budget = analysis_budget
  )

cat("\nAverage years of experience:\n")
print(avg_experience)

# Check remaining budget
# NOTE(review): `analysis_budget` is never reassigned, so this printout only
# reflects the three queries if dp_count()/dp_mean() update the object passed
# as `.budget` in place -- confirm against the package implementation.
cat("\nFinal budget status:\n")
print(analysis_budget)

## ----pitfall1, eval=FALSE-----------------------------------------------------
# # BAD: Running same query multiple times
# for (i in 1:10) {
#   result <- data %>% dp_count(epsilon = 0.1)
# }
# # Total cost: 10 * 0.1 = 1.0 epsilon!

## ----pitfall2, eval=FALSE-----------------------------------------------------
# # BETTER: Provide explicit bounds
# result <- data %>%
#   dp_mean("income", epsilon = 0.5, lower = 0, upper = 200000)
# 
# # WORSE: Let algorithm infer bounds from data
# result <- data %>%
#   dp_mean("income", epsilon = 0.5)

## ----pitfall3-----------------------------------------------------------------
# Pitfall: an epsilon this large gives only very weak privacy
weak_privacy <- dp_add_noise(
  test_values,
  columns = "value",
  epsilon = 50,  # Too large!
  lower = c(value = 0),
  upper = c(value = 1000)
)

# Original values, noisy values, and the absolute gap between them
# (rounded to one decimal), to show how little noise was added
data.frame(
  Original = test_values[["value"]],
  With_Noise = round(weak_privacy[["value"]], digits = 1),
  Difference = round(
    abs(test_values[["value"]] - weak_privacy[["value"]]),
    digits = 1
  )
)

