# Install required packages (if not already installed)
if (!require("dunn.test")) install.packages("dunn.test", dependencies = TRUE)
if (!require("PMCMRplus")) install.packages("PMCMRplus", dependencies = TRUE)

library(dunn.test)
library(PMCMRplus)

# === File paths ===
# Use either of the following files:
# For flight dataset:
input_path <- "~/Documents/data_for_statistics_flight.csv"
# For car dataset (uncomment to use this one instead):
# input_path <- "~/Documents/data_for_statistics_car.csv"

# Output file
output_path <- "~/Documents/statistical_results_by_user.csv"

# Load the dataset
df <- read.csv(input_path, stringsAsFactors = FALSE)

# List of continuous variables to analyze
variables <- c("TimeStamp", "CameraRotationX", "CameraRotationY", "CameraRotationZ",
               "GameFps", "PlayerSpeed", "ACC", "EDA", "ECG")

# Fail fast when the chosen file does not have the expected columns.
# Without this check a missing 'User' column makes the per-user loop iterate
# over nothing (the script would then report success with no results), and a
# missing analysis variable only surfaces mid-run as "undefined columns
# selected".
required_columns <- c("User", "DiscomfortLevel", variables)
missing_columns <- setdiff(required_columns, names(df))
if (length(missing_columns) > 0) {
  stop(
    "Input file '", input_path, "' is missing required column(s): ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}

# Ensure 'DiscomfortLevel' is treated as a factor
df$DiscomfortLevel <- as.factor(df$DiscomfortLevel)

# List to store results (one single-row data frame per user/variable pair)
results <- list()

# --- Helpers used by the per-participant loop ---

# Shapiro-Wilk p-value for one group, or NA when the test cannot be run.
# shapiro.test() only accepts between 3 and 5000 non-missing numeric values
# that are not all identical; anything else is reported as "not tested"
# instead of aborting the whole script.
shapiro_p_or_na <- function(x) {
  if (!is.numeric(x)) {
    return(NA_real_)
  }
  x <- x[!is.na(x)]
  n <- length(x)
  if (n < 3 || n > 5000 || length(unique(x)) < 2) {
    return(NA_real_)
  }
  tryCatch(shapiro.test(x)$p.value, error = function(e) NA_real_)
}

# Console label for a Shapiro-Wilk p-value (NA means the test was not run).
format_shapiro <- function(p) {
  if (is.na(p)) "not tested" else sprintf("p = %.4f", p)
}

# Mean rank of group "1" minus mean rank of group "0", computed on the
# complete cases (the same observations kruskal.test() ranks).
# Returns NA when either group is absent.
mean_rank_gap <- function(values, groups) {
  keep <- !is.na(values) & !is.na(groups)
  mean_ranks <- tapply(rank(values[keep]), as.character(groups[keep]), mean)
  if (!all(c("0", "1") %in% names(mean_ranks))) {
    return(NA_real_)
  }
  mean_ranks[["1"]] - mean_ranks[["0"]]
}

# Loop over each participant (rows with a missing User id are ignored)
users <- unique(df$User)
users <- users[!is.na(users)]

for (user in users) {
  # which() drops NA comparisons so no all-NA rows are introduced
  df_user <- df[which(df$User == user), , drop = FALSE]

  for (var in variables) {
    cat(sprintf("\n User: %s | Variable: %s\n", user, var))

    values <- df_user[[var]]
    groups <- df_user$DiscomfortLevel

    # Rank-based tests below are only meaningful for numeric columns
    if (!is.numeric(values)) {
      cat("  Skipped: variable is not numeric.\n")
      next
    }

    # Shapiro–Wilk normality test by group; %in% treats NA levels as FALSE
    p_shapiro_0 <- shapiro_p_or_na(values[groups %in% "0"])
    p_shapiro_1 <- shapiro_p_or_na(values[groups %in% "1"])

    # Kruskal–Wallis test
    formula_kw <- as.formula(paste(var, "~ DiscomfortLevel"))
    kw_result <- tryCatch(
      kruskal.test(formula_kw, data = df_user),
      error = function(e) {
        # Typically: only one discomfort level present for this participant
        cat(sprintf("  Kruskal-Wallis skipped: %s\n", conditionMessage(e)))
        NULL
      }
    )

    if (is.null(kw_result)) {
      next
    }

    # unname() keeps the statistic's label out of the data frame row names
    h <- unname(kw_result$statistic)
    p_kw <- kw_result$p.value

    # isTRUE() guards against NA/NaN p-values (e.g. a constant variable),
    # which would otherwise abort the script inside if ()
    sig_kw <- isTRUE(p_kw < 0.05)
    z_value <- NA_real_
    p_adj <- NA_real_

    if (sig_kw) {
      # NOTE(review): with dunn.test's default altp = FALSE the reported
      # p-values appear to be one-sided (to be compared with alpha / 2);
      # confirm this is the intended reporting convention.
      dunn_result <- tryCatch(
        dunn.test(values, groups, method = "bonferroni", table = FALSE),
        error = function(e) {
          cat(sprintf("  Dunn's test failed: %s\n", conditionMessage(e)))
          NULL
        }
      )
      if (!is.null(dunn_result)) {
        z_value <- dunn_result$Z[1]
        p_adj <- dunn_result$P.adjusted[1]
      }
    }

    # Print results to console
    cat(sprintf("  Shapiro-Wilk Group 0: %s\n", format_shapiro(p_shapiro_0)))
    cat(sprintf("  Shapiro-Wilk Group 1: %s\n", format_shapiro(p_shapiro_1)))
    cat(sprintf("  Kruskal-Wallis: H = %.4f, p = %.4f\n", h, p_kw))
    if (sig_kw) {
      cat(sprintf("  Dunn's Test: Z = %.4f, adjusted p = %.4g\n", z_value, p_adj))
    } else {
      cat("  Not statistically significant. Dunn's test skipped.\n")
    }

    # Interpretation of result.
    # The direction is taken from the groups' mean ranks rather than from the
    # sign of Dunn's Z: dunn.test labels the comparison "0 - 1", so a positive
    # Z indicates group 0 ranking higher, not group 1.
    interpretation <- if (!sig_kw) {
      sprintf("No significant difference for '%s'", var)
    } else if (is.na(z_value)) {
      "Significant difference, but Z not computed"
    } else {
      rank_gap <- mean_rank_gap(values, groups)
      if (is.na(rank_gap)) {
        "Significant difference, but direction not determined"
      } else if (rank_gap > 0) {
        sprintf("Group 1 > Group 0 for '%s'", var)
      } else {
        sprintf("Group 0 > Group 1 for '%s'", var)
      }
    }

    # Store result
    results[[length(results) + 1]] <- data.frame(
      User = user,
      Variable = var,
      Shapiro_p_0 = round(p_shapiro_0, 4),
      Shapiro_p_1 = round(p_shapiro_1, 4),
      Kruskal_H = round(h, 4),
      Kruskal_p = round(p_kw, 4),
      Significant_KW = sig_kw,
      Dunn_Z = round(z_value, 4),
      Dunn_p_adjusted = format.pval(p_adj, digits = 4, eps = .Machine$double.eps),
      Interpretation = interpretation,
      stringsAsFactors = FALSE
    )
  }
}

# Combine all results and write to CSV.
# When every user/variable pair was skipped there is nothing to combine:
# do.call(rbind, list()) is NULL, so warn instead of writing a meaningless
# file and reporting success.
if (length(results) == 0) {
  df_results <- NULL
  warning("No results were produced; nothing written to ", output_path,
          call. = FALSE)
} else {
  df_results <- do.call(rbind, results)
  write.csv(df_results, output_path, row.names = FALSE)

  cat("\nResults file saved to:", output_path, "\n")
}