--- title: "multivariate_smoothPLS_04" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{multivariate_smoothPLS_04} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup} library(SmoothPLS) library(ggplot2) library(pls) ``` This vignette / notebook show how to mix a one state CFD, a multi-state CFD and a SFD into the same multivariate functional PLS model. # Parameters ```{r} N_states = 3 nind = 50 # number of individuals (train set) start = 0 # First time end = 100 # end time curve_type = 'cat' TTRatio = 0.2 # Train Test Ratio means we have floor(nind*TTRatio/(1-TTRatio)) NotS_ratio = 0.2 # noise variance over total variance for Y beta_0_real=65.4321 # Intercept value for the link between X(t) and Y nbasis = 7 # number of basis functions norder = 4 # 4 for cubic splines basis regul_time = seq(start, end, 1) # regularisation time sequence regul_time_0 = seq(start, end, 1) all_curves_in_pls = TRUE #Keep all or (all - 1) states in PLS method if FALSE. int_mode = 1 # in case of integration errors. ``` # CFD One state data generation ## df_cfd_os ```{r} df_cfd_os = generate_X_df(nind = nind, start = start, end = end, curve_type = 'cat') ``` ```{r} head(df_cfd_os) ``` ## Y_df_cfd_os ```{r} beta_func_cfd_os = beta_5_real_func Y_df_cfd_os = generate_Y_df(df_cfd_os, curve_type = 'cat', beta_real_func_or_list = beta_func_cfd_os, beta_0_real = beta_0_real, NotS_ratio, id_col = 'id', time_col = 'time') Y_cfd_os = Y_df_cfd_os$Y_noised ``` ```{r} plot(regul_time, beta_5_real_func(regul_time)) ``` # CFD Multistates Data generation ## lambda_determination ```{r} # Initialized the lambdas values lambdas = lambda_determination(N_states) lambdas ``` ## tranfer_probabilities ```{r} # Initialized the transition matrix transition_df = transfer_probabilities(N_states) transition_df ``` ## df_cfd ```{r} df_cfd = generate_X_df_multistates(nind = nind, N_states, start, end, lambdas, transition_df) head(df_cfd) ``` ```{r} plot_CFD_individuals(df_cfd) ``` ```{r} plot_CFD_individuals(df_cfd, by_cfda = TRUE) ``` ## beta list ```{r} ##### beta_real ##### ###### beta_0_real ###### beta_0_real ``` ```{r} beta_func_list = beta_list_generation(N_states = N_states) ``` ```{r} for(i in 1:N_states){ plot(0:end, beta_func_list[[i]](0:end, end_time = end), ylab=paste0("Beta(t) n°=", i), type = 'l') title(paste0("Beta(t) n°=", i)) } ``` ## Y evaluation Y generation is based on the following equation : $Y = \beta_0 + \sum_{i=1}^K \int_0^T \beta_i(t) ind_i(t) dt $ with $ind_i(t) = \{0, 1\}_{t \in [0, T]}$ the indicator function of the state $i$. We link $\beta_i$ with the $state_i$. ```{r} df_processed = cat_data_to_indicator(data = df_cfd) ``` ```{r} Y_df_cfd = generate_Y_df(df = df_processed, curve_type = curve_type, beta_real_func_or_list = beta_func_list, beta_0_real = beta_0_real, NotS_ratio = NotS_ratio) Y_cfd = Y_df_cfd$Y_noised names(Y_df_cfd) ``` ```{r} head(Y_df_cfd) ``` # SFD data generation For SFD for X_df two new arguments are important: the noise added to the signal and the seed for repeatability. ## df_sfd ```{r} df_sfd = generate_X_df(nind = nind, start = start, end = end, curve_type = 'num', noise_sd = 0.15, seed = 123) head(df_sfd) ``` ```{r} # Visualisation ggplot(df_sfd, aes(x = time, y = value, group = id, color = factor(id))) + geom_line(alpha = 0.8) + labs(title = "Noised cosinus curves", x = "Time", y = "Value", color = "Individual") + theme_minimal() + theme(legend.position = "none") ``` ## Data manipulation ```{r} df_sfd_regul = regularize_time_series(df_sfd, time_seq = regul_time, curve_type = 'num') df_sfd_regul ``` ```{r} df_sfd_long = convert_to_wide_format(df_sfd_regul) df_sfd_long ``` ## generate_Y_df WARNING! Here we have to regularize X(t) before evaluating Y! ```{r} beta_func_sfd = beta_1_real_func Y_df_sfd = generate_Y_df(df = df_sfd_regul, curve_type = 'num', beta_real_func_or_list = beta_func_sfd, beta_0_real = beta_0_real, NotS_ratio = NotS_ratio, seed = 123) Y_sfd = Y_df_sfd$Y_noised head(Y_df_sfd) ``` # beta_func_list_tot ```{r} beta_func_list_tot = append(beta_list_generation(N_states = N_states), append(beta_func_sfd, beta_func_cfd_os)) ``` ```{r hist Y noised} oldpar <- par(mfrow=c(1, 3)) hist(Y_df_sfd$Y_real) hist(Y_df_sfd$Y_noised) plot(regul_time, beta_4_real_func(regul_time)) par(oldpar) ``` # Y ```{r} Y = Y_cfd + Y_sfd + Y_cfd_os ``` # Test set ## CFD one state ### df_cfd_os_test ```{r} nind_test = floor(nind*TTRatio/(1-TTRatio)) df_cfd_os_test = generate_X_df(nind = nind_test, start = start, end = end, curve_type = 'cat') ``` ### Y_df_cfd_os_test ```{r} Y_df_cfd_os_test = generate_Y_df(df_cfd_os_test, curve_type = 'cat', beta_real_func_or_list = beta_5_real_func, beta_0_real = beta_0_real, NotS_ratio = NotS_ratio) Y_cfd_os_test = Y_df_cfd_os_test$Y_noised ``` ## CFD multistates ### df_cfd_test ```{r} nind_test = floor(nind*TTRatio/(1-TTRatio)) df_cfd_test = generate_X_df_multistates(nind = nind_test, N_states, start, end, lambdas, transition_df) ``` ### Y_df_cfd_test ```{r} df_cfd_test_processed = cat_data_to_indicator(df_cfd_test) Y_df_cfd_test = generate_Y_df(df_cfd_test_processed, curve_type = 'cat', beta_real_func_or_list = beta_func_list, beta_0_real = beta_0_real, NotS_ratio= NotS_ratio) Y_cfd_test = Y_df_cfd_test$Y_noised ``` ## SFD ### df_sfd_test ```{r} df_sfd_test = generate_X_df(nind = nind_test, start = start, end = end, curve_type = 'num', noise_sd = 0.15, seed = 123) ``` ### Y_df_cfd_test ```{r} df_sfd_regul_test = regularize_time_series(df_sfd_test, time_seq = regul_time, curve_type = 'num') Y_df_sfd_test = generate_Y_df(df = df_sfd_regul_test, curve_type = 'num', beta_real_func_or_list = beta_func_sfd, beta_0_real = beta_0_real, NotS_ratio = NotS_ratio, seed = 123) Y_sfd_test = Y_df_sfd_test$Y_noised ``` # Basis creation All the states will share the same basis. ```{r} basis = create_bspline_basis(start, end, nbasis, norder) #basis = fda::create.fourier.basis(c(start,end), nbasis = nbasis) # All the states will share the same basis. basis_list = obj_list_creation(N_rep = N_states, obj = basis) plot(basis, main=paste0(nbasis, " ", basis$type," functions basis")) ``` # SmoothPLS ## Multivariate ```{r} spls_obj = smoothPLS(df_list = list(df_cfd, df_sfd, df_cfd_os), Y = Y, basis_obj = basis, regul_time_obj = regul_time, curve_type_obj = list('cat', 'num', 'cat'), orth_obj = c(TRUE, TRUE, TRUE), id_col_obj = 'id', time_col_obj = 'time', print_steps = TRUE, plot_rmsep = TRUE, print_nbComp = TRUE, plot_reg_curves = FALSE) for(k in 1:(length(spls_obj$reg_obj)-1)){ y_lim = eval_max_min_y(f_list = list(beta_func_list_tot[[k]], spls_obj$reg_obj[[k+1]]), regul_time = regul_time_0) plot(regul_time_0, beta_func_list_tot[[k]](regul_time_0), ylim = y_lim) plot(spls_obj$reg_obj[[k+1]], add = TRUE, col = 'blue') title(paste0("Delta - ", names(spls_obj$reg_obj)[k+1])) } ``` ```{r} basis_2 = create_bspline_basis(start, end, (2*nbasis), norder) spls_obj_0 = smoothPLS(df_list = list(df_cfd, df_sfd, df_cfd_os), Y = Y_cfd + Y_sfd, basis_obj = list(basis_2, basis, basis), regul_time_obj = start:end, curve_type_obj = list('cat', 'num', 'cat'), orth_obj = c(TRUE, FALSE, TRUE), id_col_obj = 'id', time_col_obj = 'time', plot_reg_curves = FALSE) for(k in 1:(length(spls_obj_0$reg_obj)-1)){ y_lim = eval_max_min_y(f_list = list(beta_func_list_tot[[k]], spls_obj_0$reg_obj[[k+1]]), regul_time = regul_time_0) plot(regul_time_0, beta_func_list_tot[[k]](regul_time_0), ylim = y_lim) plot(spls_obj_0$reg_obj[[k+1]], add = TRUE, col = 'blue') title(paste0("Delta - ", names(spls_obj_0$reg_obj)[k+1])) } ``` ### Predictions Multivariate ```{r} Y_hat = smoothPLS_predict(df_predict_list = list(df_cfd, df_sfd, df_cfd_os), delta_list = spls_obj_0$reg_obj, curve_type_obj = list('cat', 'num', 'cat'), id_col_obj = 'id', time_col_obj = 'time', regul_time_obj = 0:100, int_mode = 1, nb_pt = 10, subdivision = 100) hist(Y_hat) ``` ## only SFD ```{r} spls_obj_2 = smoothPLS(df_list = df_sfd, Y = Y_df_sfd$Y_noised, basis_obj = basis, regul_time_obj = start:end, curve_type_obj = 'num', orth_obj = c(TRUE), id_col_obj = 'id', time_col_obj = 'time') for(k in 1:(length(spls_obj_2$reg_obj)-1)){ y_lim = eval_max_min_y(f_list = list(beta_func_sfd, spls_obj_2$reg_obj[[k+1]]), regul_time = regul_time_0) plot(regul_time_0, beta_func_sfd(regul_time_0), ylim = y_lim) plot(spls_obj_2$reg_obj[[k+1]], add = TRUE, col = 'blue') title(paste0("delta - ", names(spls_obj_2$reg_obj)[k+1])) } ``` ### Prediction SFD ```{r} Y_hat_2 = smoothPLS_predict(df_predict_list = df_sfd, delta_list = spls_obj_2$reg_obj, curve_type_obj = 'num', id_col_obj = 'id', time_col_obj = 'time', regul_time_obj = regul_time, int_mode = 1, nb_pt = 10, subdivision = 100) hist(Y_hat_2) ``` ## only CFD ```{r} spls_obj_3 = smoothPLS(df_list = df_cfd, Y = Y_df_cfd$Y_noised, basis_obj = basis, regul_time_obj = regul_time, curve_type_obj = 'cat', orth_obj = c(TRUE), id_col_obj = 'id', time_col_obj = 'time') for(k in 1:(length(spls_obj_3$reg_obj)-1)){ y_lim = eval_max_min_y(f_list = list(beta_func_list_tot[[k]], spls_obj_3$reg_obj[[k+1]]), regul_time = regul_time_0) plot(regul_time_0, beta_func_list_tot[[k]](regul_time_0), ylim = y_lim) plot(spls_obj_3$reg_obj[[k+1]], add = TRUE, col = 'blue') title(paste0("delta - ", names(spls_obj_3$reg_obj)[k+1])) } ``` ### Prediction CFD ```{r} Y_hat_3 = smoothPLS_predict(df_predict_list = df_sfd, delta_list = spls_obj_3$reg_obj, curve_type_obj = 'num', id_col_obj = 'id', time_col_obj = 'time', regul_time_obj = 0:100, int_mode = 1, nb_pt = 10, subdivision = 100) hist(Y_hat_3) ``` ## only CFD one state ```{r} spls_obj_4 = smoothPLS(df_list = df_cfd_os, Y = Y_df_cfd_os$Y_noised, basis_obj = basis, regul_time_obj = regul_time, curve_type_obj = 'cat', orth_obj = c(TRUE), id_col_obj = 'id', time_col_obj = 'time') for(k in 1:(length(spls_obj_4$reg_obj)-1)){ y_lim = eval_max_min_y(f_list = list(beta_func_cfd_os, spls_obj_4$reg_obj[[k+1]]), regul_time = regul_time_0) plot(regul_time_0, beta_func_cfd_os(regul_time_0), ylim = y_lim) plot(spls_obj_4$reg_obj[[k+1]], add = TRUE, col = 'blue') title(paste0("delta - ", names(spls_obj_4$reg_obj)[k+1])) } ``` ### Prediction CFD ```{r} Y_hat_4 = smoothPLS_predict(df_predict_list = df_sfd, delta_list = spls_obj_4$reg_obj, curve_type_obj = 'num', id_col_obj = 'id', time_col_obj = 'time', regul_time_obj = 0:100, int_mode = 1, nb_pt = 10, subdivision = 100) hist(Y_hat_4) ``` # Global PLS functions ## Naive PLS ```{r} naive_pls_obj = naivePLS(df_list = list(df_cfd, df_sfd, df_cfd_os), Y = Y, regul_time_obj = regul_time, curve_type_obj = list('cat', 'num', 'cat'), id_col_obj = 'id', time_col_obj = 'time', print_steps = TRUE, plot_rmsep = TRUE, print_nbComp = TRUE, plot_reg_curves = TRUE) ``` ## Functional PLS ```{r} fpls_obj = funcPLS(df_list = list(df_cfd, df_sfd, df_cfd_os), Y = Y, basis_obj = basis, curve_type_obj = list('cat', 'num', 'cat'), regul_time_obj = regul_time, id_col_obj = 'id', time_col_obj = 'time', print_steps = TRUE, plot_rmsep = TRUE, print_nbComp = TRUE, plot_reg_curves = TRUE) ``` ## Smooth PLS ```{r} spls_obj = smoothPLS(df_list = list(df_cfd, df_sfd, df_cfd_os), Y = Y, basis_obj = basis, int_mode = 1, regul_time_obj = regul_time, curve_type_obj = list('cat', 'num', 'cat'), orth_obj = c(TRUE, TRUE, TRUE), id_col_obj = 'id', time_col_obj = 'time', print_steps = TRUE, plot_rmsep = TRUE, print_nbComp = TRUE, plot_reg_curves = TRUE) ``` # Curves comparison ```{r} # Warning ms_spls_obj$delta_ms_list[[1]] is the intercept! cat("curve_1 : smooth PLS regression curve.\n") cat("curve_2 : functional PLS regression curve.\n") cat("curve_3 : naive PLS regression coefficients\n") N_curves = N_states + 2 # CFD multistate + sfd + cfd_one_state for(i in 1:N_curves){ start = 0 print(paste0("Curve_", i, " : ", names(spls_obj$reg_obj)[i+1])) evaluate_curves_distances(real_f = beta_func_list_tot[[i]], regul_time = regul_time, fun_fd_list = list(spls_obj$reg_obj[[i+1]], fpls_obj$reg_obj[[i+1]], approxfun( x = regul_time, y = naive_pls_obj$opti_reg_coef[ start:(start+length(regul_time) )]) ) ) start = start + length(regul_time) } ``` ```{r} for(i in 1:N_curves){ start = 0 y_lim = eval_max_min_y(f_list = list(spls_obj$reg_ob[[i+1]], fpls_obj$reg_ob[[i+1]], approxfun( x = regul_time, y = naive_pls_obj$opti_reg_coef[ start:(start+length(regul_time))]), beta_func_list_tot[[i]] ), regul_time = regul_time_0) plot(regul_time_0, beta_func_list_tot[[i]](regul_time_0), col = 'black', ylim = y_lim, xlab = 'Time', ylab = 'Value', type = 'l') lines(regul_time_0, approxfun(x = regul_time, y = naive_pls_obj$opti_reg_coef[ start:(start+ length(regul_time))])(regul_time_0), col = 'green') title(paste0(names(spls_obj$reg_obj)[i+1], " regression curves")) plot(spls_obj$reg_obj[[i+1]], col = 'blue', add = TRUE) plot(fpls_obj$reg_obj[[i+1]], col = 'red', add = TRUE) legend("topleft", legend = c("Real curve", "NaivePLS coef", "SmoothPLS reg curve", "FunctionalPLS reg curve"), col = c("black", "green", "blue", "red"), lty = 1, lwd = 1) start = start + length(regul_time) } ``` # Results ```{r} train_results = data.frame(matrix(ncol = 5, nrow = 3)) colnames(train_results) = c("PRESS", "RMSE", "MAE", "R2", "var_Y") rownames(train_results) = c("NaivePLS", "FPLS", "SmoothPLS") test_results = train_results ``` ```{r} print(paste0("There is ", 100*NotS_ratio, "% of noised in Y")) ``` ## Train set ```{r} Y_train = Y # Naive Y_hat = predict(naive_pls_obj$plsr_model, ncomp = naive_pls_obj$nbCP_opti, newdata = naive_pls_obj$plsr_model$model$`as.matrix(df_mod_wide)`) train_results["NaivePLS", ] = evaluate_results(Y_train, Y_hat) # FPLS Y_hat_fpls = (predict(fpls_obj$plsr_model, ncomp = fpls_obj$nbCP_opti, newdata = fpls_obj$trans_alphas) + fpls_obj$reg_obj$Intercept + mean(Y)) Y_hat_fpls = smoothPLS_predict(df_predict_list = list(df_cfd, df_sfd, df_cfd_os), delta_list = fpls_obj$reg_obj, curve_type_obj = list('cat', 'num', 'cat'), int_mode = int_mode, regul_time_obj = regul_time) train_results["FPLS", ] = evaluate_results(Y_train, Y_hat_fpls) # Smooth PLS Y_hat_spls = smoothPLS_predict(df_predict_list = list(df_cfd, df_sfd, df_cfd_os), delta_list = spls_obj$reg_obj, curve_type_obj = list('cat', 'num', 'cat'), int_mode = int_mode, regul_time_obj = regul_time) train_results["SmoothPLS", ] = evaluate_results(Y_train, Y_hat_spls) train_results["NaivePLS", "nb_cp"] = naive_pls_obj$nbCP_opti train_results["FPLS", "nb_cp"] = fpls_obj$nbCP_opti train_results["SmoothPLS", "nb_cp"] = spls_obj$nbCP_opti ``` ```{r} train_results ``` ## Test set ```{r} Y_test = Y_cfd_test + Y_sfd_test + Y_cfd_os_test # Naive df_test_wide = naivePLS_formatting(df_list = list(df_cfd_test, df_sfd_test, df_cfd_os_test), regul_time_obj = regul_time, curve_type_obj = list('cat', 'num', 'cat'), id_col_obj = 'id', time_col_obj = 'time') Y_hat = predict(naive_pls_obj$plsr_model, ncomp = naive_pls_obj$nbCP_opti, newdata = as.matrix(df_test_wide)) test_results["NaivePLS", ] = evaluate_results(Y_test, Y_hat) # FPLS Y_hat_fpls = smoothPLS_predict(df_predict_list = list(df_cfd_test, df_sfd_test, df_cfd_os_test), delta_list = fpls_obj$reg_obj, curve_type_obj = list('cat', 'num', 'cat'), int_mode = int_mode, regul_time_obj = regul_time) test_results["FPLS", ] = evaluate_results(Y_test, Y_hat_fpls) # Smooth PLS Y_hat_spls = smoothPLS_predict(df_predict_list = list(df_cfd_test, df_sfd_test, df_cfd_os_test), delta_list = spls_obj$reg_obj, curve_type_obj = list('cat', 'num', 'cat'), int_mode = int_mode, regul_time_obj = regul_time) test_results["SmoothPLS", ] = evaluate_results(Y_test, Y_hat_spls) test_results["NaivePLS", "nb_cp"] = naive_pls_obj$nbCP_opti test_results["FPLS", "nb_cp"] = fpls_obj$nbCP_opti test_results["SmoothPLS", "nb_cp"] = spls_obj$nbCP_opti ``` ```{r} test_results ``` ## Plot results ```{r} train_results test_results ``` ```{r} plot_model_metrics_base(train_results, test_results) ``` ```{r} plot_model_metrics_base(train_results, test_results, models_to_plot = c('FPLS', 'SmoothPLS')) ```