@article{uninimx6067,
  title    = {Forecasting of Post-Graduate Students' Late Dropout Based on the Optimal Probability Threshold Adjustment Technique for Imbalanced Data},
  author   = {Carmen Lil{\'i} Rodr{\'i}guez Velasco and Eduardo Garc{\'i}a Villena and Juli{\'e}n Brito Ballester and Frigdiano {\'A}lvaro Dur{\'a}ntez Prados and Eduardo Ren{\'e} Silva Alvarado and Jorge Crespo {\'A}lvarez},
  journal  = {International Journal of Emerging Technologies in Learning (iJET)},
  year     = {2023},
  volume   = {18},
  number   = {04},
  pages    = {120--155},
  keywords = {optimal likelihood threshold, imbalanced data, student dropout prediction, resample techniques, distance learning courses},
  url      = {http://repositorio.unini.edu.mx/id/eprint/6067/},
  abstract = {The purpose of this research article was to contrast the benefits of the optimal probability threshold adjustment technique with other imbalanced-data processing techniques, as applied to the prediction of post-graduate students' late dropout from distance learning courses at two universities in the Ibero-American space. In this context, the optimization of the Logistic Regression, Random Forest, and Neural Network classifiers, together with different techniques, attributes, and algorithms (hyperparameters, SMOTE, SMOTE\_SVM, and ADASYN), produced a set of metrics for decision-making that prioritized the reduction of false negatives. The best model was the Neural Network in combination with SMOTE\_SVM, obtaining a recall of 0.75 and an F1-score of 0.60. Likewise, the robustness of the Random Forest classifier on imbalanced data was demonstrated by achieving, with an optimal threshold of 0.427, metrics very similar to those obtained by the consensus of the three best models found. This shows that, for Random Forest, the optimal prediction probability threshold is an excellent alternative to resampling techniques with different optimal thresholds. Finally, it is hoped that this research paper will contribute to boosting the application of this simple but powerful technique, which is highly underrated relative to data resampling techniques for imbalanced data.}
}
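
Note: the abstract refers to adjusting the classification probability threshold as an alternative to resampling for imbalanced data. The snippet below is a minimal sketch of that general idea, not the paper's own pipeline: it fits a Random Forest on synthetic imbalanced data and sweeps candidate thresholds on a validation set, selecting the one that maximizes F1. The dataset, classifier settings, and the F1-based selection rule are assumptions for illustration only.

# Minimal sketch (not from the paper): tuning the decision threshold for an
# imbalanced binary problem instead of resampling the training data.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import train_test_split

# Synthetic imbalanced data standing in for the dropout records (assumption).
X, y = make_classification(n_samples=2000, n_features=20, weights=[0.9, 0.1],
                           random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y,
                                                  test_size=0.3, random_state=0)

clf = RandomForestClassifier(n_estimators=300, random_state=0).fit(X_train, y_train)
proba = clf.predict_proba(X_val)[:, 1]  # probability of the positive (dropout) class

# Sweep candidate thresholds instead of using the default 0.5 cut-off.
thresholds = np.linspace(0.05, 0.95, 181)
f1_scores = [f1_score(y_val, proba >= t) for t in thresholds]
best_t = thresholds[int(np.argmax(f1_scores))]

print(f"optimal threshold: {best_t:.3f}")
print(f"recall at optimum: {recall_score(y_val, proba >= best_t):.2f}")
print(f"F1 at optimum:     {max(f1_scores):.2f}")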