rpmjp/portfolio
rpmjp/projects/student-management-system/train_model.py
Completed · April to May 2026

AI-powered Student Management System

Production-grade full-stack platform with role-based portals, real-time analytics, and a Random Forest model that predicts academic risk with 96% accuracy.

Java 21 · Jakarta EE · MySQL 8 · Python · Flask · scikit-learn · Tomcat 10
Languages
Java 85.3%
CSS 10.8%
Python 2.8%
Other 1.1%
train_model.py
"""
train_model.py: Random Forest training pipeline for the academic risk model.

Generates synthetic student profiles, labels them with rules that encode what
an experienced advisor would flag manually, trains a 100-tree Random Forest,
evaluates on a held-out test set, prints feature importance, and serializes
the model to disk for the Flask service to load at inference time.

Synthetic data is used here because real anonymized university data wasn't
available. The labeling rules are deterministic on purpose: the model learns
the patterns, then generalizes to combinations the rules don't explicitly
cover.
"""

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
import os

# Generate synthetic training data based on realistic student patterns.
# Seeding NumPy's global generator makes every run draw the same profiles.
np.random.seed(42)
n_students = 500

# NOTE: the draws below consume the seeded random stream in dict order, so
# reordering or inserting a draw changes every column that comes after it.
# np.random.randint excludes its upper bound (e.g. courses_taken is 1..11).
data = {
    'gpa': np.round(np.random.uniform(0.5, 4.0, n_students), 2),
    'courses_taken': np.random.randint(1, 12, n_students),
    'courses_failed': np.random.randint(0, 5, n_students),
    'avg_grade_points': np.round(np.random.uniform(0.5, 4.0, n_students), 2),
    'credits_completed': np.random.randint(3, 60, n_students),
    'semesters_enrolled': np.random.randint(1, 8, n_students),
}

# courses_taken and courses_failed are sampled independently, so a raw draw
# can claim more failures than courses attempted (e.g. 1 taken, 4 failed).
# Cap failures at the number of courses taken so no impossible profile is
# used for training. Capping after the draws leaves every other column
# exactly as the seed produced it.
data['courses_failed'] = np.minimum(data['courses_failed'], data['courses_taken'])

df = pd.DataFrame(data)

# Create the "at_risk" label: 1 when ANY of the rules below fires, else 0.
at_risk_rules = [
    df['gpa'] < 2.0,
    df['courses_failed'] >= 3,
    df['avg_grade_points'] < 1.5,
    (df['gpa'] < 2.5) & (df['courses_failed'] >= 2),
    (df['credits_completed'] < 15) & (df['semesters_enrolled'] >= 4),
]
# One boolean column per rule, OR-ed across each row, stored as 0/1 integers.
df['at_risk'] = pd.concat(at_risk_rules, axis=1).any(axis=1).astype('int64')

# Class-balance summary: count and percentage for each label.
print(f"Dataset: {len(df)} students")
print(f"At risk: {df['at_risk'].sum()} ({df['at_risk'].mean()*100:.1f}%)")
print(f"Not at risk: {(1-df['at_risk']).sum()} ({(1-df['at_risk']).mean()*100:.1f}%)")
print()

# Select the model inputs and the target. The forest is fitted on the
# columns in exactly this order.
features = [
    'gpa',
    'courses_taken',
    'courses_failed',
    'avg_grade_points',
    'credits_completed',
    'semesters_enrolled',
]
X, y = df[features], df['at_risk']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a 100-tree Random Forest. fit() returns the estimator itself, so the
# fitted model can be bound in a single statement.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows and print the per-class classification report.
# Label 0 is reported as 'Not At Risk' and label 1 as 'At Risk'.
y_pred = model.predict(X_test)
print("Classification Report:")
report = classification_report(y_test, y_pred, target_names=['Not At Risk', 'At Risk'])
print(report)

# Print the features from most to least important. Sorting ascending on the
# negated score gives the same order as a descending sort, ties included.
print("Feature Importance:")
ranked = sorted(zip(features, model.feature_importances_), key=lambda pair: -pair[1])
for name, importance in ranked:
    print(f"  {name}: {importance:.4f}")

# Persist the fitted model under ./model (relative to the current working
# directory), creating the directory if it does not exist yet.
# NOTE: pickle files must only be loaded by code that trusts their origin.
os.makedirs('model', exist_ok=True)
serialized_model = pickle.dumps(model)
with open('model/student_risk_model.pkl', 'wb') as model_file:
    model_file.write(serialized_model)

print("\nModel saved to model/student_risk_model.pkl")