packages = ["pandas", "numpy", "scikit-learn"]

# PyScript initialization and CSV parsing
import pandas as pd
import numpy as np
from io import StringIO
import json
import js

print("PyScript starting...")
print("✓ Basic libraries imported")

# Global storage for trained models
trained_models_storage = {}

def list_stored_models():
    """Debug function to list all stored models"""
    print("📋 Stored models in trained_models_storage:")
    for key, model_info in trained_models_storage.items():
        print(f" • {key}: {model_info['algorithm']} (job: {model_info['job_id']})")
    return list(trained_models_storage.keys())

# Test DataFrame creation
test_df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
print(f"✓ Test DataFrame created: {test_df.shape}")

# CSV parsing function
def parse_csv_with_pyscript(csv_content, file_name, use_first_row_as_headers=True):
    print(f"PyScript parsing CSV: {file_name}, use_headers: {use_first_row_as_headers}")
    try:
        # Parse CSV
        csv_data = StringIO(csv_content)
        if use_first_row_as_headers:
            df = pd.read_csv(csv_data)
        else:
            # Parse without headers, first row becomes data
            df = pd.read_csv(csv_data, header=None)
            # Generate column names
            df.columns = [f'Column{i+1}' for i in range(len(df.columns))]

        print(f"DataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")

        # Basic validation
        if df.empty:
            raise ValueError("CSV file appears to be empty")

        # Get original data types before cleaning
        original_dtypes = df.dtypes.to_dict()

        # Detect numeric columns more intelligently
        numeric_columns = []
        for col in df.columns:
            if pd.api.types.is_numeric_dtype(df[col]):
                numeric_columns.append(col)
            else:
                # Check if it could be numeric but has string NaN values
                non_null_values = df[col].dropna()
                if len(non_null_values) > 0:
                    try:
                        pd.to_numeric(non_null_values, errors='raise')
                        numeric_columns.append(col)
                    except (ValueError, TypeError):
                        pass

        # Convert dtypes to JSON-serializable format with proper detection
        column_info = {}
        for col in df.columns:
            if col in numeric_columns:
                # Check if it's integer or float
                non_null_values = pd.to_numeric(df[col], errors='coerce').dropna()
                if len(non_null_values) > 0:
                    if non_null_values.equals(non_null_values.astype(int)):
                        column_info[col] = 'int64'
                    else:
                        column_info[col] = 'float64'
                else:
                    column_info[col] = 'float64'
            else:
                column_info[col] = 'object'

        # Clean DataFrame for preview (replace NaN with empty strings only for display)
        df_for_preview = df.fillna('')

        # Get column information
        columns = df.columns.tolist()
        print(f"Columns: {columns}")
        print(f"Detected types: {column_info}")

        # Get preview data (first 5 rows) with NaN handling
        preview_data = df_for_preview.head().to_dict('records')
        print(f"Preview rows: {len(preview_data)}")

        # Return column information to JavaScript
        column_data = {
            'columns': columns,
            'dtypes': column_info,  # Use our improved detection
            'shape': list(df.shape),  # Convert numpy array to list
            'filename': file_name,
            'preview': preview_data,
            'success': True,
            'parser': 'pandas'
        }

        # Store the original DataFrame globally for training
        globals()['current_dataframe'] = df
        globals()['current_filename'] = file_name

        # Convert to JSON string with proper handling
        json_data = json.dumps(column_data, default=str)
        js.handleParsedData(json_data)
        print(f"✓ Successfully parsed CSV: {df.shape[0]} rows, {df.shape[1]} columns")

    except Exception as e:
        print(f"✗ PyScript CSV parsing error: {str(e)}")
        error_data = {
            'success': False,
            'error': str(e),
            'filename': file_name,
            'parser': 'pandas'
        }
        json_data = json.dumps(error_data, default=str)
        js.handleParsedData(json_data)
        print(f"✗ CSV parsing failed: {e}")

# Function to update DataFrame with custom headers
def update_dataframe_headers(custom_headers):
    """Update the stored DataFrame with custom column headers"""
    if 'current_dataframe' in globals():
        df = globals()['current_dataframe']
        if len(custom_headers) == len(df.columns):
            print(f"Updating DataFrame headers from {list(df.columns)} to {custom_headers}")
            df.columns = custom_headers
            globals()['current_dataframe'] = df
            print("✓ DataFrame headers updated successfully")
            return True
        else:
            print(f"✗ Header count mismatch: {len(custom_headers)} vs {len(df.columns)}")
            return False
    return False

# Make function available to JavaScript
js.window.parse_csv_with_pyscript = parse_csv_with_pyscript
js.window.update_dataframe_headers = update_dataframe_headers

# Function to filter DataFrame by selected columns
def filter_dataframe_columns(selected_columns):
    """Filter the stored DataFrame to only include selected columns"""
    if 'current_dataframe' in globals():
        df = globals()['current_dataframe']
        if selected_columns and len(selected_columns) > 0:
            # Verify all selected columns exist
            missing_columns = [col for col in selected_columns if col not in df.columns]
            if missing_columns:
                print(f"✗ Selected columns not found: {missing_columns}")
                return False
            # Filter DataFrame
            filtered_df = df[selected_columns]
            globals()['current_dataframe'] = filtered_df
            print(f"✓ DataFrame filtered from {df.shape[1]} to {filtered_df.shape[1]} columns")
            print(f" Selected columns: {selected_columns}")
            return True
        else:
            print("✗ No columns selected")
            return False
    print("✗ No DataFrame available")
    return False

js.window.filter_dataframe_columns = filter_dataframe_columns

# Function to get unique values count for a column
def get_column_unique_count(column_name):
    """Get the number of unique values in a column"""
    if 'current_dataframe' in globals():
        df = globals()['current_dataframe']
        if column_name in df.columns:
            unique_count = len(df[column_name].dropna().unique())
            print(f"Column '{column_name}' has {unique_count} unique values")
            return unique_count
        else:
            print(f"Column '{column_name}' not found in DataFrame")
            return 0
    print("No DataFrame available")
    return 0

js.window.get_column_unique_count = get_column_unique_count

# Training function for ML models - Full implementation with all metrics
def train_ml_models_pyscript(job_data_json, data_info):
    from datetime import datetime
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.metrics import roc_auc_score
    from sklearn.impute import SimpleImputer

    # Initialize comprehensive logging system
    training_logs = []

    def log_message(message, level="info"):
        """Add a timestamped log entry"""
        # Create timestamp using Python datetime instead of js.Date()
        from datetime import datetime
        timestamp = datetime.now().strftime("%H:%M:%S")
        log_entry = {
            'timestamp': timestamp,
            'message': str(message),
            'level': level
        }
        training_logs.append(log_entry)
        print(f"[{timestamp}] {message}")

    log_message("🚀 STARTING AUTOML TRAINING JOB")
    log_message("=" * 50)

    try:
        # Parse job data
        job_data = json.loads(job_data_json)
        log_message("📄 Parsing job configuration...")

        # Log all job settings with details
        log_message("🔍 JOB CONFIGURATION:")
        log_message(f" • Target column: {job_data.get('targetColumn')}")
        log_message(f" • Task type: {job_data.get('taskType')}")
        log_message(f" • Algorithms: {job_data.get('algorithms', [])}")
        log_message(f" • Primary metric: {job_data.get('primaryMetric')}")
        log_message(f" • Normalize features: {job_data.get('normalizeFeatures', False)}")
        log_message(f" • Categorical settings: {job_data.get('categoricalSettings', {})}")

        # Check if we have the DataFrame stored globally
        if 'current_dataframe' not in globals():
            raise ValueError("No dataframe available for training. Please upload a CSV file first.")

        df = globals()['current_dataframe']
        log_message(f"📊 Dataset loaded successfully:")
        log_message(f" • Original shape: {df.shape[0]} rows × {df.shape[1]} columns")
        log_message(f" • Original columns: {list(df.columns)}")

        # Filter DataFrame to only include selected columns
        selected_columns = job_data.get('selectedColumns', [])
        if selected_columns and len(selected_columns) > 0:
            log_message(f"🔍 Filtering dataset to selected columns:")
            log_message(f" • Selected columns: {selected_columns}")
            # Verify all selected columns exist in the DataFrame
            missing_columns = [col for col in selected_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Selected columns not found in dataset: {missing_columns}")
            # Filter DataFrame to only selected columns
            df = df[selected_columns]
            log_message(f" • Filtered shape: {df.shape[0]} rows × {df.shape[1]} columns")
            log_message(f" • Filtered columns: {list(df.columns)}")
        else:
            log_message("⚠️ No column selection provided, using all columns")
            log_message(f" • Using all {df.shape[1]} columns: {list(df.columns)}")

        # Get training parameters
        target_column = job_data['targetColumn']
        task_type = job_data['taskType']
        algorithms = job_data.get('algorithms', [])
        primary_metric = job_data.get('primaryMetric', 'accuracy' if task_type == 'classification' else 'mae')
        normalize_features = job_data.get('normalizeFeatures', False)
        categorical_settings = job_data.get('categoricalSettings', {})

        # Get timeout settings and initialize timing
        experiment_timeout = job_data.get('experimentTimeout')  # in minutes
        import time
        start_time = time.time()
        timed_out = False
        if experiment_timeout:
            timeout_seconds = experiment_timeout * 60  # Convert to seconds
            log_message(f" • Experiment timeout: {experiment_timeout} minutes ({timeout_seconds} seconds)")
        else:
            log_message(" • No experiment timeout set")

        # Get early stopping threshold if set
        metric_threshold = job_data.get('metricThreshold')
        early_stop_triggered = False

        log_message("🔄 Preparing training data with pipelines...")

        # Prepare features and target
        X = df.drop(columns=[target_column])
        y = df[target_column]
        log_message(f" • Features: {X.shape[1]} columns")
        log_message(f" • Target values: {len(y)} samples")
        log_message(f" • Target column: '{target_column}'")

        # For classification, detect number of unique classes
        n_classes = None
        class_labels = None
        if task_type == 'classification':
            class_labels = sorted(y.unique())
            n_classes = len(class_labels)
            log_message(f" • Classification type: {'Binary' if n_classes == 2 else 'Multiclass'} ({n_classes} classes)")
            log_message(f" • Class labels: {class_labels}")

        # Store original column names for user-friendly input
        original_columns = list(X.columns)
        log_message(f" • Original feature columns: {original_columns}")

        # Identify categorical and numeric columns for pipeline
        log_message("📋 Analyzing column types for pipeline...")
        categorical_cols = []
        numeric_cols = []
        columns_to_ignore = []
        for col in X.columns:
            if col in categorical_settings:
                if categorical_settings[col] == 'ignore':
                    columns_to_ignore.append(col)
                    continue
                elif categorical_settings[col] == 'categorize':
                    if X[col].dtype == 'object':
                        categorical_cols.append(col)
                    else:
                        numeric_cols.append(col)
            else:
                # Auto-detect based on data type
                if X[col].dtype == 'object':
                    categorical_cols.append(col)
                else:
                    numeric_cols.append(col)

        log_message(f" • Categorical columns: {len(categorical_cols)} - {categorical_cols}")
        log_message(f" • Numeric columns: {len(numeric_cols)} - {numeric_cols}")
        log_message(f" • Ignored columns: {len(columns_to_ignore)} - {columns_to_ignore}")

        # Remove ignored columns
        if columns_to_ignore:
            X = X.drop(columns=columns_to_ignore)
            original_columns = [col for col in original_columns if col not in columns_to_ignore]
            log_message(f" • Dropped ignored columns, remaining: {X.shape[1]} features")

        # Handle missing data in target before split
        if y.isna().sum() > 0:
            log_message(f"🧹 Handling {y.isna().sum()} missing values in target...")
            if task_type == 'classification':
                y = y.fillna(y.mode()[0] if len(y.mode()) > 0 else 0)
            else:
                # regression
                y = y.fillna(y.mean())
            log_message(" • Target missing values handled")

        log_message(f"🔧 Creating preprocessing pipeline for {task_type}...")

        # Import pipeline components
        from sklearn.pipeline import Pipeline
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
        from sklearn.impute import SimpleImputer

        # Create preprocessing steps
        missing_data_strategy = job_data.get('missingDataStrategy', 'impute')  # Default to impute for pipelines

        # Numeric pipeline
        numeric_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler() if normalize_features else 'passthrough')
        ])

        # Categorical pipeline
        categorical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
        ])

        # Create column transformer
        preprocessor = ColumnTransformer([
            ('num', numeric_pipeline, numeric_cols),
            ('cat', categorical_pipeline, categorical_cols)
        ])

        log_message(f" • Numeric preprocessing: {len(numeric_cols)} columns - impute + {'scale' if normalize_features else 'no scaling'}")
        log_message(f" • Categorical preprocessing: {len(categorical_cols)} columns - impute + one-hot encode")

        # Create dynamic random state based on job ID and settings
        job_id = job_data.get('id', 1)
        settings_hash = hash(str(normalize_features) + str(categorical_settings) + str(algorithms))
        random_state = abs((job_id + settings_hash) % 10000)
        log_message(f"🎲 Random state: {random_state} (based on job {job_id} and settings)")

        # Split data BEFORE preprocessing to avoid data leakage
        log_message("✂️ Splitting data into train/test sets...")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
        log_message(f" • Training set: {X_train.shape[0]} samples ({X_train.shape[0]/(X_train.shape[0]+X_test.shape[0])*100:.1f}%)")
        log_message(f" • Test set: {X_test.shape[0]} samples ({X_test.shape[0]/(X_train.shape[0]+X_test.shape[0])*100:.1f}%)")

        # Define base models (pipelines will be created for each)
        log_message(f"🤖 Initializing {task_type} models with pipelines...")
        if task_type == 'classification':
            # Configure algorithms for binary or multiclass classification
            if n_classes > 2:
                # Multiclass classification - only use tree-based algorithms
                log_message(f" • Configuring algorithms for multiclass classification ({n_classes} classes)")
                log_message(f" • Note: Logistic regression not supported for multiclass, using tree-based algorithms only")
                base_models = {
                    'decision_tree': DecisionTreeClassifier(random_state=random_state),
                    'random_forest': RandomForestClassifier(random_state=random_state, n_estimators=10)
                }
                # Filter out logistic_regression from requested algorithms if present
                if 'logistic_regression' in algorithms:
                    log_message(f" • Removing logistic_regression from algorithm list (not supported for multiclass)")
                    algorithms = [algo for algo in algorithms if algo != 'logistic_regression']
                    if not algorithms:
                        raise ValueError("No valid algorithms selected for multiclass classification. Please select Decision Tree or Random Forest.")
            else:
                # Binary classification
                log_message(f" • Configuring algorithms for binary classification")
                base_models = {
                    'logistic_regression': LogisticRegression(random_state=random_state, max_iter=1000),
                    'decision_tree': DecisionTreeClassifier(random_state=random_state),
                    'random_forest': RandomForestClassifier(random_state=random_state, n_estimators=10)
                }
        else:
            base_models = {
                'linear_regression': LinearRegression(),
                'lasso': Lasso(random_state=random_state),
                'decision_tree': DecisionTreeRegressor(random_state=random_state),
                'random_forest': RandomForestRegressor(random_state=random_state, n_estimators=10)
            }

        log_message(f" • Available models: {list(base_models.keys())}")
        log_message(f" • Selected algorithms: {algorithms}")

        # Create full pipelines (preprocessor + model) for each algorithm
        model_map = {}
        for algo in algorithms:
            if algo in base_models:
                model_map[algo] = Pipeline([
                    ('preprocessor', preprocessor),
                    ('classifier' if task_type == 'classification' else 'regressor', base_models[algo])
                ])
                log_message(f" • Created pipeline for {algo.replace('_', ' ').title()}")

        # Train models
        model_results = []

        # Initialize best_score based on whether higher or lower is better for the primary metric
        higher_is_better_metrics = ['accuracy', 'precision', 'recall', 'f1', 'r2']
        lower_is_better_metrics = ['mae', 'mse', 'rmse']
        if primary_metric in higher_is_better_metrics:
            best_score = float('-inf')  # Start with negative infinity for metrics where higher is better
        elif primary_metric in lower_is_better_metrics:
            best_score = float('inf')  # Start with positive infinity for metrics where lower is better
        else:
            # Default: assume higher is better for unknown metrics
            best_score = float('-inf')
        best_model_name = None

        log_message(f"🎯 Primary metric: '{primary_metric}' (initialized best_score to {best_score})")
        if metric_threshold is not None:
            log_message(f"⏰ Early stopping threshold: {metric_threshold} (will stop if any model meets or beats this score)")
        else:
            log_message("⏰ No early stopping threshold set - will train all algorithms")

        log_message("=" * 50)
        log_message("🏋️ STARTING MODEL TRAINING...")

        for algo in algorithms:
            # Check for timeout before training each algorithm
            if experiment_timeout:
                elapsed_time = time.time() - start_time
                if elapsed_time > timeout_seconds:
                    timed_out = True
                    elapsed_minutes = elapsed_time / 60
                    log_message(f"⏰ EXPERIMENT TIMEOUT REACHED after {elapsed_minutes:.1f} minutes")
                    log_message(f" Timeout limit was {experiment_timeout} minutes")
                    log_message(f" Stopping training before algorithm: {algo}")
                    break

            if algo in model_map:
                try:
                    log_message(f"\n🚀 Training {algo.replace('_', ' ').title()}...")
                    # Train model
                    model = model_map[algo]
                    log_message(f" • Model parameters: {model.get_params()}")
                    model.fit(X_train, y_train)
                    log_message(f" • ✅ Model training completed")

                    # Make predictions
                    log_message(f" • Making predictions on test set...")
                    y_pred = model.predict(X_test)
                    log_message(f" • ✅ Predictions completed ({len(y_pred)} predictions)")

                    # Calculate ALL metrics for this task type
                    log_message(f" • Calculating metrics...")
                    metrics = {}

                    if task_type == 'classification':
                        # Calculate all classification metrics
                        try:
                            metrics['accuracy'] = float(accuracy_score(y_test, y_pred))
                            log_message(f" ✓ Accuracy: {metrics['accuracy']:.6f}")
                        except Exception as e:
                            metrics['accuracy'] = 0.0
                            log_message(f" ✗ Accuracy calculation failed: {e}", "warning")

                        try:
                            metrics['precision'] = float(precision_score(y_test, y_pred, average='weighted', zero_division=0))
                            log_message(f" ✓ Precision: {metrics['precision']:.6f}")
                        except Exception as e:
                            metrics['precision'] = 0.0
                            log_message(f" ✗ Precision calculation failed: {e}", "warning")

                        try:
                            metrics['recall'] = float(recall_score(y_test, y_pred, average='weighted', zero_division=0))
                            log_message(f" ✓ Recall: {metrics['recall']:.6f}")
                        except Exception as e:
                            metrics['recall'] = 0.0
                            log_message(f" ✗ Recall calculation failed: {e}", "warning")

                        try:
                            metrics['f1'] = float(f1_score(y_test, y_pred, average='weighted', zero_division=0))
                            log_message(f" ✓ F1-Score: {metrics['f1']:.6f}")
                        except Exception as e:
                            metrics['f1'] = 0.0
                            log_message(f" ✗ F1-Score calculation failed: {e}", "warning")

                        # Calculate AUC metrics for binary and multiclass classification
                        try:
                            # Get unique classes to determine if binary or multiclass
                            unique_classes = len(np.unique(y_test))
                            if unique_classes == 2:
                                # Binary classification - use predict_proba for AUC
                                if hasattr(model, 'predict_proba'):
                                    y_prob = model.predict_proba(X_test)[:, 1]  # Get probability of positive class
                                    metrics['auc'] = float(roc_auc_score(y_test, y_prob))
                                    log_message(f" ✓ AUC (binary): {metrics['auc']:.6f}")
                                else:
                                    # Fallback to decision function if available
                                    if hasattr(model, 'decision_function'):
                                        y_scores = model.decision_function(X_test)
                                        metrics['auc'] = float(roc_auc_score(y_test, y_scores))
                                        log_message(f" ✓ AUC (binary, decision function): {metrics['auc']:.6f}")
                                    else:
                                        metrics['auc'] = 0.5  # Random classifier baseline
                                        log_message(f" ⚠️ AUC: Model doesn't support probability prediction, using baseline: {metrics['auc']:.6f}", "warning")
                            else:
                                # Multiclass classification - use one-vs-rest approach
                                if hasattr(model, 'predict_proba'):
                                    y_prob = model.predict_proba(X_test)
                                    metrics['auc'] = float(roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted'))
                                    log_message(f" ✓ AUC (multiclass, weighted): {metrics['auc']:.6f}")
                                else:
                                    metrics['auc'] = 0.5  # Random classifier baseline
                                    log_message(f" ⚠️ AUC: Model doesn't support probability prediction for multiclass, using baseline: {metrics['auc']:.6f}", "warning")
                        except Exception as e:
                            metrics['auc'] = 0.5  # Random classifier baseline
                            log_message(f" ✗ AUC calculation failed: {e}, using baseline: {metrics['auc']:.6f}", "warning")
                    else:
                        # Calculate all regression metrics
                        try:
                            metrics['mae'] = float(mean_absolute_error(y_test, y_pred))
                            log_message(f" ✓ MAE: {metrics['mae']:.6f}")
                        except Exception as e:
                            metrics['mae'] = float('inf')
                            log_message(f" ✗ MAE calculation failed: {e}", "warning")

                        try:
                            metrics['mse'] = float(mean_squared_error(y_test, y_pred))
                            log_message(f" ✓ MSE: {metrics['mse']:.6f}")
                        except Exception as e:
                            metrics['mse'] = float('inf')
                            log_message(f" ✗ MSE calculation failed: {e}", "warning")

                        try:
                            metrics['rmse'] = float(np.sqrt(mean_squared_error(y_test, y_pred)))
                            log_message(f" ✓ RMSE: {metrics['rmse']:.6f}")
                        except Exception as e:
                            metrics['rmse'] = float('inf')
                            log_message(f" ✗ RMSE calculation failed: {e}", "warning")

                        try:
                            metrics['r2'] = float(r2_score(y_test, y_pred))
                            log_message(f" ✓ R²: {metrics['r2']:.6f}")
                        except Exception as e:
                            metrics['r2'] = -float('inf')
                            log_message(f" ✗ R² calculation failed: {e}", "warning")

                    # Verify the primary metric exists
                    if primary_metric not in metrics:
                        log_message(f" ⚠️ WARNING: Primary metric '{primary_metric}' not found!", "warning")
                        primary_metric = list(metrics.keys())[0] if metrics else 'accuracy'
                        log_message(f" → Using '{primary_metric}' as primary metric instead", "warning")

                    current_score = metrics[primary_metric]
                    log_message(f" 🎯 Primary metric '{primary_metric}': {current_score:.6f}")

                    # Define which metrics are better when higher vs lower
                    higher_is_better = ['accuracy', 'precision', 'recall', 'f1', 'r2', 'auc']
                    lower_is_better = ['mae', 'mse', 'rmse']

                    if primary_metric in higher_is_better:
                        is_better = current_score > best_score
                        comparison = f"{current_score:.6f} > {best_score:.6f}"
                    elif primary_metric in lower_is_better:
                        is_better = current_score < best_score
                        comparison = f"{current_score:.6f} < {best_score:.6f}"
                    else:
                        # Default: assume higher is better for unknown metrics
                        is_better = current_score > best_score
                        comparison = f"{current_score:.6f} > {best_score:.6f}"

                    log_message(f" 📈 Best model comparison: {comparison} = {is_better}")

                    if is_better:
                        best_score = current_score
                        best_model_name = algo
                        log_message(f" 🏆 New best model: {algo.replace('_', ' ').title()}")

                    # Store results and trained model pipeline with precise identifiers
                    model_key = f"{job_data.get('id', 'unknown')}_{algo}"
                    trained_models_storage[model_key] = {
                        'model': model,  # This is now a full pipeline including preprocessing
                        'feature_columns': original_columns,  # Store original column names for user input
                        'target_column': target_column,
                        'task_type': task_type,
                        'scaler': None,  # Scaling is now part of the pipeline
                        'job_id': job_data.get('id', 'unknown'),
                        'algorithm': algo,
                        'model_key': model_key,  # Store the key for exact matching
                        'job_name': job_data.get('name', 'unknown'),
                        'display_name': algo.replace('_', ' ').title(),
                        'categorical_columns': categorical_cols,
                        'numeric_columns': numeric_cols
                    }

                    model_results.append({
                        'name': algo,
                        'display_name': algo.replace('_', ' ').title(),
                        'metrics': metrics,
                        'primary_score': current_score,
                        'created_at': datetime.now().isoformat(),
                        'is_best': False,  # Will be updated later
                        'model_key': model_key
                    })

                    log_message(f" ✅ {algo.replace('_', ' ').title()} completed successfully")

                    # Check for early stopping condition
                    if metric_threshold is not None:
                        # Determine if threshold has been met based on metric direction
                        threshold_met = False
                        if primary_metric in higher_is_better:
                            threshold_met = current_score >= metric_threshold
                            comparison_op = ">="
                        elif primary_metric in lower_is_better:
                            threshold_met = current_score <= metric_threshold
                            comparison_op = "<="
                        else:
                            # Default: assume higher is better for unknown metrics
                            threshold_met = current_score >= metric_threshold
                            comparison_op = ">="

                        log_message(f" 🎯 Threshold check: {current_score:.6f} {comparison_op} {metric_threshold} = {threshold_met}")

                        if threshold_met:
                            early_stop_triggered = True
                            log_message(f" 🛑 EARLY STOPPING TRIGGERED!")
                            log_message(f" Model '{algo.replace('_', ' ').title()}' achieved {primary_metric}={current_score:.6f}")
                            log_message(f" This meets the threshold of {metric_threshold}")
                            log_message(f" Stopping training - remaining algorithms will not be trained")
                            break  # Exit the training loop

                except Exception as e:
                    log_message(f" ❌ Failed to train {algo.replace('_', ' ').title()}: {str(e)}", "error")
            else:
                log_message(f" ⚠️ Algorithm '{algo}' not available", "warning")

        # Mark best model
        for result in model_results:
            result['is_best'] = result['name'] == best_model_name

        log_message("=" * 50)
        log_message("🏆 TRAINING SUMMARY:")
        log_message(f" • Total models trained: {len(model_results)}")
        if early_stop_triggered:
            remaining_algorithms = len(algorithms) - len(model_results)
            log_message(f" • Early stopping triggered after {len(model_results)} model(s)")
            log_message(f" • Skipped {remaining_algorithms} remaining algorithm(s)")
            log_message(f" • Threshold met: {primary_metric} {'≥' if primary_metric in higher_is_better else '≤'} {metric_threshold}")
        log_message(f" • Primary metric used: {primary_metric}")
        if best_model_name:
            log_message(f" • Best model: {best_model_name.replace('_', ' ').title()}")
            log_message(f" • Best score: {best_score:.6f}")
        else:
            log_message(" • No models completed successfully", "warning")

        if timed_out:
            if len(model_results) > 0:
                log_message("⏰ TRAINING COMPLETED WITH TIMEOUT!")
                log_message(f" Experiment exceeded {experiment_timeout} minute limit")
                log_message(f" However, {len(model_results)} model(s) completed successfully")
            else:
                log_message("⏰ TRAINING FAILED - TIMEOUT REACHED!")
                log_message(f" Experiment exceeded {experiment_timeout} minute limit")
                log_message(" No models were completed before timeout")
        elif early_stop_triggered:
            log_message("🛑 TRAINING COMPLETED WITH EARLY STOP!")
        else:
            log_message("🎉 TRAINING COMPLETED SUCCESSFULLY!")
        log_message("=" * 50)

        # Return results to JavaScript with comprehensive logs
        if timed_out and len(model_results) > 0:
            job_status = 'Completed (timed out)'
        elif timed_out and len(model_results) == 0:
            job_status = 'Failed (timed out)'
        elif early_stop_triggered:
            job_status = 'Completed (stopped early)'
        else:
            job_status = 'Completed'

        training_results = {
            'success': not (timed_out and len(model_results) == 0),  # Success unless timed out with no results
            'results': model_results,
            'job_id': job_data.get('id', 'unknown'),
            'early_stop_triggered': early_stop_triggered,
            'timed_out': timed_out,
            'job_info': {
                'id': job_data.get('id', 'unknown'),
                'name': job_data.get('name', 'ML Training Job'),
                'training_logs': training_logs,
                'status': job_status,
                'models_count': len(model_results),
                'best_model': best_model_name.replace('_', ' ').title() if best_model_name else None,
                'best_score': best_score if best_model_name else None,
                'primary_metric': primary_metric,
                'early_stop_triggered': early_stop_triggered,
                'timed_out': timed_out,
                'experiment_timeout': experiment_timeout,
                'metric_threshold': metric_threshold,
                'n_classes': n_classes if task_type == 'classification' else None,
                'class_labels': [str(label) for label in class_labels] if class_labels is not None else None,
                'task_type': task_type
            }
        }

        js.handleTrainingComplete(json.dumps(training_results))
        print("✓ Training completed successfully with comprehensive logs")

    except Exception as e:
        import traceback
        error_msg = str(e)
        log_message(f"❌ TRAINING FAILED: {error_msg}", "error")
        log_message(f"📋 Traceback: {traceback.format_exc()}", "error")
        error_results = {
            'success': False,
            'error': error_msg,
            'job_id': job_data.get('id', 'error') if 'job_data' in locals() else 'error',
            'job_info': {
                'id': job_data.get('id', 'error') if 'job_data' in locals() else 'error',
                'name': 'Failed Training Job',
                'training_logs': training_logs,
                'status': 'failed',
                'error': error_msg
            }
        }
        js.handleTrainingComplete(json.dumps(error_results))

# Prediction function
def predict_with_model_pyscript(model_key, input_data_json):
    """Make predictions using a trained model with exact model key"""
    try:
        print(f"🔮 Making prediction with model key: {model_key}")

        # Parse input data
        input_data = json.loads(input_data_json)

        # Validate input format
        if 'input_data' not in input_data:
            raise ValueError("Missing 'input_data' in request")
        data_info = input_data['input_data']
        if not all(key in data_info for key in ['columns', 'data']):
            raise ValueError("Missing 'columns' or 'data' in input_data")

        # Find the model in storage using exact key
        model_found = None
        model_info = None

        # First try exact key match
        if model_key and model_key in trained_models_storage:
            model_found = trained_models_storage[model_key]['model']
            model_info = trained_models_storage[model_key]
            print(f" ✓ Found model by exact key: {model_key}")
        else:
            # Fallback: Look for the model by checking if model_key contains identifiers
            print(f" ⚠️ Exact key '{model_key}' not found, trying fuzzy match...")
            for key, stored_model in trained_models_storage.items():
                # Check if this is the right model (by job_id and algorithm)
                if (model_key and (model_key in key or key in model_key)) or \
                   (stored_model['algorithm'] in (model_key or '').lower()):
                    model_found = stored_model['model']
                    model_info = stored_model
                    print(f" ✓ Found model by fuzzy match: {key}")
                    break

        if model_found is None:
            # If not found by algorithm, try finding the best model from the most recent training
            if trained_models_storage:
                # Get the most recently stored model (last one added)
                latest_key = list(trained_models_storage.keys())[-1]
                model_found = trained_models_storage[latest_key]['model']
                model_info = trained_models_storage[latest_key]
                print(f" ✓ Using latest model: {latest_key}")
            else:
                raise ValueError(f"No trained model found for '{model_key}'")

        # Prepare input features for pipeline
        import pandas as pd
        import numpy as np

        print(f" • Expected feature columns: {model_info['feature_columns']}")
        print(f" • Input columns provided: {data_info['columns']}")

        # Validate column match
        if list(data_info['columns']) != model_info['feature_columns']:
            raise ValueError(f"Column mismatch! Model expects: {model_info['feature_columns']}, but got: {data_info['columns']}")

        # Create DataFrame with original column names and data types
        feature_df = pd.DataFrame(data_info['data'], columns=data_info['columns'])
        print(f" • Input shape: {feature_df.shape}")
        print(f" • Input data types: {dict(feature_df.dtypes)}")
        print(f" • Raw input sample: {feature_df.iloc[0].to_dict() if len(feature_df) > 0 else 'No data'}")

        # The pipeline handles all preprocessing (imputation, encoding, scaling)
        print(" • Using complete pipeline for preprocessing and prediction...")
        predictions = model_found.predict(feature_df)
        print(f" • Generated {len(predictions)} predictions")

        # Convert numpy types to Python types for JSON serialization
        # (use .item() so non-numeric class labels are not forced through float())
        predictions_list = [p.item() if hasattr(p, 'item') else p for p in predictions]
        print(f" ✓ Predictions: {predictions_list}")

        return json.dumps(predictions_list)

    except Exception as e:
        error_msg = f"Prediction error: {str(e)}"
        print(f" ❌ {error_msg}")
        return json.dumps({"error": error_msg})

# Model serialization function
def serialize_model_pyscript(model_key):
    """Serialize a trained model to base64 for download using exact model key"""
    try:
        print(f"📦 Serializing model with key: {model_key}")

        # Find the model in storage using exact key
        model_found = None
        model_info = None

        # First try exact key match
        if model_key and model_key in trained_models_storage:
            model_found = trained_models_storage[model_key]['model']
            model_info = trained_models_storage[model_key]
            print(f" ✓ Found model by exact key: {model_key}")
        else:
            # Fallback: fuzzy matching
            print(f" ⚠️ Exact key '{model_key}' not found, trying fuzzy match...")
            for key, stored_model in trained_models_storage.items():
                if (model_key and (model_key in key or key in model_key)) or \
                   (stored_model['algorithm'] in (model_key or '').lower()):
                    model_found = stored_model['model']
                    model_info = stored_model
                    print(f" ✓ Found model by fuzzy match: {key}")
                    break

        if model_found is None:
            if trained_models_storage:
                latest_key = list(trained_models_storage.keys())[-1]
                model_found = trained_models_storage[latest_key]['model']
                model_info = trained_models_storage[latest_key]
                print(f" ✓ Using latest model: {latest_key}")
            else:
                raise ValueError(f"No trained model found for '{model_key}'")

        # Serialize the model using pickle
        import pickle
        import base64

        # Create a complete model package
        model_package = {
            'model': model_found,
            'feature_columns': model_info['feature_columns'],
            'target_column': model_info['target_column'],
            'task_type': model_info['task_type'],
            'scaler': model_info['scaler'],
            'algorithm': model_info['algorithm'],
            'job_id': model_info['job_id'],
            'sklearn_version': '1.3.0',
            'serialization_date': pd.Timestamp.now().isoformat()
        }

        # Serialize to bytes
        model_bytes = pickle.dumps(model_package)
        # Encode as base64 for transfer
        model_b64 = base64.b64encode(model_bytes).decode('utf-8')
        print(f" ✓ Model serialized ({len(model_bytes)} bytes)")

        return model_b64

    except Exception as e:
        error_msg = f"Serialization error: {str(e)}"
        print(f" ❌ {error_msg}")
        return json.dumps({"error": error_msg})

# Make functions available to JavaScript
js.window.train_ml_models_pyscript = train_ml_models_pyscript
js.window.predict_with_model_pyscript = predict_with_model_pyscript
js.window.serialize_model_pyscript = serialize_model_pyscript
js.window.list_stored_models = list_stored_models

# Set ready flag
js.window.pyScriptReady = True
print("✓ PyScript ready flag set")

# Try to notify JavaScript
try:
    js.notifyPyScriptReady()
    print("✓ JavaScript notification successful")
except Exception as e:
    print(f"JavaScript notification failed: {e}")

print("PyScript initialization complete")
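
# ---------------------------------------------------------------------------
# Hedged usage sketch (illustration only, not part of the app): the functions
# exposed on js.window above are normally driven by the JavaScript side. The
# payload shapes below are inferred from the keys read in the code above; the
# job id, column names, and values are hypothetical examples.
example_job = {
    'id': 42,
    'name': 'Example training job',
    'targetColumn': 'species',
    'taskType': 'classification',
    'algorithms': ['decision_tree', 'random_forest'],
    'primaryMetric': 'accuracy',
    'normalizeFeatures': True,
    'selectedColumns': ['sepal_length', 'sepal_width', 'species'],
    'categoricalSettings': {},
    'experimentTimeout': 5,    # minutes
    'metricThreshold': 0.95    # early-stop threshold on the primary metric
}
# train_ml_models_pyscript(json.dumps(example_job), None)  # requires an uploaded CSV

example_input = {
    'input_data': {
        'columns': ['sepal_length', 'sepal_width'],
        'data': [[5.1, 3.5]]
    }
}
# Model keys follow the f"{job_id}_{algorithm}" pattern used in training above.
# predict_with_model_pyscript('42_random_forest', json.dumps(example_input))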