packages = ["pandas", "numpy", "scikit-learn", "matplotlib"]

ML Lite

Simple Machine Learning Model Training

🔄 Loading ML Engine...

Model Type & Data

Choose Model Type

Select the type of machine learning model based on your prediction goal
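
For reference, the training code further down this page maps each choice to a scikit-learn estimator. A minimal sketch of that mapping (ESTIMATORS is just an illustrative name; train_model_simple below constructs these estimators directly):

# How each model type maps to the estimator used by the training code below
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.cluster import KMeans

ESTIMATORS = {
    'regression': LinearRegression(),                     # predict a continuous number
    'classification': LogisticRegression(max_iter=1000),  # predict a category
    'clustering': KMeans(n_clusters=3, n_init=10),        # group similar rows (3 clusters by default)
}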

Upload Training Data

Upload a CSV file containing your training data; the first row must be a header row of column names
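
The training code rejects files whose first row looks numeric, so the header row is required. An illustrative layout, assuming the sample column names mentioned in the code's error message (the data values are made up):

# Example of the expected CSV layout: header row first, then data rows (values are illustrative)
sample_csv = """Date,DayOfWeek,Month,Temperature,Rainfall,IceCreamsSold
2024-06-01,6,6,24.5,0.0,212
2024-06-02,7,6,27.1,1.2,251
"""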

Training Settings

Target and Features

Choose the column you want the model to predict
Select the columns to use as input features for training the model
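
Under the hood these selections just pick columns out of the parsed CSV, roughly as train_model_simple does below (assuming df is the loaded DataFrame; the column names here are illustrative):

# Roughly what the training code does with the selected columns
target_column = 'IceCreamsSold'
feature_columns = ['Temperature', 'Rainfall', 'DayOfWeek']

X = df[[c.strip() for c in feature_columns]]   # input features
y = df[target_column.strip()]                  # column the model learns to predict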

Training Parameters

70% training / 30% testing
Adjust the percentage of data used for training versus testing the model
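
The split uses scikit-learn's train_test_split; with the default 70/30 setting, the call made by the training code looks roughly like this:

from sklearn.model_selection import train_test_split

train_split = 0.70  # value from the slider
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=(1 - train_split), random_state=42
)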

Training Process

Configuration Summary

Begin training the machine learning model with your data and settings
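
Clicking the train button calls train_model_simple (exposed to the page as window.pyTrainModel). On success it returns a JSON string built from a dict shaped roughly like this (values are illustrative; the metrics keys shown are the regression set):

# The dict that train_model_simple serializes with json.dumps on success
result = {
    'success': True,
    'message': 'regression model trained successfully!',
    'rows': 365,       # rows in the uploaded CSV
    'features': 3,     # number of selected feature columns
    'metrics': {'mae': 1.2, 'mse': 2.3, 'rmse': 1.5, 'r2': 0.87}
}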

Training Results

Test Your Model
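
Predictions run through make_prediction_simple (exposed as window.pyMakePrediction). It expects a JSON object keyed by the training feature names; a sketch of a direct call, with illustrative feature names and values:

import json

# Feature values keyed by the same column names used for training (illustrative)
payload = json.dumps({'Temperature': 26.0, 'Rainfall': 0.5, 'DayOfWeek': 6})
response = json.loads(make_prediction_simple(payload))
# e.g. {'success': True, 'prediction': 231.42} for a regression model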

# PyScript initialization
import warnings
import pandas as pd
import numpy as np
from io import StringIO
import json
import js

# Suppress threadpoolctl warnings in PyScript/browser environment
warnings.filterwarnings('ignore', message='.*libc not found.*')
warnings.filterwarnings('ignore', category=RuntimeWarning, module='threadpoolctl')

print("PyScript starting...")
print("✓ Basic libraries imported")

# Import ML libraries at module level
try:
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from sklearn.linear_model import LinearRegression, LogisticRegression
    from sklearn.cluster import KMeans
    from sklearn.metrics import (
        mean_absolute_error, mean_squared_error, r2_score,
        accuracy_score, precision_score, recall_score, f1_score,
        confusion_matrix, silhouette_score
    )
    from sklearn.decomposition import PCA
    import matplotlib.pyplot as plt
    import io
    import base64
    print("✓ ML libraries imported successfully")
except Exception as e:
    print(f"❌ Failed to import ML libraries: {e}")
    raise e

# Global storage for trained model
trained_model = None
model_info = {}


def train_model_simple(csv_content, has_headers, model_type, target_column,
                       feature_columns, train_split, cluster_method=None, num_clusters=None):
    import json  # Import json for this function
    global trained_model, model_info

    print(f"Training {model_type} model...")
    print("Parameters received:")
    print(f"  csv_content type: {type(csv_content)}, length: {len(csv_content) if csv_content else 'None'}")
    print(f"  has_headers: {has_headers} (type: {type(has_headers)})")
    print(f"  model_type: {model_type} (type: {type(model_type)})")
    print(f"  target_column: {target_column} (type: {type(target_column)})")
    print(f"  feature_columns: {feature_columns} (type: {type(feature_columns)})")
    print(f"  train_split: {train_split} (type: {type(train_split)})")
    if cluster_method is not None:
        print(f"  cluster_method: {cluster_method} (type: {type(cluster_method)})")
    if num_clusters is not None:
        print(f"  num_clusters: {num_clusters} (type: {type(num_clusters)})")

    # Validate parameters
    if not csv_content:
        return json.dumps({'success': False, 'error': 'No CSV content provided'})
    if not model_type:
        return json.dumps({'success': False, 'error': 'No model type specified'})
    if not target_column and model_type in ['regression', 'classification']:
        return json.dumps({'success': False, 'error': 'Target column required for supervised learning'})
    if not feature_columns:
        return json.dumps({'success': False, 'error': 'No feature columns specified'})

    try:
        print(f"Raw CSV content preview (first 500 chars): '{csv_content[:500]}'")

        # First, try to detect if we actually have headers
        lines = csv_content.strip().split('\n')
        if len(lines) < 2:
            return json.dumps({'success': False, 'error': 'CSV file must have at least 2 rows'})

        first_row = lines[0].split(',')
        second_row = lines[1].split(',') if len(lines) > 1 else []
        print(f"First row: {first_row}")
        print(f"Second row: {second_row}")

        # Simple heuristic: if first row contains mostly numeric data, it's probably not headers
        def looks_like_header(row):
            if not row:
                return False
            # Check if most values are non-numeric (likely headers)
            non_numeric_count = 0
            for val in row:
                val = val.strip()
                try:
                    float(val)
                except (ValueError, TypeError):
                    if val and not val.replace('.', '').replace('-', '').isdigit():
                        non_numeric_count += 1
            return non_numeric_count > len(row) / 2

        has_actual_headers = looks_like_header(first_row)
        print(f"Header detection: {has_actual_headers}")

        # Parse CSV based on actual header detection
        if has_actual_headers:
            df = pd.read_csv(StringIO(csv_content))
            print("Parsed with headers")
        else:
            # No headers, use generic column names
            df = pd.read_csv(StringIO(csv_content), header=None)
            # Create generic column names
            df.columns = [f'Column_{i+1}' for i in range(len(df.columns))]
            print(f"No headers detected, created generic columns: {list(df.columns)}")
            # Return error asking user to add headers
            return json.dumps({
                'success': False,
                'error': 'CSV file appears to have no headers. Please ensure the first row contains column names like: Date,DayOfWeek,Month,Temperature,Rainfall,IceCreamsSold'
            })

        print(f"✓ Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")
        print("✓ DataFrame dtypes:")
        for col, dtype in df.dtypes.items():
            print(f"  '{col}': {dtype}")
        print(f"✓ Actual column names in DataFrame: {list(df.columns)}")
        print(f"✓ Column names (repr): {[repr(col) for col in df.columns]}")
        print(f"✓ Requested target column: '{target_column}' (repr: {repr(target_column)})")
        print(f"✓ Requested feature columns: {feature_columns}")
        print(f"✓ Feature columns (repr): {[repr(col) for col in feature_columns]}")

        # Clean column names (remove any whitespace/special chars)
        df.columns = df.columns.str.strip()
        print(f"✓ Cleaned column names: {list(df.columns)}")

        # Check if requested columns exist
        missing_cols = []
        if target_column and target_column.strip() not in df.columns:
            missing_cols.append(f"target: '{target_column}'")
        for col in feature_columns:
            if col.strip() not in df.columns:
                missing_cols.append(f"feature: '{col}'")
        if missing_cols:
            error_msg = f"Column(s) not found in data: {', '.join(missing_cols)}"
            print(f"❌ {error_msg}")
            return json.dumps({'success': False, 'error': error_msg})

        # Initialize scaler variable
        model_scaler = None

        if model_type in ['regression', 'classification']:
            # Supervised learning - use cleaned column names
            target_clean = target_column.strip()
            features_clean = [col.strip() for col in feature_columns]
            X = df[features_clean]
            y = df[target_clean]

            # Handle missing values and encode categorical variables
            X_processed = X.copy()
            for col in X_processed.columns:
                if X_processed[col].dtype == 'object':
                    le = LabelEncoder()
                    X_processed[col] = le.fit_transform(X_processed[col].astype(str))
                else:
                    X_processed[col] = pd.to_numeric(X_processed[col], errors='coerce').fillna(X_processed[col].median())

            # Handle target variable
            if model_type == 'classification' or y.dtype == 'object':
                le_target = LabelEncoder()
                y_processed = le_target.fit_transform(y.astype(str))
            else:
                y_processed = pd.to_numeric(y, errors='coerce').fillna(y.median())

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y_processed, test_size=(1 - train_split), random_state=42
            )

            # Ensure we have enough data for meaningful metrics
            if len(X_test) < 2:
                print("Warning: Test set too small, using larger split")
                X_train, X_test, y_train, y_test = train_test_split(
                    X_processed, y_processed, test_size=0.3, random_state=42
                )

            # Train model
            if model_type == 'regression':
                model = LinearRegression()
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                # Calculate regression metrics with error handling
                try:
                    r2 = r2_score(y_test, y_pred) if len(y_test) > 1 else 0.0
                except:
                    r2 = 0.0
                metrics = {
                    'mae': round(mean_absolute_error(y_test, y_pred), 3),
                    'mse': round(mean_squared_error(y_test, y_pred), 3),
                    'rmse': round(np.sqrt(mean_squared_error(y_test, y_pred)), 3),
                    'r2': round(r2, 3)
                }

                # Create regression plot
                plt.figure(figsize=(10, 8))
                plt.scatter(y_test, y_pred, alpha=0.6, color='#0078d4')
                min_val = min(min(y_test), min(y_pred))
                max_val = max(max(y_test), max(y_pred))
                plt.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label='Perfect Predictions')
                plt.xlabel('Actual Values')
                plt.ylabel('Predicted Values')
                plt.title('Regression Results: Predicted vs Actual')
                plt.legend()
                plt.grid(True, alpha=0.3)
            else:  # classification
                model = LogisticRegression(random_state=42, max_iter=1000)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                # Calculate classification metrics
                metrics = {
                    'accuracy': round(accuracy_score(y_test, y_pred), 3),
                    'precision': round(precision_score(y_test, y_pred, average='weighted', zero_division=0), 3),
                    'recall': round(recall_score(y_test, y_pred, average='weighted', zero_division=0), 3),
                    'f1': round(f1_score(y_test, y_pred, average='weighted', zero_division=0), 3)
                }

                # Create confusion matrix
                cm = confusion_matrix(y_test, y_pred)
                plt.figure(figsize=(8, 6))
                plt.imshow(cm, interpolation='nearest', cmap='Blues')
                plt.title('Confusion Matrix')
                plt.colorbar()
                # Add text annotations
                for i in range(cm.shape[0]):
                    for j in range(cm.shape[1]):
                        plt.text(j, i, str(cm[i, j]), ha='center', va='center',
                                 color='white' if cm[i, j] > cm.max() / 2 else 'black')
                plt.xlabel('Predicted Label')
                plt.ylabel('True Label')

            trained_model = model

        else:  # clustering
            features_clean = [col.strip() for col in feature_columns]
            X = df[features_clean]
            X_processed = X.copy()

            # Handle categorical variables and missing values
            for col in X_processed.columns:
                if X_processed[col].dtype == 'object':
                    le = LabelEncoder()
                    X_processed[col] = le.fit_transform(X_processed[col].astype(str))
                else:
                    X_processed[col] = pd.to_numeric(X_processed[col], errors='coerce').fillna(X_processed[col].median())

            # Standardize features
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X_processed)

            # Determine number of clusters
            if cluster_method == 'manual' and num_clusters is not None:
                # Use manually specified number of clusters
                n_clusters = int(num_clusters)
                print(f"Using manual cluster count: {n_clusters}")
            else:
                # Use automatic elbow method
                n_clusters = 3  # Default
                if len(X_scaled) > 10:
                    inertias = []
                    k_range = range(2, min(8, len(X_scaled) // 2))
                    for k in k_range:
                        kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
                        kmeans_temp.fit(X_scaled)
                        inertias.append(kmeans_temp.inertia_)
                    # Simple elbow detection
                    if len(inertias) > 2:
                        changes = [inertias[i] - inertias[i+1] for i in range(len(inertias)-1)]
                        for i in range(1, len(changes)):
                            if changes[i] < changes[i-1] * 0.3:
                                n_clusters = k_range[i]
                                break
                print(f"Using automatic cluster detection: {n_clusters}")

            # Train K-means
            model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = model.fit_predict(X_scaled)

            # Store scaler for clustering predictions
            model_scaler = scaler

            # Calculate clustering metrics
            silhouette_avg = silhouette_score(X_scaled, labels) if len(set(labels)) > 1 else 0
            metrics = {
                'silhouette_score': round(silhouette_avg, 3),
                'num_clusters': n_clusters,
                'inertia': round(model.inertia_, 3)
            }

            # Create clustering visualization
            plt.figure(figsize=(10, 8))
            if X_scaled.shape[1] > 2:
                # Use PCA for dimensionality reduction
                pca = PCA(n_components=2, random_state=42)
                X_2d = pca.fit_transform(X_scaled)
                centers_2d = pca.transform(model.cluster_centers_)
                plt.title('Clustering Results (PCA 2D Projection)')
            else:
                X_2d = X_scaled
                centers_2d = model.cluster_centers_
                plt.title('Clustering Results')

            # Plot points with different colors for each cluster
            colors = plt.cm.Set3(np.linspace(0, 1, n_clusters))
            for i in range(n_clusters):
                cluster_points = X_2d[labels == i]
                plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                            c=[colors[i]], label=f'Cluster {i}', alpha=0.7, s=60)

            # Plot cluster centers
            plt.scatter(centers_2d[:, 0], centers_2d[:, 1], c='red', marker='x',
                        s=200, linewidths=3, label='Centroids')
            plt.legend()
            plt.grid(True, alpha=0.3)

            trained_model = model

        # Save plot as base64 image
        buffer = io.BytesIO()
        plt.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
        buffer.seek(0)
        image_png = buffer.getvalue()
        buffer.close()
        graphic = base64.b64encode(image_png).decode('utf-8')

        # Display the plot
        chart_container = js.document.getElementById('training-chart')
        if chart_container:
            img_element = js.document.createElement('img')
            img_element.src = 'data:image/png;base64,' + graphic
            img_element.style.maxWidth = '100%'
            img_element.style.height = 'auto'
            chart_container.innerHTML = ''
            chart_container.appendChild(img_element)
        plt.close()

        # Store model info globally
        model_info = {
            'type': model_type,
            'target': target_column,
            'features': feature_columns,
            'rows': df.shape[0],
            'train_split': train_split,
            'metrics': metrics,
            'scaler': model_scaler if model_type == 'clustering' else None
        }

        print(f"✓ Model training completed with metrics: {metrics}")
        result = {
            'success': True,
            'message': f'{model_type} model trained successfully!',
            'rows': df.shape[0],
            'features': len(feature_columns),
            'metrics': metrics
        }
        return json.dumps(result)

    except Exception as e:
        print(f"❌ Training error: {e}")
        import traceback
        traceback.print_exc()
        error_result = {'success': False, 'error': str(e)}
        return json.dumps(error_result)


def make_prediction_simple(feature_values_json):
    import json
    global trained_model, model_info

    print("Making prediction...")
    print(f"Feature values JSON: {feature_values_json}")
    print(f"Feature values JSON type: {type(feature_values_json)}")

    # Parse the JSON string to get feature values
    try:
        if isinstance(feature_values_json, str):
            feature_values = json.loads(feature_values_json)
        else:
            feature_values = feature_values_json
    except Exception as e:
        return json.dumps({'success': False, 'error': f'Failed to parse feature values JSON: {e}'})

    print(f"Feature values after parsing: {feature_values}")
    print(f"Trained model exists: {trained_model is not None}")
    print(f"Model info: {model_info}")

    if trained_model is None:
        return json.dumps({'success': False, 'error': 'No trained model available'})
    if not model_info:
        return json.dumps({'success': False, 'error': 'Model info not available'})

    try:
        # Convert feature values to the format used during training
        import numpy as np

        # Extract feature values in the correct order
        feature_list = []
        model_features = model_info.get('features', [])
        print(f"Expected features: {model_features}")
        for feature in model_features:
            value = feature_values.get(feature, 0)
            # Convert to numeric
            try:
                numeric_value = float(value)
            except:
                numeric_value = 0.0
            feature_list.append(numeric_value)

        print(f"Feature values for prediction: {feature_list}")

        # Create a DataFrame with proper feature names for prediction
        import pandas as pd
        prediction_data = pd.DataFrame([feature_list], columns=model_features)
        print(f"Prediction DataFrame: {prediction_data}")

        # Make prediction with the actual trained model
        model_type = model_info.get('type', '')
        if model_type == 'clustering':
            # For clustering, apply the same scaling used during training
            scaler = model_info.get('scaler')
            if scaler:
                prediction_data_scaled = scaler.transform(prediction_data)
                prediction = trained_model.predict(prediction_data_scaled)[0]
            else:
                prediction = trained_model.predict(prediction_data)[0]
            result = int(prediction)
        else:
            # For regression/classification
            prediction = trained_model.predict(prediction_data)[0]
            if model_type == 'regression':
                result = round(float(prediction), 2)
            else:
                result = int(prediction)

        print(f"✓ Prediction: {result}")
        return json.dumps({'success': True, 'prediction': result})

    except Exception as e:
        print(f"❌ Prediction error: {e}")
        import traceback
        traceback.print_exc()
        return json.dumps({'success': False, 'error': str(e)})


def save_trained_model():
    import json
    import pickle
    import base64
    from io import BytesIO

    try:
        if 'trained_model' not in globals() or trained_model is None:
            return json.dumps({"success": False, "error": "No trained model available"})

        # Create a clean model package with only serializable components
        model_package = {}

        # Create a clean copy of the trained model to avoid JsProxy issues
        # We'll recreate the model with just the essential parameters
        clean_model = None
        if hasattr(trained_model, '__class__'):
            model_class = trained_model.__class__
            if hasattr(trained_model, 'get_params'):
                # For sklearn models, get parameters and recreate
                params = trained_model.get_params()
                clean_params = {}
                # Filter out non-serializable parameters
                for key, value in params.items():
                    if isinstance(value, (str, int, float, bool, type(None))) or \
                       (hasattr(value, '__module__') and 'sklearn' in str(value.__module__)):
                        clean_params[key] = value
                # Create new instance with clean parameters
                clean_model = model_class(**clean_params)
                # Copy the fitted attributes
                for attr in dir(trained_model):
                    if attr.endswith('_') and not attr.startswith('_'):
                        try:
                            attr_value = getattr(trained_model, attr)
                            # Test if attribute is serializable
                            pickle.dumps(attr_value)
                            setattr(clean_model, attr, attr_value)
                        except:
                            # Skip non-serializable attributes
                            pass
            else:
                # Fallback: try to use the original model
                clean_model = trained_model
        else:
            clean_model = trained_model

        model_package['model'] = clean_model

        # Add scaler if it exists (for clustering)
        if 'model_scaler' in globals() and model_scaler is not None:
            model_package['scaler'] = model_scaler
        else:
            model_package['scaler'] = None

        # Clean model_info to remove any JsProxy objects
        clean_model_info = {}
        if 'model_info' in globals() and model_info and isinstance(model_info, dict):
            for key, value in model_info.items():
                # Only include basic Python types
                if isinstance(value, (str, int, float, bool, list, dict, type(None))):
                    try:
                        # Test if the value is serializable
                        pickle.dumps(value)
                        clean_model_info[key] = value
                    except:
                        # Skip non-serializable values
                        pass
        model_package['model_info'] = clean_model_info if clean_model_info else None

        # Test each component individually to isolate the problem
        try:
            pickle.dumps(model_package['model'])
        except Exception as model_error:
            return json.dumps({"success": False, "error": f"Model serialization failed: {str(model_error)}"})
        try:
            pickle.dumps(model_package['scaler'])
        except Exception as scaler_error:
            return json.dumps({"success": False, "error": f"Scaler serialization failed: {str(scaler_error)}"})
        try:
            pickle.dumps(model_package['model_info'])
        except Exception as info_error:
            return json.dumps({"success": False, "error": f"Model info serialization failed: {str(info_error)}"})

        # If all components pass, serialize the complete package
        model_buffer = BytesIO()
        pickle.dump(model_package, model_buffer)
        model_bytes = model_buffer.getvalue()

        # Encode to base64 for safe transfer
        model_base64 = base64.b64encode(model_bytes).decode('utf-8')

        return json.dumps({
            "success": True,
            "model_data": model_base64
        })

    except Exception as e:
        import traceback
        error_details = f"{str(e)}\n{traceback.format_exc()}"
        return json.dumps({"success": False, "error": error_details})


# Make functions available to JavaScript
js.window.pyTrainModel = train_model_simple
js.window.pyMakePrediction = make_prediction_simple
js.window.pySaveModel = save_trained_model

# Set ready flag
js.window.pyScriptReady = True
print("✓ PyScript ready flag set")
print("PyScript initialization complete")