import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from pysr import PySRRegressor
import sympy
import logging

# Set up logging to capture output to a file
log_file_path = '/groups/ocean/qifan/draft/symbolic_regression_output.txt'
logging.basicConfig(filename=log_file_path, level=logging.INFO, format='%(message)s')
logger = logging.getLogger()

# Redirect print statements to the logger as well
def print_and_log(message):
    print(message)
    logger.info(message)

# Load the combined dataset from CSV
combined_dataset_path = "/groups/ocean/qifan/draft/CombinedDatasetLongUnsmooth.csv"
dataset = pd.read_csv(combined_dataset_path)

# Rename 'MOC_30SAABW' to 'AABW' in the dataset
if 'MOC_30SAABW' in dataset.columns:
    dataset = dataset.rename(columns={'MOC_30SAABW': 'AABW'})

# Drop 'AMOC_20S' and 'SALT_500m' columns from the dataset if they exist
dataset = dataset.drop(columns=['AMOC_20S', 'SALT_500m'], errors='ignore')

# Define the specific subsets of features
feature_subsets = [
    ['SFWF'],
    ['SFWF', 'PD_200m'],
    ['SFWF', 'PD_0m', 'PD_200m']
]

# Split the data into train and test sets (last 30% as the test set)
split_index = int(len(dataset) * 0.7)
train_data = dataset.iloc[:split_index]
test_data = dataset.iloc[split_index:]

train_features = train_data.drop(columns=['AMOC_40N'])
test_features = test_data.drop(columns=['AMOC_40N'])
train_labels = train_data['AMOC_40N']
test_labels = test_data['AMOC_40N']

# Define window size and skip window
window_size = 50
skip_window = 20

# Create lagged features: pair each label at time i with the feature
# row observed `lag` steps earlier
def create_lagged_features(data, labels, lag):
    X, y = [], []
    for i in range(lag, len(data)):
        X.append(data[i - lag])
        y.append(labels[i])
    return np.array(X), np.array(y)

# Custom implementation of EDR (Edit Distance on Real sequences)
def custom_edr(ts1, ts2, epsilon=0.1):
    len_ts1, len_ts2 = len(ts1), len(ts2)
    D = np.zeros((len_ts1 + 1, len_ts2 + 1))
    for i in range(1, len_ts1 + 1):
        D[i, 0] = i
    for j in range(1, len_ts2 + 1):
        D[0, j] = j
    for i in range(1, len_ts1 + 1):
        for j in range(1, len_ts2 + 1):
            cost = 0 if abs(ts1[i - 1] - ts2[j - 1]) <= epsilon else 1
            D[i, j] = min(D[i - 1, j] + 1,         # Deletion
                          D[i, j - 1] + 1,         # Insertion
                          D[i - 1, j - 1] + cost)  # Match or substitution
    return D[len_ts1, len_ts2]
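# Quick sanity check for custom_edr (illustrative values, not from the run):
# identical series have distance 0, while series that never agree within
# epsilon have a distance equal to their common length, e.g.
#   custom_edr(np.array([1.0, 2.0]), np.array([1.0, 2.0]))  # -> 0.0
#   custom_edr(np.array([0.0, 0.0]), np.array([1.0, 1.0]))  # -> 2.0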

# Define the custom cubic root operator for the Julia backend
cubic_root_julia = "my_cbrt(x) = x >= 0 ? x^(1//3) : -(-x)^(1//3)"

# Matching SymPy mapping for the cubic root (Rational(1, 3) keeps the
# exponent exact instead of the float 1/3)
def sympy_cbrt(x):
    return sympy.sign(x) * sympy.Abs(x)**sympy.Rational(1, 3)

# Initialize symbolic predictions to NaN across the entire dataset
symbolic_predictions_combined = np.full(len(dataset), np.nan)

# Log the number of time indices used for symbolic regression
num_time_indices_predicted = len(test_labels) - window_size + 1
print_and_log(f"Number of time indices predicted and used for symbolic regression: {num_time_indices_predicted}")

# Prepare a list to store performance results
performance_results = []

# Prepare the multi-panel figure
num_plots = len(feature_subsets)
cols = 2
rows = (num_plots // cols) + (num_plots % cols > 0)
fig, axs = plt.subplots(rows, cols, figsize=(15, 5 * rows))
axs = axs.flatten()

# Iterate over each specified subset of features
for idx, feature_set in enumerate(feature_subsets):
    print_and_log(f"Evaluating feature set: {feature_set}")

    # Select only the columns corresponding to the feature set
    selected_train_data = train_features[feature_set]
    selected_test_data = test_features[feature_set]

    # Create features using only the selected lag (e.g., t_20)
    train_features_lagged, train_labels_lagged = create_lagged_features(
        selected_train_data.values, train_labels.values, skip_window)
    test_features_lagged, test_labels_lagged = create_lagged_features(
        selected_test_data.values, test_labels.values, skip_window)

    # Create DataFrames with these columns
    feature_columns = [f'{feat}_t_20' for feat in feature_set]
    selected_train_features = pd.DataFrame(train_features_lagged, columns=feature_columns)
    selected_test_features = pd.DataFrame(test_features_lagged, columns=feature_columns)

    # Constraints intended to encourage every feature in the subset to be used;
    # for "^", PySR's constraints bound the complexity of the operator's
    # arguments (base, exponent), not the numeric range of the exponent itself
    constraints = {col: 1 for col in selected_train_features.columns}
    constraints["^"] = (-4, 4)

    # Perform symbolic regression using PySR on the training data
    pysr_model = PySRRegressor(
        # random_state=42,  # uncomment for deterministic results
        niterations=700,
        binary_operators=["+", "-", "*", "/", "^"],
        unary_operators=["sqrt", "log", "abs", "neg", "sin", "cos", "tan", cubic_root_julia],
        extra_sympy_mappings={
            "inv": lambda x: sympy.Piecewise((1 / x, x != 0), (0, True)),
            "my_cbrt": sympy_cbrt
        },
        maxsize=20,
        verbosity=1,
        constraints=constraints,
        should_optimize_constants=True,
        complexity_of_constants=1.0,
        complexity_of_operators={
            "+": 0.5, "-": 0.5, "*": 0.5, "square": 1.5, "cube": 2.0,
            "/": 1.5, "^": 5.5, "sqrt": 3.0, "inv": 4.0, "my_cbrt": 2.0,
            "abs": 5.0, "exp": 5.5, "log": 6.0, "tan": 3.5, "tanh": 7.5,
            "erf": 8.0, "sin": 8.5, "cos": 8.5
        },
        model_selection="accuracy",
        warmup_maxsize_by=2,
        temp_equation_file=False
    )

    # Fit the PySR model
    pysr_model.fit(selected_train_features, train_labels_lagged.flatten())

    # Retrieve the best (lowest-loss) expression and its complexity,
    # then evaluate RMSE on the test data
    best_model_index = pysr_model.equations_['loss'].idxmin()
    best_equation = pysr_model.equations_.loc[best_model_index, 'equation']
    best_complexity = pysr_model.equations_.loc[best_model_index, 'complexity']
    symbolic_predictions = pysr_model.predict(selected_test_features)
    symbolic_rmse = np.sqrt(mean_squared_error(test_labels_lagged.flatten(), symbolic_predictions))

    # Calculate the EDR metric
    symbolic_edr = custom_edr(test_labels_lagged.flatten(), symbolic_predictions, epsilon=0.1)
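    # Note (editorial assumption): with model_selection="accuracy", PySR's
    # predict() should already use the minimum-loss equation, so the manual
    # idxmin() selection above picks the same expression that generated
    # symbolic_predictions.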
    # Log and save the best equation, RMSE, EDR, and complexity
    print_and_log(f"Best equation: {best_equation}")
    print_and_log(f"Symbolic Regression RMSE: {symbolic_rmse:.2f}")
    print_and_log(f"Symbolic Regression EDR: {symbolic_edr:.2f}")
    print_and_log(f"Symbolic Regression Complexity: {best_complexity}")
    performance_results.append({
        "Features": feature_set,
        "RMSE": symbolic_rmse,
        "EDR": symbolic_edr,
        "Complexity": best_complexity,
        "Equation": best_equation
    })

    # Calculate start and end indices for plotting, considering both the
    # window and the skip window
    start_idx = split_index + window_size + skip_window - 1
    end_idx = start_idx + len(symbolic_predictions)
    if end_idx > len(symbolic_predictions_combined):
        # Truncate the predictions to fit into the available space
        end_idx = len(symbolic_predictions_combined)
        symbolic_predictions = symbolic_predictions[:end_idx - start_idx]
    symbolic_predictions_combined[start_idx:end_idx] = symbolic_predictions

    # x values matching the length of the predictions
    x_values = np.arange(start_idx, end_idx)

    # Plot both train and test data in the subplot
    axs[idx].plot(np.arange(len(dataset)), dataset['AMOC_40N'],
                  label='Original Data', color="#8FD89F", alpha=1.0)  # green
    axs[idx].plot(x_values, symbolic_predictions_combined[start_idx:end_idx],
                  label='Symbolic Predictions', color="#FF850F", alpha=1.0)  # orange

    # Add a vertical line to indicate the train-test split
    axs[idx].axvline(x=split_index, color="#191970", linestyle='--', label='Train-Test Split')
    axs[idx].set_xlabel('Index')
    axs[idx].set_ylabel('AMOC Value')
    axs[idx].set_title(", ".join(feature_set))  # title lists the variables in the subset
    axs[idx].legend()
    axs[idx].grid(True)

# Hide any unused subplot axes left over from the grid layout
for ax in axs[num_plots:]:
    ax.set_visible(False)

# Adjust layout to avoid overlapping
plt.tight_layout()

# Save all symbolic regression plots to a single PNG file
output_plot_path = '/groups/ocean/qifan/draft/FindEquationPlotOrigin.png'
plt.savefig(output_plot_path)
plt.show()
print_and_log(f"All plots saved as: {output_plot_path}")

# Convert performance results to a DataFrame and save as CSV
performance_df = pd.DataFrame(performance_results)
performance_csv_path = '/groups/ocean/qifan/draft/feature_combinations_performance_origin.csv'
performance_df.to_csv(performance_csv_path, index=False)
print_and_log(f"Feature combinations performance CSV saved at: {performance_csv_path}")

import sympy as sp

# Load the performance CSV written above
file_path = '/groups/ocean/qifan/draft/feature_combinations_performance_origin.csv'
df = pd.read_csv(file_path)

# Simplify each discovered equation with SymPy, falling back to the
# original string if simplification fails (e.g. on custom operators)
def simplify_equation(eq):
    try:
        simplified_eq = sp.simplify(eq)
        return str(simplified_eq)
    except Exception:
        return eq

# Apply the function to simplify the equations
df['Simplified Equation'] = df['Equation'].apply(simplify_equation)

# Save the updated DataFrame back to the same CSV file
output_file_path = '/groups/ocean/qifan/draft/feature_combinations_performance_origin.csv'
df.to_csv(output_file_path, index=False)
print(f"Simplified equations saved to {output_file_path}")
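# Optional check (a sketch, not part of the original pipeline): a simplified
# expression can be re-evaluated numerically with sympy.lambdify to confirm
# it reproduces the logged RMSE, e.g.
#   expr = sp.sympify(df.loc[0, 'Simplified Equation'])
#   f = sp.lambdify(sorted(expr.free_symbols, key=str), expr, 'numpy')
# where the free symbols are the lagged feature columns such as SFWF_t_20.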