import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from pysr import PySRRegressor
import sympy
import logging

# Set up logging to capture output to a file
log_file_path = '/groups/ocean/qifan/draft/symbolic_regression_output.txt'
logging.basicConfig(filename=log_file_path, level=logging.INFO, format='%(message)s')
logger = logging.getLogger()

# Redirect print statements to the logger as well
def print_and_log(message):
    print(message)
    logger.info(message)

# Load the combined dataset from CSV
combined_dataset_path = "/groups/ocean/qifan/draft/CombinedDatasetLongUnsmooth.csv"
dataset = pd.read_csv(combined_dataset_path)

# Rename 'MOC_30SAABW' to 'AABW' in the dataset
if 'MOC_30SAABW' in dataset.columns:
    dataset = dataset.rename(columns={'MOC_30SAABW': 'AABW'})

# Drop 'AMOC_20S' and 'SALT_500m' columns from the dataset if they exist
dataset = dataset.drop(columns=['AMOC_20S', 'SALT_500m'], errors='ignore')

# Define the specific subsets of features
feature_subsets = [
    ['SFWF'],
    ['SFWF', 'PD_200m'],
    ['SFWF', 'PD_0m', 'PD_200m']
]

# Split the data into train and test sets (last 30% as the test set)
split_index = int(len(dataset) * 0.7)
train_data = dataset.iloc[:split_index]
test_data = dataset.iloc[split_index:]

train_features = train_data.drop(columns=['AMOC_40N'])
test_features = test_data.drop(columns=['AMOC_40N'])
train_labels = train_data['AMOC_40N']
test_labels = test_data['AMOC_40N']

# Define window size and skip window
window_size = 50
skip_window = 20

# Create lagged features: pair each label at time i with the feature
# row observed `lag` steps earlier
def create_lagged_features(data, labels, lag):
    X, y = [], []
    for i in range(lag, len(data)):
        X.append(data[i - lag])
        y.append(labels[i])
    return np.array(X), np.array(y)

# Custom implementation of EDR (Edit Distance on Real sequences)
def custom_edr(ts1, ts2, epsilon=0.1):
    len_ts1, len_ts2 = len(ts1), len(ts2)
    D = np.zeros((len_ts1 + 1, len_ts2 + 1))
    for i in range(1, len_ts1 + 1):
        D[i, 0] = i
    for j in range(1, len_ts2 + 1):
        D[0, j] = j
    for i in range(1, len_ts1 + 1):
        for j in range(1, len_ts2 + 1):
            cost = 0 if abs(ts1[i - 1] - ts2[j - 1]) <= epsilon else 1
            D[i, j] = min(D[i - 1, j] + 1,         # Deletion
                          D[i, j - 1] + 1,         # Insertion
                          D[i - 1, j - 1] + cost)  # Match or substitution
    return D[len_ts1, len_ts2]
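# Quick sanity check for custom_edr (illustrative values, not from the run):
# identical series have distance 0, while series that never agree within
# epsilon have a distance equal to their common length, e.g.
#   custom_edr(np.array([1.0, 2.0]), np.array([1.0, 2.0]))  # -> 0.0
#   custom_edr(np.array([0.0, 0.0]), np.array([1.0, 1.0]))  # -> 2.0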

# Define the custom cubic root operator for the Julia backend
cubic_root_julia = "my_cbrt(x) = x >= 0 ? x^(1//3) : -(-x)^(1//3)"

# Matching SymPy mapping for the cubic root (Rational(1, 3) keeps the
# exponent exact instead of the float 1/3)
def sympy_cbrt(x):
    return sympy.sign(x) * sympy.Abs(x)**sympy.Rational(1, 3)

# Initialize symbolic predictions to NaN across the entire dataset
symbolic_predictions_combined = np.full(len(dataset), np.nan)

# Log the number of time indices used for symbolic regression
num_time_indices_predicted = len(test_labels) - window_size + 1
print_and_log(f"Number of time indices predicted and used for symbolic regression: {num_time_indices_predicted}")

# Prepare a list to store performance results
performance_results = []

# Prepare the multi-panel figure
num_plots = len(feature_subsets)
cols = 2
rows = (num_plots // cols) + (num_plots % cols > 0)
fig, axs = plt.subplots(rows, cols, figsize=(15, 5 * rows))
axs = axs.flatten()

# Iterate over each specified subset of features
for idx, feature_set in enumerate(feature_subsets):
    print_and_log(f"Evaluating feature set: {feature_set}")

    # Select only the columns corresponding to the feature set
    selected_train_data = train_features[feature_set]
    selected_test_data = test_features[feature_set]

    # Create features using only the selected lag (e.g., t_20)
    train_features_lagged, train_labels_lagged = create_lagged_features(
        selected_train_data.values, train_labels.values, skip_window)
    test_features_lagged, test_labels_lagged = create_lagged_features(
        selected_test_data.values, test_labels.values, skip_window)

    # Create DataFrames with these columns
    feature_columns = [f'{feat}_t_20' for feat in feature_set]
    selected_train_features = pd.DataFrame(train_features_lagged, columns=feature_columns)
    selected_test_features = pd.DataFrame(test_features_lagged, columns=feature_columns)

    # Constraints intended to encourage every feature in the subset to be used;
    # for "^", PySR's constraints bound the complexity of the operator's
    # arguments (base, exponent), not the numeric range of the exponent itself
    constraints = {col: 1 for col in selected_train_features.columns}
    constraints["^"] = (-4, 4)

    # Perform symbolic regression using PySR on the training data
    pysr_model = PySRRegressor(
        # random_state=42,  # uncomment for deterministic results
        niterations=700,
        binary_operators=["+", "-", "*", "/", "^"],
        unary_operators=["sqrt", "log", "abs", "neg", "sin", "cos", "tan", cubic_root_julia],
        extra_sympy_mappings={
            "inv": lambda x: sympy.Piecewise((1 / x, x != 0), (0, True)),
            "my_cbrt": sympy_cbrt
        },
        maxsize=20,
        verbosity=1,
        constraints=constraints,
        should_optimize_constants=True,
        complexity_of_constants=1.0,
        complexity_of_operators={
            "+": 0.5, "-": 0.5, "*": 0.5, "square": 1.5, "cube": 2.0,
            "/": 1.5, "^": 5.5, "sqrt": 3.0, "inv": 4.0, "my_cbrt": 2.0,
            "abs": 5.0, "exp": 5.5, "log": 6.0, "tan": 3.5, "tanh": 7.5,
            "erf": 8.0, "sin": 8.5, "cos": 8.5
        },
        model_selection="accuracy",
        warmup_maxsize_by=2,
        temp_equation_file=False
    )

    # Fit the PySR model
    pysr_model.fit(selected_train_features, train_labels_lagged.flatten())

    # Retrieve the best (lowest-loss) expression and its complexity,
    # then evaluate RMSE on the test data
    best_model_index = pysr_model.equations_['loss'].idxmin()
    best_equation = pysr_model.equations_.loc[best_model_index, 'equation']
    best_complexity = pysr_model.equations_.loc[best_model_index, 'complexity']
    symbolic_predictions = pysr_model.predict(selected_test_features)
    symbolic_rmse = np.sqrt(mean_squared_error(test_labels_lagged.flatten(), symbolic_predictions))

    # Calculate the EDR metric
    symbolic_edr = custom_edr(test_labels_lagged.flatten(), symbolic_predictions, epsilon=0.1)
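    # Note (editorial assumption): with model_selection="accuracy", PySR's
    # predict() should already use the minimum-loss equation, so the manual
    # idxmin() selection above picks the same expression that generated
    # symbolic_predictions.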
    # Log and save the best equation, RMSE, EDR, and complexity
    print_and_log(f"Best equation: {best_equation}")
    print_and_log(f"Symbolic Regression RMSE: {symbolic_rmse:.2f}")
    print_and_log(f"Symbolic Regression EDR: {symbolic_edr:.2f}")
    print_and_log(f"Symbolic Regression Complexity: {best_complexity}")
    performance_results.append({
        "Features": feature_set,
        "RMSE": symbolic_rmse,
        "EDR": symbolic_edr,
        "Complexity": best_complexity,
        "Equation": best_equation
    })

    # Calculate start and end indices for plotting, considering both the
    # window and the skip window
    start_idx = split_index + window_size + skip_window - 1
    end_idx = start_idx + len(symbolic_predictions)
    if end_idx > len(symbolic_predictions_combined):
        # Truncate the predictions to fit into the available space
        end_idx = len(symbolic_predictions_combined)
        symbolic_predictions = symbolic_predictions[:end_idx - start_idx]
    symbolic_predictions_combined[start_idx:end_idx] = symbolic_predictions

    # x values matching the length of the predictions
    x_values = np.arange(start_idx, end_idx)

    # Plot both train and test data in the subplot
    axs[idx].plot(np.arange(len(dataset)), dataset['AMOC_40N'],
                  label='Original Data', color="#8FD89F", alpha=1.0)  # green
    axs[idx].plot(x_values, symbolic_predictions_combined[start_idx:end_idx],
                  label='Symbolic Predictions', color="#FF850F", alpha=1.0)  # orange

    # Add a vertical line to indicate the train-test split
    axs[idx].axvline(x=split_index, color="#191970", linestyle='--', label='Train-Test Split')
    axs[idx].set_xlabel('Index')
    axs[idx].set_ylabel('AMOC Value')
    axs[idx].set_title(", ".join(feature_set))  # title lists the variables in the subset
    axs[idx].legend()
    axs[idx].grid(True)

# Hide any unused subplot axes left over from the grid layout
for ax in axs[num_plots:]:
    ax.set_visible(False)

# Adjust layout to avoid overlapping
plt.tight_layout()

# Save all symbolic regression plots to a single PNG file
output_plot_path = '/groups/ocean/qifan/draft/FindEquationPlotOrigin.png'
plt.savefig(output_plot_path)
plt.show()
print_and_log(f"All plots saved as: {output_plot_path}")

# Convert performance results to a DataFrame and save as CSV
performance_df = pd.DataFrame(performance_results)
performance_csv_path = '/groups/ocean/qifan/draft/feature_combinations_performance_origin.csv'
performance_df.to_csv(performance_csv_path, index=False)
print_and_log(f"Feature combinations performance CSV saved at: {performance_csv_path}")

import sympy as sp

# Load the performance CSV written above
file_path = '/groups/ocean/qifan/draft/feature_combinations_performance_origin.csv'
df = pd.read_csv(file_path)

# Simplify each discovered equation with SymPy, falling back to the
# original string if simplification fails (e.g. on custom operators)
def simplify_equation(eq):
    try:
        simplified_eq = sp.simplify(eq)
        return str(simplified_eq)
    except Exception:
        return eq

# Apply the function to simplify the equations
df['Simplified Equation'] = df['Equation'].apply(simplify_equation)

# Save the updated DataFrame back to the same CSV file
output_file_path = '/groups/ocean/qifan/draft/feature_combinations_performance_origin.csv'
df.to_csv(output_file_path, index=False)
print(f"Simplified equations saved to {output_file_path}")
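# Optional check (a sketch, not part of the original pipeline): a simplified
# expression can be re-evaluated numerically with sympy.lambdify to confirm
# it reproduces the logged RMSE, e.g.
#   expr = sp.sympify(df.loc[0, 'Simplified Equation'])
#   f = sp.lambdify(sorted(expr.free_symbols, key=str), expr, 'numpy')
# where the free symbols are the lagged feature columns such as SFWF_t_20.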