import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load datasets
dataset1_path = "/groups/ocean/qifan/draft/combined_dataset185.csv"
dataset2_path = "/groups/ocean/qifan/draft/combined_dataset210.csv"
dataset3_path = "/groups/ocean/qifan/draft/combined_dataset220.csv"
dataset4_path = "/groups/ocean/qifan/draft/combined_dataset230i.csv"

dataset1 = pd.read_csv(dataset1_path)
dataset2 = pd.read_csv(dataset2_path)
dataset3 = pd.read_csv(dataset3_path)
dataset4 = pd.read_csv(dataset4_path)

# Remove the last row of each dataset
def remove_last_row(data):
    return data.iloc[:-1]

dataset1 = remove_last_row(dataset1)
dataset2 = remove_last_row(dataset2)
dataset3 = remove_last_row(dataset3)
dataset4 = remove_last_row(dataset4)

# Calculate and print summary statistics
def print_statistics(dataset, name):
    print(f"\nStatistics for {name}:")
    stats = dataset.describe()
    print(stats.loc[['min', 'max', 'mean', 'std']])

# Print statistics for each dataset
print_statistics(dataset1, 'Dataset 185')
print_statistics(dataset2, 'Dataset 210')
print_statistics(dataset3, 'Dataset 220')
print_statistics(dataset4, 'Dataset 230i')

# Combine datasets
combined_dataset = pd.concat([dataset1, dataset2, dataset3, dataset4]).reset_index(drop=True)

# Print statistics for the combined dataset
print_statistics(combined_dataset, 'Combined Dataset')

# Print the shape and first few lines of the combined dataset
print("\nCombined dataset shape:", combined_dataset.shape)
print("First few lines of the combined dataset:")
print(combined_dataset.head())

# Save the combined dataset to a CSV file
combined_dataset_path = "/groups/ocean/qifan/draft/CombinedDatasetLongUnsmooth.csv"
combined_dataset.to_csv(combined_dataset_path, index=False)
print(f"Combined dataset saved at: {combined_dataset_path}")

# Plot the time series for AMOC
plt.figure(figsize=(17, 8))
plt.plot(combined_dataset.index, combined_dataset['AMOC'], label='AMOC', color='skyblue')
plt.xlabel('Time Index')
plt.ylabel('AMOC')
plt.title('Combined Time Series of AMOC')
plt.legend()
plt.grid(True)
plt.tight_layout()
combined_plot_path = "/groups/ocean/qifan/draft/CombinedLongTimeSeriesAMOC.png"
plt.savefig(combined_plot_path)
plt.show()
print(f"Combined time series plot saved at: {combined_plot_path}")


import xarray as xr
import numpy as np
import pandas as pd

# Calculate the maximum AMOC at the equator between 500 and 2000 m depth.
# Note: moc_z is in centimetres, so 500 m = 50000 cm and 2000 m = 200000 cm.
def get_amoc_max_at_equator(file_path):
    try:
        print("Starting AMOC max calculation at the equator...")
        ds = xr.open_dataset(file_path, decode_times=False)
        lat_aux_grid = ds['lat_aux_grid']
        moc_z = ds['moc_z']

        # Select the equator (0 degrees latitude)
        lat_idx = np.abs(lat_aux_grid.values - 0).argmin()

        # Select the depth range (500 m to 2000 m); +1 so the end level is included
        depth_min_idx = np.abs(moc_z.values - 50000).argmin()
        depth_max_idx = np.abs(moc_z.values - 200000).argmin()
        depth_slice = slice(depth_min_idx, depth_max_idx + 1)

        # Extract the data within the selected depth range (transport_reg=1: Atlantic)
        amoc_data = ds['MOC'].isel(lat_aux_grid=lat_idx,
                                   moc_z=depth_slice,
                                   transport_reg=1, moc_comp=0)

        # Get the maximum AMOC value and the corresponding depth;
        # argmax is relative to the sliced range, so offset by depth_min_idx
        amoc_max = amoc_data.max(dim='moc_z', skipna=True)
        amoc_max_depth = moc_z.isel(moc_z=amoc_data.argmax(dim='moc_z') + depth_min_idx)

        print(f"Max AMOC value: {amoc_max.values} occurs at depth: {amoc_max_depth.values / 100} meters")
        return amoc_max.values, amoc_max_depth.values / 100  # Return depth in meters
    except Exception as e:
        print(f"Failed to calculate AMOC max at equator: {e}")
        return None, None
# Calculate the minimum MOC at 30S between 1000 and 4000 m depth
def get_moc_min_at_30s(file_path):
    try:
        print("Starting MOC min calculation at 30S...")
        ds = xr.open_dataset(file_path, decode_times=False)
        lat_aux_grid = ds['lat_aux_grid']
        moc_z = ds['moc_z']

        # Select 30S latitude
        lat_idx = np.abs(lat_aux_grid.values - (-30)).argmin()

        # Select the depth range (1000 m to 4000 m; moc_z is in centimetres);
        # +1 so the end level is included
        depth_min_idx = np.abs(moc_z.values - 100000).argmin()
        depth_max_idx = np.abs(moc_z.values - 400000).argmin()
        depth_slice = slice(depth_min_idx, depth_max_idx + 1)

        # Extract the data within the selected depth range (transport_reg=0: all ocean)
        moc_data = ds['MOC'].isel(lat_aux_grid=lat_idx,
                                  moc_z=depth_slice,
                                  transport_reg=0, moc_comp=0)

        # Get the minimum MOC value and the corresponding depth;
        # argmin is relative to the sliced range, so offset by depth_min_idx
        moc_min = moc_data.min(dim='moc_z', skipna=True)
        moc_min_depth = moc_z.isel(moc_z=moc_data.argmin(dim='moc_z') + depth_min_idx)

        print(f"Min MOC value: {moc_min.values} occurs at depth: {moc_min_depth.values / 100} meters")
        return moc_min.values, moc_min_depth.values / 100  # Return depth in meters
    except Exception as e:
        print(f"Failed to calculate MOC min at 30S: {e}")
        return None, None

# Paths to the .nc files
amoc_file_path = '/groups/ocean/qifan/annual_data/cesmi6gat31rblc220/cesmi6gat31rblc220_ocn_MOC_ANN.nc'
moc_file_path = '/groups/ocean/qifan/annual_data/cesmi6gat31rblc220/cesmi6gat31rblc220_ocn_MOC_ANN.nc'

# Get the max AMOC at the equator and the min MOC at 30S
amoc_max, amoc_max_depth = get_amoc_max_at_equator(amoc_file_path)
moc_min, moc_min_depth = get_moc_min_at_30s(moc_file_path)

# Load the existing CSV file
csv_file_path = '/groups/ocean/qifan/draft/combined_dataset220.csv'
df = pd.read_csv(csv_file_path)

# Update the DataFrame with the new values
df['AMOC'] = amoc_max
df['MOC_30SAABW'] = moc_min

# Drop the depth columns if they exist
if 'AMOC_Max_Depth' in df.columns:
    df = df.drop(columns=['AMOC_Max_Depth'])
if 'MOC_Min_Depth' in df.columns:
    df = df.drop(columns=['MOC_Min_Depth'])

# Save the updated DataFrame back to the CSV file
df.to_csv(csv_file_path, index=False)
print(f"Updated CSV file saved to {csv_file_path}")

# Print the depth values, which are not stored in the CSV
print(f"\nAMOC Max Depth: {amoc_max_depth} meters")
print(f"MOC Min Depth: {moc_min_depth} meters")

# Load and display the updated dataset information
print("\nLoading and displaying updated dataset information...\n")

print("Head of the dataset:")
print(df.head())

print("\nShape of the dataset:")
print(df.shape)

print("\nMinimum values in the dataset:")
print(df.min())

print("\nMaximum values in the dataset:")
print(df.max())

print("\nFirst few rows of the dataset:")
print(df.iloc[:5])


##### Fix TAUX #####
import xarray as xr
import numpy as np
import pandas as pd

# Extract the Southern Ocean TAUX time series (one spatial mean per time step)
def get_southern_ocean_taux(file_path, lat_range, lon_range):
    try:
        print("Starting Southern Ocean TAUX calculation...")
        ds = xr.open_dataset(file_path, decode_times=False)
        taux = ds['TAUX'].sel(lat=slice(*lat_range), lon=slice(*lon_range))
        # Average over latitude and longitude for each time step
        # (this returns an array with one value per time step)
        taux_avg = taux.mean(dim=['lat', 'lon'], skipna=True).values
        print("Southern Ocean TAUX time series calculated successfully.")
        return taux_avg
    except Exception as e:
        print(f"Failed to calculate Southern Ocean TAUX: {e}")
        return None
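# Optional: get_southern_ocean_taux uses an unweighted mean, which slightly
# over-weights high-latitude rows on a regular lat-lon grid. A minimal sketch of
# an area-weighted alternative using cos(latitude) weights follows; it assumes
# the same file layout (a 'TAUX' variable with 'lat'/'lon' coordinates) and is
# an illustration, not part of the original pipeline.
def get_southern_ocean_taux_weighted(file_path, lat_range, lon_range):
    ds = xr.open_dataset(file_path, decode_times=False)
    taux = ds['TAUX'].sel(lat=slice(*lat_range), lon=slice(*lon_range))
    weights = np.cos(np.deg2rad(taux['lat']))  # grid-cell area scales with cos(lat)
    return taux.weighted(weights).mean(dim=['lat', 'lon']).values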
# Path to the TAUX .nc file
taux_file_path = '/groups/ocean/qifan/annual_data/cesmi6gat31rblc230i/cesmi6gat31rblc230i_atm_TAUX_ANN.nc'

# Southern Ocean latitude and longitude ranges
southern_ocean_lat_range = (-70, -50)
southern_ocean_lon_range = (0, 360)

# Get the Southern Ocean TAUX time series
southern_ocean_taux_series = get_southern_ocean_taux(taux_file_path,
                                                     southern_ocean_lat_range,
                                                     southern_ocean_lon_range)

# Load the existing CSV file
csv_file_path = '/groups/ocean/qifan/draft/combined_dataset230i.csv'
df = pd.read_csv(csv_file_path)

# Check that the length of the TAUX series matches the DataFrame's length;
# only save when the column was actually updated
if southern_ocean_taux_series is not None and len(southern_ocean_taux_series) == len(df):
    # Update the DataFrame with the new Southern Ocean TAUX time series
    df['Southern_Ocean_TAUX'] = southern_ocean_taux_series
    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_file_path, index=False)
    print(f"Updated CSV file saved to {csv_file_path}")
else:
    print("Length mismatch between TAUX series and DataFrame. Please check the data alignment.")

# Load and display the updated dataset information
print("\nLoading and displaying updated dataset information...\n")

print("Head of the dataset:")
print(df.head())

print("\nShape of the dataset:")
print(df.shape)

print("\nMinimum values in the dataset:")
print(df.min())

print("\nMaximum values in the dataset:")
print(df.max())

print("\nFirst few rows of the dataset:")
print(df.iloc[:5])


# import pandas as pd
# # Load the existing CSV file
# csv_file_path = '/groups/ocean/qifan/draft/CombinedDatasetLongUnsmooth.csv'
# df = pd.read_csv(csv_file_path)
# # Rename the 'Southern_Ocean_TAUX' column to 'TAUX'
# df.rename(columns={'Southern_Ocean_TAUX': 'TAUX'}, inplace=True)
# # Save the updated DataFrame back to the CSV file
# df.to_csv(csv_file_path, index=False)
# # Display the first few rows to confirm the change
# print(df.head())
# Add AMOC 40N
import xarray as xr
import numpy as np
import pandas as pd

# Calculate the maximum AMOC at 40N between 500 and 2000 m depth
def get_amoc_max_at_40N(file_path):
    try:
        print("Starting AMOC max calculation at 40N...")
        ds = xr.open_dataset(file_path, decode_times=False)
        lat_aux_grid = ds['lat_aux_grid']
        moc_z = ds['moc_z']

        # Select 40N latitude
        lat_idx = np.abs(lat_aux_grid.values - 40).argmin()

        # Select the depth range (500 m to 2000 m; moc_z is in centimetres);
        # +1 so the end level is included
        depth_min_idx = np.abs(moc_z.values - 50000).argmin()
        depth_max_idx = np.abs(moc_z.values - 200000).argmin()
        depth_slice = slice(depth_min_idx, depth_max_idx + 1)

        # Extract the data within the selected depth range (transport_reg=1: Atlantic)
        amoc_data = ds['MOC'].isel(lat_aux_grid=lat_idx,
                                   moc_z=depth_slice,
                                   transport_reg=1, moc_comp=0)

        # Get the maximum AMOC value and the corresponding depth;
        # argmax is relative to the sliced range, so offset by depth_min_idx
        amoc_max = amoc_data.max(dim='moc_z', skipna=True)
        amoc_max_depth = moc_z.isel(moc_z=amoc_data.argmax(dim='moc_z') + depth_min_idx)

        print(f"Max AMOC value at 40N: {amoc_max.values} occurs at depth: {amoc_max_depth.values / 100} meters")
        return amoc_max.values, amoc_max_depth.values / 100  # Return depth in meters
    except Exception as e:
        print(f"Failed to calculate AMOC max at 40N: {e}")
        return None, None

# Path to the .nc file (run IDs: 185, 210, 220, 230i)
amoc_file_path = '/groups/ocean/qifan/annual_data/cesmi6gat31rblc230i/cesmi6gat31rblc230i_ocn_MOC_ANN.nc'

# Get the max AMOC at 40N
amoc_max_40N, amoc_max_depth_40N = get_amoc_max_at_40N(amoc_file_path)

# Load the existing CSV file
csv_file_path = '/groups/ocean/qifan/draft/combined_dataset230i.csv'
df = pd.read_csv(csv_file_path)

# Update the DataFrame with the new AMOC_40N values
df['AMOC_40N'] = amoc_max_40N

# Save the updated DataFrame back to the CSV file
df.to_csv(csv_file_path, index=False)
print(f"Updated CSV file saved to {csv_file_path}")

# Print the depth value, which is not stored in the CSV
print(f"\nAMOC Max Depth at 40N: {amoc_max_depth_40N} meters")

# Load and display the updated dataset information
print("\nLoading and displaying updated dataset information...\n")

print("Head of the dataset:")
print(df.head())

print("\nShape of the dataset:")
print(df.shape)

print("\nMinimum values in the dataset:")
print(df.min())

print("\nMaximum values in the dataset:")
print(df.max())

print("\nFirst few rows of the dataset:")
print(df.iloc[:5])
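# Optional: the three MOC-extremum functions above differ only in latitude,
# depth range, region index, and whether they take a max or a min. A minimal
# consolidation sketch follows; it assumes the same MOC file layout and is an
# illustration, not part of the original pipeline.
def get_moc_extremum(file_path, lat_deg, depth_range_m, transport_reg, take_max=True):
    ds = xr.open_dataset(file_path, decode_times=False)
    lat_aux_grid = ds['lat_aux_grid']
    moc_z = ds['moc_z']
    lat_idx = np.abs(lat_aux_grid.values - lat_deg).argmin()
    # moc_z is in centimetres, so convert the requested depths from metres
    i0 = np.abs(moc_z.values - depth_range_m[0] * 100).argmin()
    i1 = np.abs(moc_z.values - depth_range_m[1] * 100).argmin()
    data = ds['MOC'].isel(lat_aux_grid=lat_idx, moc_z=slice(i0, i1 + 1),
                          transport_reg=transport_reg, moc_comp=0)
    if take_max:
        value = data.max(dim='moc_z', skipna=True)
        depth = moc_z.isel(moc_z=data.argmax(dim='moc_z') + i0)
    else:
        value = data.min(dim='moc_z', skipna=True)
        depth = moc_z.isel(moc_z=data.argmin(dim='moc_z') + i0)
    return value.values, depth.values / 100  # depth in metres

# e.g. get_moc_extremum(amoc_file_path, 40, (500, 2000), transport_reg=1)
# would reproduce get_amoc_max_at_40N.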
### Fix SALT_150m ###
### Fix PD ###
import pandas as pd
import numpy as np
import xarray as xr

# Read a variable from a .nc file within a lat/lon box, optionally at a given
# depth (z_t is in centimetres) -- unchanged from the earlier pipeline
def read_nc_file(file_path, var_name, column_head, lat_range, lon_range, depth=None):
    try:
        dataset = xr.open_dataset(file_path, decode_times=False)
        data = dataset[var_name]
        if 'TLAT' in dataset:
            lat = dataset['TLAT']
            lon = dataset['TLONG']
        else:
            lat = dataset['lat']
            lon = dataset['lon']
        if 'z_t' in dataset.dims and depth is not None:
            z_t = dataset['z_t']
            depth_idx = np.abs(z_t.values - depth).argmin()
            data = data.isel(z_t=depth_idx)
        data = data.where((lat >= lat_range[0]) & (lat <= lat_range[1]) &
                          (lon >= lon_range[0]) & (lon <= lon_range[1]), drop=True)
        time = dataset['time']
        print(f"Data read successfully with {len(time)} time steps for {column_head}")
        return time, data
    except KeyError:
        print(f"Variable '{var_name}' not found in the file {file_path}.")
        return None, None
    except Exception as e:
        print(f"Failed to read the .nc file {file_path} with variable {var_name}: {e}")
        return None, None

# Preprocess the additional variables (spatial mean per time step)
def preprocess_additional_data(file_paths, var_names, column_heads, lat_ranges, lon_ranges, depths=None):
    all_data = []
    valid_var_names = []
    for file_path, var_name, column_head, lat_range, lon_range, depth in zip(
            file_paths, var_names, column_heads, lat_ranges, lon_ranges, depths):
        actual_var_name = var_name.split('_')[0]  # Strip the depth suffix (e.g. 'PD_0m' -> 'PD')
        time, data = read_nc_file(file_path, actual_var_name, column_head, lat_range, lon_range, depth)
        if time is not None and data is not None:
            valid_var_names.append(column_head)  # Append only if data is successfully read
            data = data.mean(dim=['nlat', 'nlon'] if 'nlat' in data.dims else ['lat', 'lon'],
                             skipna=True).values
            print(f"Processed data for {column_head}:")
            print(f"First five values: {data[:5]}")
            print(f"Shape of data: {data.shape}")
            print(f"Max value in {column_head}: {np.nanmax(data)}")
            print(f"Min value in {column_head}: {np.nanmin(data)}")
            all_data.append(data)
    if all_data:
        combined_data = np.vstack(all_data).T
        print(f"Combined data shape for {valid_var_names}: {combined_data.shape}")
        return combined_data, valid_var_names
    else:
        return None, None

# Define the additional variables to add; depths are in centimetres
# (15000 cm = 150 m, 20000 cm = 200 m)
additional_variables = [
    {'name': 'SALT', 'column_head': 'SALT_150m',
     'file_path': '/groups/ocean/qifan/annual_data/cesmi6gat31rblc230i/cesmi6gat31rblc230i_ocn_SALT_ANN.nc',
     'lat_range': (50, 70), 'lon_range': (300, 360), 'depth': 15000},
    {'name': 'PD_0m', 'column_head': 'PD_0m_0to50',
     'file_path': '/groups/ocean/qifan/annual_data/cesmi6gat31rblc230i/cesmi6gat31rblc230i_ocn_PD_ANN.nc',
     'lat_range': (0, 50), 'lon_range': (300, 360), 'depth': 0},
    {'name': 'PD_200m', 'column_head': 'PD_200m_0to50',
     'file_path': '/groups/ocean/qifan/annual_data/cesmi6gat31rblc230i/cesmi6gat31rblc230i_ocn_PD_ANN.nc',
     'lat_range': (0, 50), 'lon_range': (300, 360), 'depth': 20000}
]

# Extract file paths, variable names, column headers, lat/lon ranges, and depths
additional_file_paths = [var['file_path'] for var in additional_variables]
additional_var_names = [var['name'] for var in additional_variables]
additional_column_heads = [var['column_head'] for var in additional_variables]
additional_lat_ranges = [var['lat_range'] for var in additional_variables]
additional_lon_ranges = [var['lon_range'] for var in additional_variables]
additional_depths = [var['depth'] for var in additional_variables]

# Preprocess the additional variables
additional_data, additional_valid_var_names = preprocess_additional_data(
    additional_file_paths, additional_var_names, additional_column_heads,
    additional_lat_ranges, additional_lon_ranges, additional_depths
)

# Load the existing CSV file
csv_path = '/groups/ocean/qifan/draft/combined_dataset230i.csv'
existing_df = pd.read_csv(csv_path)

# Add the new columns to the existing DataFrame
if additional_data is not None:
    for i, col_name in enumerate(additional_valid_var_names):
        existing_df[col_name] = additional_data[:, i]

    # Save the updated DataFrame to a new CSV file
    updated_csv_path = '/groups/ocean/qifan/draft/combined_dataset_with_new_columns_230i.csv'
    existing_df.to_csv(updated_csv_path, index=False)

    # Print the first few rows of the updated dataset
    print(f"Updated dataset saved as {updated_csv_path}")
    print("First few rows of the updated dataset:")
    print(existing_df.head())
else:
    print("No additional data to add to the CSV.")
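# Optional: read_nc_file picks the model level nearest to the requested depth,
# so the level actually used can differ from the nominal one in the column
# name. A minimal verification sketch (an illustration, not part of the
# original pipeline) that reports the selected level:
def report_selected_level(file_path, depth_cm):
    ds = xr.open_dataset(file_path, decode_times=False)
    z_t = ds['z_t'].values  # z_t is in centimetres
    idx = int(np.abs(z_t - depth_cm).argmin())
    print(f"Requested {depth_cm / 100} m -> level {idx} at {z_t[idx] / 100} m")

# e.g. report_selected_level(additional_file_paths[0], 15000) would show which
# level feeds the SALT_150m column.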
# New combined dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load datasets
dataset1_path = "/groups/ocean/qifan/draft/combined_dataset_with_new_columns_185.csv"
dataset2_path = "/groups/ocean/qifan/draft/combined_dataset_with_new_columns_210.csv"
dataset3_path = "/groups/ocean/qifan/draft/combined_dataset_with_new_columns_220.csv"
dataset4_path = "/groups/ocean/qifan/draft/combined_dataset_with_new_columns_230i.csv"

dataset1 = pd.read_csv(dataset1_path)
dataset2 = pd.read_csv(dataset2_path)
dataset3 = pd.read_csv(dataset3_path)
dataset4 = pd.read_csv(dataset4_path)

# Remove the last row of each dataset
def remove_last_row(data):
    return data.iloc[:-1]

dataset1 = remove_last_row(dataset1)
dataset2 = remove_last_row(dataset2)
dataset3 = remove_last_row(dataset3)
dataset4 = remove_last_row(dataset4)

# Calculate and print summary statistics
def print_statistics(dataset, name):
    print(f"\nStatistics for {name}:")
    stats = dataset.describe()
    print(stats.loc[['min', 'max', 'mean', 'std']])

# Print statistics for each dataset
print_statistics(dataset1, 'Dataset 185')
print_statistics(dataset2, 'Dataset 210')
print_statistics(dataset3, 'Dataset 220')
print_statistics(dataset4, 'Dataset 230i')

# Combine datasets
combined_dataset = pd.concat([dataset1, dataset2, dataset3, dataset4]).reset_index(drop=True)

# Print statistics for the combined dataset
print_statistics(combined_dataset, 'Combined Dataset')

# Print the shape and first few lines of the combined dataset
print("\nCombined dataset shape:", combined_dataset.shape)
print("First few lines of the combined dataset:")
print(combined_dataset.head())

# Save the combined dataset to a CSV file
combined_dataset_path = "/groups/ocean/qifan/draft/CombinedDatasetLongUnsmoothNewColumns.csv"
combined_dataset.to_csv(combined_dataset_path, index=False)
print(f"Combined dataset saved at: {combined_dataset_path}")

# Plot the time series for AMOC
plt.figure(figsize=(17, 8))
plt.plot(combined_dataset.index, combined_dataset['AMOC'], label='AMOC', color='skyblue')
plt.xlabel('Time Index')
plt.ylabel('AMOC')
plt.title('Combined Time Series of AMOC')
plt.legend()
plt.grid(True)
plt.tight_layout()
combined_plot_path = "/groups/ocean/qifan/draft/CombinedLongTimeSeriesAMOC.png"
plt.savefig(combined_plot_path)
plt.show()
print(f"Combined time series plot saved at: {combined_plot_path}")
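# Optional: pd.concat takes the union of columns, so a column missing from any
# sub-dataset would silently become NaN in the combined output. A minimal
# sanity-check sketch (an illustration, not part of the original pipeline):
for name, d in [('185', dataset1), ('210', dataset2), ('220', dataset3), ('230i', dataset4)]:
    if list(d.columns) != list(dataset1.columns):
        print(f"Column mismatch in dataset {name}: {set(d.columns) ^ set(dataset1.columns)}")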