# -*- coding: utf-8 -*-
"""
Created on Wed Nov 16 14:28:51 2022

@author: zmn159
"""

import os
import pandas as pd

# Desktop location:
data_root = r'C:\Users\zmn159\OneDrive - University of Copenhagen\Desktop\python_output\Comparrison paper'

''' Paths '''
# naive libraies
one_root = os.path.join(data_root, 'HTP_1uL_AA_Clean.csv')
two_root = os.path.join(data_root, 'HTP_10uL_AA_Clean.csv')

'''Load data'''
row_num = 10000000
# naive libraries
one_df = pd.read_csv(one_root, nrows=row_num)
two_df = pd.read_csv(two_root, nrows=row_num)

# add a rank column based on relative frequency column
one_df['Rank_1'] = one_df['Relative (%)'].rank(ascending=False, method='first')
two_df['Rank_2'] = two_df['Relative (%)'].rank(ascending=False, method='first')


''''concats the two df's and creates new df of duplicated from the concatted df'''
concatted_df = pd.concat([one_df, two_df], axis=0)
duplicate_2 = concatted_df.AA[concatted_df.AA.duplicated(keep="first")]

merge_1 = pd.merge(duplicate_2, one_df, how="left", on="AA")
merge_duplicates = pd.merge(merge_1, two_df, how="left", on="AA")
merge_duplicates.columns =['AA','Absolute_one','Relative (%)_one','Rank_one','Absolute_two','Relative (%)_two','Rank_two']

''''creates new df where the duplicates (except the first) are excluded from the concatted df'''                   
unique = concatted_df['AA'].drop_duplicates(keep=False)

merge_2 = pd.merge(unique, one_df, how="left", on="AA")
merge_unique = pd.merge(merge_2, two_df, how="left", on="AA")
merge_unique.columns =['AA','Absolute_one','Relative (%)_one','Rank_one','Absolute_two','Relative (%)_two','Rank_two']