# -*- coding: utf-8 -*- """ Created on Wed Nov 16 14:28:51 2022 @author: zmn159 """ import os import pandas as pd # Desktop location: data_root = r'C:\Users\zmn159\OneDrive - University of Copenhagen\Desktop\python_output\Comparrison paper' ''' Paths ''' # naive libraies one_root = os.path.join(data_root, 'HTP_1uL_AA_Clean.csv') two_root = os.path.join(data_root, 'HTP_10uL_AA_Clean.csv') '''Load data''' row_num = 10000000 # naive libraries one_df = pd.read_csv(one_root, nrows=row_num) two_df = pd.read_csv(two_root, nrows=row_num) # add a rank column based on relative frequency column one_df['Rank_1'] = one_df['Relative (%)'].rank(ascending=False, method='first') two_df['Rank_2'] = two_df['Relative (%)'].rank(ascending=False, method='first') ''''concats the two df's and creates new df of duplicated from the concatted df''' concatted_df = pd.concat([one_df, two_df], axis=0) duplicate_2 = concatted_df.AA[concatted_df.AA.duplicated(keep="first")] merge_1 = pd.merge(duplicate_2, one_df, how="left", on="AA") merge_duplicates = pd.merge(merge_1, two_df, how="left", on="AA") merge_duplicates.columns =['AA','Absolute_one','Relative (%)_one','Rank_one','Absolute_two','Relative (%)_two','Rank_two'] ''''creates new df where the duplicates (except the first) are excluded from the concatted df''' unique = concatted_df['AA'].drop_duplicates(keep=False) merge_2 = pd.merge(unique, one_df, how="left", on="AA") merge_unique = pd.merge(merge_2, two_df, how="left", on="AA") merge_unique.columns =['AA','Absolute_one','Relative (%)_one','Rank_one','Absolute_two','Relative (%)_two','Rank_two']