#!/usr/bin/env python
# coding: utf-8


import pandas as pd


def build_pca_clusters(loadings, cutoff=0.2, keep_tuples=False):
    """Rank the factors of every component by loading and apply a cutoff.

    Parameters
    ----------
    loadings : pandas.DataFrame
        The first column holds the factor names; each remaining column
        holds the numeric loadings of those factors on one component.
    cutoff : float, optional
        Cells whose loading is strictly below this value are replaced by
        an empty string.
    keep_tuples : bool, optional
        If True, surviving cells keep the ``(factor, loading)`` tuple
        instead of only the factor name.

    Returns
    -------
    pandas.DataFrame
        One column per component (the factor-name column is dropped) and
        one row per input row.  Within each column the factors are ordered
        by descending loading; factors with equal loadings keep their
        input order.
    """
    names = loadings.iloc[:, 0].tolist()
    ranked_columns = {}
    for component, values in loadings.iloc[:, 1:].items():
        # sorted() is stable, and with reverse=True ties still keep their
        # original relative order.
        ranked = sorted(
            zip(names, values.tolist()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        cells = [
            '' if loading < cutoff else ((name, loading) if keep_tuples else name)
            for name, loading in ranked
        ]
        # dtype=object so that tuples are stored as single cell values.
        ranked_columns[component] = pd.Series(cells, dtype=object)
    return pd.DataFrame(ranked_columns, index=pd.RangeIndex(len(names)))


if __name__ == "__main__":
    # CSV file
    # make sure the csv is in the same folder as the notebook
    # or write the full path to it
    csv_file ='/Users/johnhenrycruz/path_to_your_pca_results/we_pca/we_p_10/factors_20/we_p_10_20-_PCA_Results - Loadings.csv'
    # read the csv with pandas read_csv
    df = pd.read_csv(csv_file)

    # rename the unnamed first column to factors
    df = df.rename(columns={'Unnamed: 0': 'factors'})

    # for every component, list the factors by descending loading;
    # change cutoff depending on what threshold you want to use, and pass
    # keep_tuples=True if you want to keep each cell as a
    # (factor, loading) tuple
    pca_clusters = build_pca_clusters(df, cutoff=0.2)

    # save file as a specific name of your choice
    pca_clusters.to_csv('pca_clusters_results.csv', index=False)