Collective Graph Identification

In progress

Here, we solve several canonical problems in network analysis: entity resolution (determining when two observations correspond to the same entity), link prediction (inferring the existence of links), and node labeling (inferring hidden attributes).

This notebook walks through each step of converting the graph data into PSL data.

# No spaces around the equals signs, so these assignments also work as Bash variables.
FILE_GROUND_TRUTH_EMAIL_NODES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.NODE.email.tab'
FILE_GROUND_TRUTH_COREF_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.UNDIRECTED.coref.tab'
FILE_GROUND_TRUTH_MANAGES_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.UNDIRECTED.email-submgr.tab'
FILE_GROUND_TRUTH_COMMUNICATION_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.DIRECTED.sentto.tab'


FILE_SAMPLE_EMAIL_NODES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-6of6/sample-enron.NODE.email.tab'
FILE_SAMPLE_COREF_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-6of6/sample-enron.UNDIRECTED.coref.tab'
FILE_SAMPLE_MANAGES_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-6of6/sample-enron.UNDIRECTED.email-submgr.tab'

These functions help parse the .tab files.

import pandas as pd
import numpy as np
import re
import itertools # for cross products when filling in a full PSL dataset
# assigns types to each column
def resolve_column_type(table):
    for column in table.columns:
        if column in {'id', 'email', 'alt_email', 'other_email', 'numsent', 'numreceived', 'numexchanged'}:
            table[column] = table[column].astype(str).astype(float).astype(int)
        # convert bag-of-words columns to floats (since ints won't take NaNs)
        elif re.match("w-", column):
            table[column] = table[column].astype(str).astype(float)

# splits an element of a raw tab row into its name/value parts
# returns: list [feature_name, feature_value, ...]
def get_feature_tuple(feature):
    feature_data = re.split(r"[:=]", feature)
    return feature_data
    

# loads a *.tab file into a pandas DataFrame.
# returns: pd.DataFrame(columns=features)
def load_table(filename):

    # initialize the pandas dataframe
    node_data = pd.DataFrame()


    with open(filename) as infile:
        i = 0
        row_list = []
        for row in infile:
    
            #print('i is: ', i)

            if i == 0:
                # The first line is just a header; print it for reference
                print("Header: ", row)
            elif i == 1:
                # Prepare dataframe column labels
                tokens = row.split()
                if len(tokens) == 1:
                    print("This is not a NODE file, so don't load this row")
                else:  
                    features = ["id"] + [get_feature_tuple(feature)[1] for feature in tokens]
                    node_data = pd.DataFrame(columns=features)
            else:
          
                # this is to help the function generalize across the NODE and EDGE files.
                # EDGE files have a "|" separator, which must be removed so the row splits into features cleanly
                row = re.sub(r'\|','', row)
            
                tokens = row.split()

                # the first token doesn't need splitting
                row_dict = {'id':tokens[0]}
                row_dict.update({get_feature_tuple(token)[0]:get_feature_tuple(token)[1] for token in tokens[1:]})
                row_list.append(row_dict)
        
            i += 1
        
        # Fill in rows
        node_data = pd.concat([node_data, pd.DataFrame(row_list)], ignore_index=True)

    return node_data

Process the email nodes

Get ground truth

email_nodes = load_table(FILE_GROUND_TRUTH_EMAIL_NODES)
# remove the (unnecessary) second-to-last column (it comes from an ambiguous parse split)
email_nodes.drop('other,manager,specialist,director,executive', axis=1, inplace=True)
resolve_column_type(email_nodes)

email_nodes.dtypes
Header:  NODE	email

id                int64
emailaddress     object
numsent           int64
numreceived       int64
numexchanged      int64
                 ...   
w-kinney        float64
w-veselack      float64
w-mwhitt        float64
w-jarnold       float64
title            object
Length: 5119, dtype: object
email_nodes
id emailaddress numsent numreceived numexchanged w-gerald w-know w-busi w-mexicana w-transact ... w-bartlo w-columbiagassubject w-perron w-coh w-agl w-kinney w-veselack w-mwhitt w-jarnold title
0 283 c..koehler@enron.com 128 606 734 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
1 98 scott.goodell@enron.com 98 607 705 1.0 1.0 1.0 NaN 1.0 ... 1.0 1.0 1.0 1.0 1.0 1.0 1.0 NaN NaN specialist
2 183 p..south@enron.com 8 351 359 1.0 1.0 1.0 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
3 204 lavorato@enron.com 388 3 391 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
4 318 mike.grigsby@enron.com 3702 490 4192 1.0 1.0 1.0 1.0 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
206 114 vkamins@enron.com 0 12 12 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
207 270 david.duran@enron.com 7 145 152 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
208 282 sean.crandall@enron.com 94 138 232 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
209 243 kevin.presto@enron.com 248 198 446 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
210 131 dave.fuller@enron.com 165 129 294 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN manager

211 rows × 5119 columns

# Takes a table and fills in the missing (argument, value) pairs to form a full, sufficient set
# So far this only works with binary predicates
def fill_observed_missing_possibilities(table, arguments, values):
    total_possibilities = set(itertools.product(list(table[arguments[0]]), values))
    already_observed_possibilities = set((table.loc[index][arguments[0]], table.loc[index][arguments[1]]) for index in table.index)

    missing_possibilities = total_possibilities - already_observed_possibilities
    row_list = []
    for arg_0, arg_1 in missing_possibilities:
        row_dict = {arguments[0]:arg_0, arguments[1]:arg_1, arguments[2]:0 }
        row_list.append(row_dict)
        
    return pd.concat([table, pd.DataFrame(row_list)], ignore_index=True)
# Grab necessary columns, in preparation for dumping the whole ground truth data
email_nodes_data = email_nodes[['id','title']].copy()

# convert titles to integers, so PSL can ground faster
title_map = {"other": 0, "manager": 1, "specialist": 2, "director": 3, "executive": 4}

email_nodes_data = email_nodes_data.replace({'title': title_map})
email_nodes_data['exists'] = 1.0

full_set_email_has_label_data = fill_observed_missing_possibilities(email_nodes_data, ['id', 'title', 'exists'], list(title_map.values()))
full_set_email_has_label_data
id title exists
0 283 3 1.0
1 98 2 1.0
2 183 3 1.0
3 204 4 1.0
4 318 4 1.0
... ... ... ...
1050 182 4 0.0
1051 308 0 0.0
1052 46 0 0.0
1053 202 0 0.0
1054 26 3 0.0

1055 rows × 3 columns

# Outputs all data (obs+truth)
# full_set_email_has_label_data.to_csv('EmailHasLabel_data.csv', sep ='\t', index=False, header=False, columns=['id', 'title', 'exists'])

Calculate splits for PSL predicates

# Grab the sample from the original experiment, this will allow us to calculate observations and targets.
sample_email_nodes = load_table(FILE_SAMPLE_EMAIL_NODES)
# remove the (unnecessary) second-to-last column (it comes from an ambiguous parse split)
sample_email_nodes.drop('other,manager,specialist,director,executive', axis=1, inplace=True)
resolve_column_type(sample_email_nodes)
Header:  NODE	email
# Split data into observed and targets (AKA train and test)
email_nodes_obs = email_nodes[email_nodes['id'].isin(sample_email_nodes[sample_email_nodes['title'].notna()]['id'])]
email_nodes_truth = email_nodes[email_nodes['id'].isin(sample_email_nodes[sample_email_nodes['title'].isna()]['id'])]
# Grab the necessary columns
email_has_label_obs = email_nodes_obs[['id','title']].copy()
email_has_label_truth = email_nodes_truth[['id','title']].copy()

# convert titles to integers, so PSL can ground faster
email_has_label_obs = email_has_label_obs.replace({'title': title_map})
email_has_label_truth = email_has_label_truth.replace({'title': title_map})

# add in an existence column
email_has_label_obs['exists'] = 1.0
email_has_label_truth['exists'] = 1.0

# email_has_label_obs
# Add in the non-existent observations
full_set_email_has_label_obs = fill_observed_missing_possibilities(email_has_label_obs, ['id', 'title', 'exists'], list(title_map.values()))
full_set_email_has_label_truth = fill_observed_missing_possibilities(email_has_label_truth, ['id', 'title', 'exists'], list(title_map.values()))
# Outputs splits to file
full_set_email_has_label_obs.to_csv('EmailHasLabel_obs.csv', sep ='\t', index=False, header=False)
full_set_email_has_label_truth.to_csv('EmailHasLabel_truth.csv', sep ='\t', index=False, header=False)

Process the CoRef edges

Get ground truth

# need to rename the second 'email' column due to a key collision
# use a copy for safety; sed's '2g' flag replaces from the second match onward

!cp $FILE_GROUND_TRUTH_COREF_EDGES .
!sed -i 's/email/alt_email/2g' enron.UNDIRECTED.coref.tab

coref_edges = load_table('enron.UNDIRECTED.coref.tab')
resolve_column_type(coref_edges)

coref_edges.dtypes
Header:  UNDIRECTED	coref

This is not a NODE file, so don't load this row

id            int64
email         int64
alt_email     int64
exists       object
dtype: object
coref_edges
id email alt_email exists
0 2856 265 141 NOTEXIST
1 18491 310 295 NOTEXIST
2 516 272 183 NOTEXIST
3 5131 201 19 NOTEXIST
4 12417 138 78 NOTEXIST
... ... ... ... ...
20776 15003 135 208 NOTEXIST
20777 4450 197 47 NOTEXIST
20778 20302 248 25 NOTEXIST
20779 12985 222 118 NOTEXIST
20780 19684 248 54 NOTEXIST

20781 rows × 4 columns

# Grab necessary columns, in preparation for dumping the whole ground truth data
coref_edges_data = coref_edges[['email','alt_email', 'exists']].copy()

# map the existence column to 0/1, so PSL can ground faster
exists_map = {"NOTEXIST": 0.0, "EXIST": 1.0}
coref_edges_data = coref_edges_data.replace({'exists': exists_map})

# Since it's undirected, add in the reverse edges.
coref_edges_data_sym = coref_edges_data[['alt_email', 'email', 'exists']].copy()
coref_edges_data_sym.rename(columns = {'alt_email':'email', 'email':'alt_email'}, inplace = True)

coref_edges_data = pd.concat([coref_edges_data, coref_edges_data_sym])

# Calculate the missing edges that were blocked.
missing_edges = {pair for pair in itertools.permutations(email_nodes['id'], 2)} - {pair for pair in zip(coref_edges_data['email'], coref_edges_data['alt_email'])}

# add in the missing edges
row_list = []
for email, alt_email in missing_edges:
    row_dict = {'email':email, 'alt_email':alt_email, 'exists':0 }
    row_list.append(row_dict)

full_set_coref_edges_data = pd.concat([coref_edges_data, pd.DataFrame(row_list)], ignore_index=True)
# Outputs to file
# full_set_coref_edges_data.to_csv('CoRef_data.csv', sep ='\t', index=False, header=False, columns=['email', 'alt_email', 'exists'])
# Sanity Check: These should print pairs of the same people
# for index in full_set_coref_edges_data[full_set_coref_edges_data['exists'] == 1.0].index:
#     email_id  = full_set_coref_edges_data.loc[index, 'email']
#     alt_email_id = full_set_coref_edges_data.loc[index, 'alt_email']
    
#     print(email_nodes[email_nodes['id'] == email_id]['emailaddress'])
#     print(email_nodes[email_nodes['id'] == alt_email_id]['emailaddress'])
#     print("------------------------------------------------")
    

Calculate splits for PSL predicates

# Grab the sample from the original experiment, this will allow us to calculate observations and targets.
sample_coref_edges = load_table(FILE_SAMPLE_COREF_EDGES)
resolve_column_type(sample_coref_edges)

Header:  UNDIRECTED	coref

This is not a NODE file, so don't load this row
# Split data into observed and targets (AKA train and test)
coref_edges_obs = coref_edges[coref_edges['id'].isin(sample_coref_edges[sample_coref_edges['exists'].notna()]['id'])]
coref_edges_truth = coref_edges[coref_edges['id'].isin(sample_coref_edges[sample_coref_edges['exists'].isna()]['id'])]
# Grab the necessary columns
coref_obs = coref_edges_obs[['email', 'alt_email', 'exists']].copy()
coref_truth = coref_edges_truth[['email', 'alt_email', 'exists']].copy()

# map the existence column to 0/1, so PSL can ground faster
coref_obs = coref_obs.replace({'exists': exists_map})
coref_truth = coref_truth.replace({'exists': exists_map})

# Since it's undirected, add in the reverse edges.
coref_obs_sym = coref_obs[['alt_email', 'email', 'exists']].copy()
coref_truth_sym = coref_truth[['alt_email', 'email', 'exists']].copy()

coref_obs_sym.rename(columns = {'alt_email':'email', 'email':'alt_email'}, inplace = True)
coref_truth_sym.rename(columns = {'alt_email':'email', 'email':'alt_email'}, inplace = True)

coref_obs = pd.concat([coref_obs, coref_obs_sym], ignore_index=True)
coref_truth = pd.concat([coref_truth, coref_truth_sym], ignore_index=True)

# Calculate the missing edges that were blocked. Note the last set prevents cross-contamination between splits
missing_edges = {pair for pair in itertools.permutations(email_nodes['id'], 2)} - {pair for pair in zip(coref_obs['email'], coref_obs['alt_email'])} - {pair for pair in zip(coref_truth['email'], coref_truth['alt_email'])}

# add in the missing edges
row_list = []
for email, alt_email in missing_edges:
    row_dict = {'email':email, 'alt_email':alt_email, 'exists':0 }
    row_list.append(row_dict)

full_set_coref_edges_obs = pd.concat([coref_obs, pd.DataFrame(row_list)], ignore_index=True)
# Outputs splits to file
full_set_coref_edges_obs.to_csv('CoRef_obs.csv', sep ='\t', index=False, header=False, columns=['email', 'alt_email', 'exists'])
coref_truth.to_csv('CoRef_truth.csv', sep ='\t', index=False, header=False, columns=['email', 'alt_email', 'exists'])

Process the Manager edges

Get ground truth

# Load in the observed email-submgr.
# need to rename the second 'email' column due to a key collision
# use a copy for safety
!cp $FILE_GROUND_TRUTH_MANAGES_EDGES .
!sed -i 's/\temail/\tother_email/2g' enron.UNDIRECTED.email-submgr.tab

manager_edges = load_table('enron.UNDIRECTED.email-submgr.tab')

# FIXME: can probably omit this line
manager_edges.drop('NOTEXIST,EXIST', axis=1, inplace=True)

resolve_column_type(manager_edges)

manager_edges.dtypes
Header:  UNDIRECTED	email-submgr

id                int64
w-gerald        float64
w-know          float64
w-busi          float64
w-mexicana      float64
                 ...   
w-jarnold       float64
numexchanged      int64
email             int64
other_email       int64
exists           object
Length: 5118, dtype: object
manager_edges
id w-gerald w-know w-busi w-mexicana w-transact w-want w-thing w-review w-questar ... w-coh w-agl w-kinney w-veselack w-mwhitt w-jarnold numexchanged email other_email exists
0 2693 NaN 1.0 NaN NaN 1.0 1.0 NaN 1.0 NaN ... NaN NaN NaN NaN NaN NaN 6 286 324 EXIST
1 2634 NaN 1.0 NaN NaN NaN 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 3 74 37 NOTEXIST
2 1256 NaN 1.0 1.0 NaN NaN 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 14 148 131 NOTEXIST
3 1406 NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 3 57 313 EXIST
4 3129 NaN 1.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 43 34 236 NOTEXIST
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2046 2105 NaN 1.0 NaN NaN 1.0 NaN NaN 1.0 NaN ... NaN NaN NaN NaN NaN NaN 13 67 288 NOTEXIST
2047 2374 1.0 1.0 NaN NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 237 198 212 NOTEXIST
2048 3464 NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN ... NaN NaN NaN NaN NaN NaN 1 210 160 NOTEXIST
2049 531 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 9 316 188 NOTEXIST
2050 2026 NaN 1.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 8 34 273 NOTEXIST

2051 rows × 5118 columns

# Grab necessary columns, in preparation for dumping the whole ground truth data
manager_edges_data = manager_edges[['email','other_email', 'exists']].copy()

# map the existence column to 0/1, so PSL can ground faster
manager_edges_data = manager_edges_data.replace({'exists': exists_map})

# Since it's undirected, add in the reverse edges.
manager_edges_data_sym = manager_edges_data[['other_email', 'email', 'exists']].copy()
manager_edges_data_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)

manager_edges_data = pd.concat([manager_edges_data, manager_edges_data_sym])

# Calculate the missing edges that were blocked.
missing_edges = {pair for pair in itertools.permutations(email_nodes['id'], 2)} - {pair for pair in zip(manager_edges_data['email'], manager_edges_data['other_email'])}

# add in the missing edges
row_list = []
for email, other_email in missing_edges:
    row_dict = {'email':email, 'other_email':other_email, 'exists':0 }
    row_list.append(row_dict)

full_set_manager_edges_data = pd.concat([manager_edges_data, pd.DataFrame(row_list)], ignore_index=True)

# Outputs to file
# full_set_manager_edges_data.to_csv('Manages_data.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'exists'])

Calculate splits for PSL predicates

# Grab the sample from the original experiment, this will allow us to calculate observations and targets.
sample_manager_edges = load_table(FILE_SAMPLE_MANAGES_EDGES)
resolve_column_type(sample_manager_edges)
Header:  UNDIRECTED	email-submgr
# Split data into observed and targets (AKA train and test)
manager_edges_obs = manager_edges[manager_edges['id'].isin(sample_manager_edges[sample_manager_edges['exists'].notna()]['id'])]
manager_edges_truth = manager_edges[manager_edges['id'].isin(sample_manager_edges[sample_manager_edges['exists'].isna()]['id'])]
print(len(manager_edges_obs))
print(len(manager_edges_truth))
1642
409
# Grab the necessary columns
manages_obs = manager_edges_obs[['email', 'other_email', 'exists']].copy()
manages_truth = manager_edges_truth[['email', 'other_email', 'exists']].copy()

# map the existence column to 0/1, so PSL can ground faster
manages_obs = manages_obs.replace({'exists': exists_map})
manages_truth = manages_truth.replace({'exists': exists_map})

# Since it's undirected, add in the reverse edges.
manages_obs_sym = manages_obs[['other_email', 'email', 'exists']].copy()
manages_truth_sym = manages_truth[['other_email', 'email', 'exists']].copy()

manages_obs_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)
manages_truth_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)

manages_obs = pd.concat([manages_obs, manages_obs_sym])
manages_truth = pd.concat([manages_truth, manages_truth_sym])

# Calculate the missing edges that were blocked. Note the last set prevents cross-contamination between splits
missing_edges = {pair for pair in itertools.permutations(email_nodes['id'], 2)} - {pair for pair in zip(manages_obs['email'], manages_obs['other_email'])} - {pair for pair in zip(manages_truth['email'], manages_truth['other_email'])}

# add in the missing edges
row_list = []
for email, other_email in missing_edges:
    row_dict = {'email':email, 'other_email':other_email, 'exists':0 }
    row_list.append(row_dict)

full_set_manages_obs = pd.concat([manages_obs, pd.DataFrame(row_list)], ignore_index=True)
full_set_manages_obs.to_csv('Manages_obs.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'exists'])
manages_truth.to_csv('Manages_truth.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'exists'])

Train a local classifier/model

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

Node Labeling

email_nodes_obs
id emailaddress numsent numreceived numexchanged w-gerald w-know w-busi w-mexicana w-transact ... w-bartlo w-columbiagassubject w-perron w-coh w-agl w-kinney w-veselack w-mwhitt w-jarnold title
0 283 c..koehler@enron.com 128 606 734 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
1 98 scott.goodell@enron.com 98 607 705 1.0 1.0 1.0 NaN 1.0 ... 1.0 1.0 1.0 1.0 1.0 1.0 1.0 NaN NaN specialist
2 183 p..south@enron.com 8 351 359 1.0 1.0 1.0 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
4 318 mike.grigsby@enron.com 3702 490 4192 1.0 1.0 1.0 1.0 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
5 303 t..hodge@enron.com 95 570 665 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
206 114 vkamins@enron.com 0 12 12 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
207 270 david.duran@enron.com 7 145 152 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
208 282 sean.crandall@enron.com 94 138 232 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
209 243 kevin.presto@enron.com 248 198 446 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
210 131 dave.fuller@enron.com 165 129 294 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN manager

171 rows × 5119 columns

train_x = email_nodes_obs.drop(['id', 'emailaddress', 'title', 'numsent', 'numreceived', 'numexchanged'], axis = 1).fillna(0)
train_y = email_nodes_obs['title']

test_x = email_nodes_truth.drop(['id', 'emailaddress', 'title', 'numsent', 'numreceived', 'numexchanged'], axis = 1).fillna(0)
test_y = email_nodes_truth['title']
# classifier = svm.LinearSVC()
classifier = LogisticRegression(max_iter=300)
classifier.fit(train_x, train_y)
LogisticRegression(max_iter=300)
predictions = classifier.predict(test_x)
classifier.score(test_x, test_y)
0.475
predictions
array(['executive', 'executive', 'other', 'executive', 'director',
       'executive', 'director', 'specialist', 'executive', 'executive',
       'specialist', 'executive', 'specialist', 'other', 'manager',
       'director', 'manager', 'specialist', 'other', 'executive',
       'executive', 'director', 'other', 'director', 'director',
       'director', 'specialist', 'director', 'specialist', 'specialist',
       'director', 'director', 'specialist', 'executive', 'executive',
       'director', 'director', 'executive', 'manager', 'director'],
      dtype=object)
# title_map = {"other": 0, "manager": 1, "specialist": 2, "director": 3, "executive": 4}

classifier.classes_
array(['director', 'executive', 'manager', 'other', 'specialist'],
      dtype=object)

Use probabilities for PSL observed data.

local_EmailHasTitle_probabilities = classifier.predict_proba(test_x)
local_EmailHasTitle_obs = pd.DataFrame()
row_list = []
# build a table
for index, probabilities in enumerate(local_EmailHasTitle_probabilities):
    for class_index, probability in enumerate(probabilities):
        row_dict = {'id': email_nodes_truth.iloc[index]['id'], 'title': title_map[classifier.classes_[class_index]], 'exists': probability}
        row_list.append(row_dict)
        #print(email_nodes_truth.iloc[index]['id'], "\t", title_map[classifier.classes_[class_index]], "\t", probability)

local_EmailHasTitle_obs = pd.concat([local_EmailHasTitle_obs, pd.DataFrame(row_list)], ignore_index=True)
local_EmailHasTitle_obs.to_csv('Local_EmailHasLabel_obs.csv', sep ='\t', index=False, header=False, columns=['id', 'title', 'exists'])
manager_edges_obs
id w-gerald w-know w-busi w-mexicana w-transact w-want w-thing w-review w-questar ... w-coh w-agl w-kinney w-veselack w-mwhitt w-jarnold numexchanged email other_email exists
0 2693 NaN 1.0 NaN NaN 1.0 1.0 NaN 1.0 NaN ... NaN NaN NaN NaN NaN NaN 6 286 324 EXIST
2 1256 NaN 1.0 1.0 NaN NaN 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 14 148 131 NOTEXIST
3 1406 NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 3 57 313 EXIST
4 3129 NaN 1.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 43 34 236 NOTEXIST
6 989 1.0 1.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 313 195 318 NOTEXIST
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2044 4135 NaN 1.0 NaN NaN 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN 31 318 46 NOTEXIST
2046 2105 NaN 1.0 NaN NaN 1.0 NaN NaN 1.0 NaN ... NaN NaN NaN NaN NaN NaN 13 67 288 NOTEXIST
2047 2374 1.0 1.0 NaN NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 237 198 212 NOTEXIST
2048 3464 NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN ... NaN NaN NaN NaN NaN NaN 1 210 160 NOTEXIST
2049 531 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 9 316 188 NOTEXIST

1642 rows × 5118 columns

train_x = manager_edges_obs.drop(['id', 'numexchanged', 'email', 'other_email', 'exists'], axis = 1).fillna(0)
train_y = manager_edges_obs['exists']

test_x = manager_edges_truth.drop(['id', 'numexchanged', 'email', 'other_email', 'exists'], axis = 1).fillna(0)
test_y = manager_edges_truth['exists']
train_x.dtypes
w-gerald      float64
w-know        float64
w-busi        float64
w-mexicana    float64
w-transact    float64
               ...   
w-agl         float64
w-kinney      float64
w-veselack    float64
w-mwhitt      float64
w-jarnold     float64
Length: 5113, dtype: object
classifier = LogisticRegression(max_iter=300)
classifier.fit(train_x, train_y)
LogisticRegression(max_iter=300)
predictions = classifier.predict(test_x)
classifier.score(test_x, test_y)
0.8997555012224939

Use local predictions for PSL observed data.

local_Manages_probabilities = classifier.predict_proba(test_x)
local_Manages_obs = pd.DataFrame()
row_list = []
# build a table (note: this writes the hard argmax label, not the soft probability)
for index, probabilities in enumerate(local_Manages_probabilities):
    row_dict = {'email': manager_edges_truth.iloc[index]['email'], 'other_email': manager_edges_truth.iloc[index]['other_email'], 'exists': exists_map[classifier.classes_[np.argmax(probabilities)]]}
    row_list.append(row_dict)
    #print(email_nodes_truth.iloc[index]['id'], "\t", title_map[classifier.classes_[class_index]], "\t", probability)

local_Manages_obs = pd.concat([local_Manages_obs, pd.DataFrame(row_list)])
local_Manages_obs
email other_email exists
0 74 37 0.0
1 24 170 0.0
2 174 136 0.0
3 108 46 0.0
4 209 202 0.0
... ... ... ...
404 117 285 0.0
405 219 193 0.0
406 91 167 0.0
407 101 208 0.0
408 34 273 0.0

409 rows × 3 columns

# Since it's undirected, add in the reverse edges.
local_Manages_obs_sym = local_Manages_obs[['other_email', 'email', 'exists']].copy()

local_Manages_obs_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)

local_Manages_obs = pd.concat([local_Manages_obs, local_Manages_obs_sym])

local_Manages_obs.to_csv('Local_Manages_obs.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'exists'])

Entity Resolution

from strsimpy.qgram import QGram
from scipy.spatial import distance

qgram = QGram(1)

node_to_email = dict(zip(email_nodes['id'], email_nodes['emailaddress']))

# Calculate features for Training set

train_x = full_set_coref_edges_obs.copy()

train_x['address_similarity'] = 0.0
train_x['bow_cosine_similarity'] = 0.0
train_x['bow_jaccard_similarity'] = 0.0


for index in train_x.index:
    string_similarity = qgram.distance(node_to_email[train_x.iloc[index]['email']], node_to_email[train_x.iloc[index]['alt_email']])
    train_x.loc[index, 'address_similarity'] = string_similarity

    bow_cosine_similarity = distance.cosine(np.nan_to_num(list(email_nodes[email_nodes['id'] == train_x.iloc[index]['email']].iloc[0][5:-1])), np.nan_to_num(list(email_nodes[email_nodes['id'] == train_x.iloc[index]['alt_email']].iloc[0][5:-1])))
    train_x.loc[index, 'bow_cosine_similarity'] = bow_cosine_similarity

    bow_jaccard_similarity = distance.jaccard(np.nan_to_num(list(email_nodes[email_nodes['id'] == train_x.iloc[index]['email']].iloc[0][5:-1])), np.nan_to_num(list(email_nodes[email_nodes['id'] == train_x.iloc[index]['alt_email']].iloc[0][5:-1])))
    train_x.loc[index, 'bow_jaccard_similarity'] = bow_jaccard_similarity
    

train_x = train_x.drop(['email', 'alt_email', 'exists'], axis = 1)
train_y = full_set_coref_edges_obs['exists'].copy()
# Test set

test_x = coref_truth.copy()

test_x['address_similarity'] = 0.0
test_x['bow_cosine_similarity'] = 0.0
test_x['bow_jaccard_similarity'] = 0.0

for index in test_x.index:
    string_similarity = qgram.distance(node_to_email[test_x.iloc[index]['email']], node_to_email[test_x.iloc[index]['alt_email']])
    test_x.loc[index, 'address_similarity'] = string_similarity

    bow_cosine_similarity = distance.cosine(np.nan_to_num(list(email_nodes[email_nodes['id'] == test_x.iloc[index]['email']].iloc[0][5:-1])), np.nan_to_num(list(email_nodes[email_nodes['id'] == test_x.iloc[index]['alt_email']].iloc[0][5:-1])))
    test_x.loc[index, 'bow_cosine_similarity'] = bow_cosine_similarity

    bow_jaccard_similarity = distance.jaccard(np.nan_to_num(list(email_nodes[email_nodes['id'] == test_x.iloc[index]['email']].iloc[0][5:-1])), np.nan_to_num(list(email_nodes[email_nodes['id'] == test_x.iloc[index]['alt_email']].iloc[0][5:-1])))
    test_x.loc[index, 'bow_jaccard_similarity'] = bow_jaccard_similarity
    
test_x = test_x.drop(['email', 'alt_email', 'exists'], axis = 1)
test_y = coref_truth['exists'].copy()

classifier = LogisticRegression()
classifier.fit(train_x, train_y)
LogisticRegression()
classifier.score(test_x, test_y)
0.9966313763233878
predictions = classifier.predict(test_x)
f1_score(test_y, predictions)
0.5

Use probabilities for PSL observed data.

coref_truth
email alt_email exists
0 201 19 0.0
1 228 243 0.0
2 20 217 0.0
3 38 118 0.0
4 310 69 0.0
... ... ... ...
8307 228 20 0.0
8308 69 38 0.0
8309 209 107 0.0
8310 273 188 0.0
8311 241 195 0.0

8312 rows × 3 columns

local_CoRef_probabilities = classifier.predict_proba(test_x)
local_CoRef_obs = pd.DataFrame()
row_list = []
# build a table
for index, probabilities in enumerate(local_CoRef_probabilities):
    row_dict = {'email': int(coref_truth.iloc[index]['email']), 'alt_email': int(coref_truth.iloc[index]['alt_email']), 'exists': probabilities[1]}
    row_list.append(row_dict)
    #print(email_nodes_truth.iloc[index]['id'], "\t", title_map[classifier.classes_[class_index]], "\t", probability)

local_CoRef_obs = pd.concat([local_CoRef_obs, pd.DataFrame(row_list)], ignore_index=True)
local_CoRef_obs.to_csv('Local_CoRef_obs.csv', sep ='\t', index=False, header=False, columns=['email', 'alt_email', 'exists'])
# Sanity check the positive instances
set_1 = {(pair[0], pair[1]) for pair in zip(local_CoRef_obs[local_CoRef_obs['exists'] > 0.5]['email'], local_CoRef_obs[local_CoRef_obs['exists'] > 0.5]['alt_email'])}
set_2 = {(pair[0], pair[1]) for pair in zip(coref_truth[coref_truth['exists'] == 1]['email'], coref_truth[coref_truth['exists'] == 1]['alt_email'])}
set_2 & set_1
{(2, 3),
 (3, 2),
 (30, 31),
 (31, 30),
 (109, 114),
 (113, 115),
 (114, 109),
 (115, 113),
 (137, 138),
 (138, 137),
 (140, 141),
 (141, 140),
 (150, 152),
 (152, 150)}

Calculate Similarity Metrics

Entity Resolution

Email Address similarity


# qgram = QGram(1)
# print(qgram.distance('ABCD', 'ABCE'))
email_nodes[email_nodes['id'] == 268]['emailaddress'].iloc[0]
'laura.luce@enron.com'
email_pairs = {pair for pair in itertools.combinations(email_nodes['id'], 2)}

qgram = QGram(1)

sim_email = pd.DataFrame()
row_list = []

for pair in email_pairs:
    #print(pair)

    email_1 = email_nodes[email_nodes['id'] == pair[0]]['emailaddress'].iloc[0]
    email_2 = email_nodes[email_nodes['id'] == pair[1]]['emailaddress'].iloc[0]    
        
    string_similarity = qgram.distance(email_1, email_2)

    row_dict = {'email':pair[0], 'other_email':pair[1], 'qgram_sim':string_similarity}
    row_list.append(row_dict)
    
sim_email = pd.concat([sim_email, pd.DataFrame(row_list)])
# Sanity Check, this should print mostly "exists=1.0"

# for pair in zip(sim_email[sim_email['qgram_sim'] < 5]['email'], sim_email[sim_email['qgram_sim'] < 5]['other_email']):
#     print(full_set_coref_edges_data[(full_set_coref_edges_data['email'] == pair[0]) & (full_set_coref_edges_data['alt_email'] == pair[1])]['exists'])
#     print("--------")

Bag-of-words similarity

# These similarity features will be used for PSL predicates instead of a local classifier.
from scipy.spatial import distance
from matplotlib import pyplot as plt
sim_bow = pd.DataFrame()
row_list = []


for pair in email_pairs:
    entity_1 = email_nodes[email_nodes['id'] == pair[0]]
    entity_2 = email_nodes[email_nodes['id'] == pair[1]]
    
    bow_1 = entity_1.iloc[0][5:-1]
    bow_2 = entity_2.iloc[0][5:-1]
    # FIXME: Jaccard distance needs to be on sets
    row_dict = {'email':pair[0], 'other_email':pair[1], 'jaccard_sim_bow':distance.jaccard(list(bow_1), list(bow_2)), 'cosine_sim_bow':distance.cosine(np.nan_to_num(list(bow_1)), np.nan_to_num(list(bow_2)))}
    row_list.append(row_dict)
    
sim_bow = pd.concat([sim_bow, pd.DataFrame(row_list)])

sim_bow['jaccard_sim_bow'].describe()
count    22155.000000
mean         0.863022
std          0.121956
min          0.188539
25%          0.796010
50%          0.885781
75%          0.971054
max          1.000000
Name: jaccard_sim_bow, dtype: float64
sim_bow['cosine_sim_bow'].describe()
count    22155.000000
mean         0.585444
std          0.195204
min          0.060462
25%          0.442353
50%          0.547103
75%          0.734201
max          1.000000
Name: cosine_sim_bow, dtype: float64
# Since it's undirected, add in the reverse edges.
sim_bow_sym = sim_bow[['other_email', 'email', 'jaccard_sim_bow', 'cosine_sim_bow']].copy()
sim_bow_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)

sim_bow = pd.concat([sim_bow, sim_bow_sym])
# sim_bow.to_csv('Sim_Jaccard_Bow.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'jaccard_sim_bow'])
# sim_bow.to_csv('Sim_Cosine_Bow.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'cosine_sim_bow'])

Network-based similarity

# Load in the observed communication network
# need to rename the second 'email' column due to a key collision
# use a copy for safety
!cp $FILE_GROUND_TRUTH_COMMUNICATION_EDGES .
!sed -i 's/\temail/\tother_email/2g' enron.DIRECTED.sentto.tab

communication_edges = load_table('enron.DIRECTED.sentto.tab')

# FIXME: can probably omit this line
# manager_edges.drop('NOTEXIST,EXIST', axis=1, inplace=True)

resolve_column_type(communication_edges)

communication_edges.dtypes
Header:  DIRECTED	sentto

This is not a NODE file, so don't load this row

id              int64
email           int64
other_email     int64
numexchanged    int64
dtype: object
communication_edges
id email other_email numexchanged
0 2856 291 136 2
1 1937 299 1 1
2 516 316 14 3
3 1049 207 188 4
4 5131 174 318 18
... ... ... ... ...
2832 3497 117 243 4
2833 3665 198 219 129
2834 2831 25 291 1
2835 1462 202 280 10
2836 1708 271 231 2

2837 rows × 4 columns

# Add in existence
communication_edges['exists'] = 1.0

# Calculate the missing edges that were blocked.
missing_edges = {pair for pair in itertools.permutations(email_nodes['id'], 2)} - {pair for pair in zip(communication_edges['email'], communication_edges['other_email'])}
# add in the missing edges
row_list = []
for email, other_email in missing_edges:
    row_dict = {'email':email, 'other_email':other_email, 'exists':0 }
    row_list.append(row_dict)

full_set_communication_edges = pd.concat([communication_edges, pd.DataFrame(row_list)], ignore_index=True)
# full_set_communication_edges.to_csv('Communicates.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'exists'])
# prepare ground truth
coref_map = {(int(full_set_coref_edges_data.iloc[index]['email']), int(full_set_coref_edges_data.iloc[index]['alt_email'])):full_set_coref_edges_data.iloc[index]['exists'] for index in full_set_coref_edges_data.index}

sim_network = pd.DataFrame()
row_list = []


for id_1, id_2 in email_pairs:
    
    adjacent_nodes_1 = set(communication_edges[communication_edges['email'] == id_1]['other_email'])
    adjacent_nodes_2 = set(communication_edges[communication_edges['email'] == id_2]['other_email'])

    entity_1 = email_nodes[email_nodes['id'] == id_1]
    entity_2 = email_nodes[email_nodes['id'] == id_2]
    
    bow_1 = entity_1.iloc[0][5:-1]
    bow_2 = entity_2.iloc[0][5:-1]


    jaccard_sim =  len(adjacent_nodes_1 & adjacent_nodes_2) / len(adjacent_nodes_1 | adjacent_nodes_2 ) if len(adjacent_nodes_1 | adjacent_nodes_2) != 0 else 0
    dice_sim =  (2 * len(adjacent_nodes_1 & adjacent_nodes_2) ) / (len(adjacent_nodes_1) + len(adjacent_nodes_2)) if len(adjacent_nodes_1) + len(adjacent_nodes_2) != 0 else 0
    
    row_dict = {'email':id_1, 'other_email':id_2, 'jaccard_sim_network':jaccard_sim, 'dice_sim_network':dice_sim, 'jaccard_sim_bow':distance.jaccard(list(bow_1), list(bow_2)), 'cosine_sim_bow':distance.cosine(np.nan_to_num(list(bow_1)), np.nan_to_num(list(bow_2))), 'is_coref': coref_map[(id_1, id_2)]}
    row_list.append(row_dict)
    
sim_network = pd.concat([sim_network, pd.DataFrame(row_list)])
    
# Since it's undirected, add in the reverse edges.
sim_network_sym = sim_network[['other_email', 'email', 'jaccard_sim_network', 'dice_sim_network', 'jaccard_sim_bow', 'cosine_sim_bow', 'is_coref']].copy()
sim_network_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)

sim_network = pd.concat([sim_network, sim_network_sym], ignore_index=True)
sim_network
email other_email jaccard_sim_network dice_sim_network jaccard_sim_bow cosine_sim_bow is_coref
0 132 268 0.000000 0.000000 0.789947 0.413429 0.0
1 266 268 0.000000 0.000000 0.863876 0.521341 0.0
2 185 279 0.000000 0.000000 0.937806 0.615457 0.0
3 160 222 0.022222 0.043478 0.803442 0.464823 0.0
4 220 273 0.034483 0.066667 0.752396 0.448393 0.0
... ... ... ... ... ... ... ...
44305 83 138 0.000000 0.000000 0.994524 0.844011 0.0
44306 201 229 0.000000 0.000000 0.999804 0.968901 0.0
44307 150 303 0.157895 0.272727 0.788774 0.415224 0.0
44308 47 14 0.000000 0.000000 0.983376 0.828495 0.0
44309 150 41 0.000000 0.000000 0.893018 0.527444 0.0

44310 rows × 7 columns

sim_network.to_csv('Sim_network.csv', sep ='\t', index=False)

Feature analysis

duplicates = {(pair[0], pair[1]) for pair in zip(full_set_coref_edges_data[full_set_coref_edges_data['exists'] == 1.0]['email'], full_set_coref_edges_data[full_set_coref_edges_data['exists'] == 1.0]['alt_email'])}
filtered_frame = pd.DataFrame()
row_list = []

for index in sim_network.index:
    if (sim_network.iloc[index]['email'], sim_network.iloc[index]['other_email']) in duplicates:
        row_dict = {'email': sim_network.iloc[index]['email'], 'other_email': sim_network.iloc[index]['other_email'], "jaccard_sim_network": sim_network.iloc[index]['jaccard_sim_network'], "dice_sim_network": sim_network.iloc[index]['dice_sim_network']}
        row_list.append(row_dict)
        
filtered_frame = pd.concat([filtered_frame, pd.DataFrame(row_list)], ignore_index=True)
filtered_frame["jaccard_sim_network"].describe()
count    212.000000
mean       0.073877
std        0.142050
min        0.000000
25%        0.000000
50%        0.000000
75%        0.133333
max        0.816327
Name: jaccard_sim_network, dtype: float64
from matplotlib import pyplot as plt

fig, axis = plt.subplots(figsize = (10, 5))
axis.hist(filtered_frame["jaccard_sim_network"], bins = np.arange(0, 1.05, 0.05))
# Displaying the graph
plt.show()

(figure: histogram of jaccard_sim_network over coreferent pairs)

sim_network["jaccard_sim_network"].describe()
count    44310.000000
mean         0.033539
std          0.070383
min          0.000000
25%          0.000000
50%          0.000000
75%          0.040000
max          1.000000
Name: jaccard_sim_network, dtype: float64
filtered_frame["dice_sim_network"].describe()
count    212.000000
mean       0.112360
std        0.194552
min        0.000000
25%        0.000000
50%        0.000000
75%        0.235294
max        0.898876
Name: dice_sim_network, dtype: float64
# sim_network.to_csv('Sim_Jaccard_Network.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'jaccard_sim_network'])
fig, axis = plt.subplots(figsize = (10, 5))
axis.hist(filtered_frame["dice_sim_network"], bins = np.arange(0, 1.05, 0.05))
# Displaying the graph
plt.show()

(figure: histogram of dice_sim_network over coreferent pairs)

# TODO: why isn't this printing mostly "exists"?
# for pair in zip(sim_network[sim_network['jaccard_sim_network'] > 0.5]['email'], sim_network[sim_network['jaccard_sim_network'] > 0.5]['other_email']):
#     print(full_set_coref_edges_data[(full_set_coref_edges_data['email'] == pair[0]) & (full_set_coref_edges_data['alt_email'] == pair[1])]['exists'])
#     print("--------")
