Collective Graph Identification

In progress

Here, we solve several canonical problems in network analysis: entity resolution (determining when two observations correspond to the same entity), link prediction (inferring the existence of links), and node labeling (inferring hidden attributes).

This notebook walks through each step of converting the graph data into PSL data.

# No spaces around the equals signs, so these assignments also work as Bash variables.
FILE_GROUND_TRUTH_EMAIL_NODES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.NODE.email.tab'
FILE_GROUND_TRUTH_COREF_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.UNDIRECTED.coref.tab'
FILE_GROUND_TRUTH_MANAGES_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.UNDIRECTED.email-submgr.tab'
FILE_GROUND_TRUTH_COMMUNICATION_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.DIRECTED.sentto.tab'


FILE_SAMPLE_EMAIL_NODES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-6of6/sample-enron.NODE.email.tab'
FILE_SAMPLE_COREF_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-6of6/sample-enron.UNDIRECTED.coref.tab'
FILE_SAMPLE_MANAGES_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-6of6/sample-enron.UNDIRECTED.email-submgr.tab'

These functions help parse the .tab files.

import pandas as pd
import numpy as np
import re
import itertools # for cross products when filling in a full PSL dataset
# assigns types to each column
def resolve_column_type(table):
    for column in table.columns:
        if column in {'id', 'email', 'alt_email', 'other_email', 'numsent', 'numreceived', 'numexchanged'}:
            table[column] = table[column].astype(str).astype(float).astype(int)
        # convert bag-of-words columns to floats (since ints won't take NaNs)
        elif re.match("w-", column):
            table[column] = table[column].astype(str).astype(float)

# splits an element of a raw tab row into its name/value parts
# returns: list [feature_name, feature_value, ...]
def get_feature_tuple(feature):
    feature_data = re.split(r"[:=]", feature)
    return feature_data
    

# loads a *.tab file into a pandas DataFrame.
# returns: pd.DataFrame(columns=features)
def load_table(filename):

    # initialize the pandas dataframe
    node_data = pd.DataFrame()


    with open(filename) as infile:
        i = 0
        row_list = []
        for row in infile:
    
            #print('i is: ', i)

            if i == 0:
                # The first line is just a header; print it for reference
                print("Header: ", row)
            elif i == 1:
                # Prepare dataframe column labels
                tokens = row.split()
                if len(tokens) == 1:
                    print("This is not a NODE file, so don't load this row")
                else:  
                    features = ["id"] + [get_feature_tuple(feature)[1] for feature in tokens]
                    node_data = pd.DataFrame(columns=features)
            else:
          
                # this is to help the function generalize across the NODE and EDGE files.
                # EDGE files have a "|" separator, which must be removed so the row splits into features cleanly
                row = re.sub(r'\|','', row)
            
                tokens = row.split()

                # the first token doesn't need splitting
                row_dict = {'id':tokens[0]}
                row_dict.update({get_feature_tuple(token)[0]:get_feature_tuple(token)[1] for token in tokens[1:]})
                row_list.append(row_dict)
        
            i += 1
        
        # Fill in rows
        node_data = pd.concat([node_data, pd.DataFrame(row_list)], ignore_index=True)

    return node_data

Process the email nodes

Get ground truth

email_nodes = load_table(FILE_GROUND_TRUTH_EMAIL_NODES)
# remove the (unnecessary) second-to-last column (it comes from an ambiguous parse split)
email_nodes.drop('other,manager,specialist,director,executive', axis=1, inplace=True)
resolve_column_type(email_nodes)

email_nodes.dtypes
Header:  NODE	email

id                int64
emailaddress     object
numsent           int64
numreceived       int64
numexchanged      int64
                 ...   
w-kinney        float64
w-veselack      float64
w-mwhitt        float64
w-jarnold       float64
title            object
Length: 5119, dtype: object
email_nodes
id emailaddress numsent numreceived numexchanged w-gerald w-know w-busi w-mexicana w-transact ... w-bartlo w-columbiagassubject w-perron w-coh w-agl w-kinney w-veselack w-mwhitt w-jarnold title
0 283 c..koehler@enron.com 128 606 734 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
1 98 scott.goodell@enron.com 98 607 705 1.0 1.0 1.0 NaN 1.0 ... 1.0 1.0 1.0 1.0 1.0 1.0 1.0 NaN NaN specialist
2 183 p..south@enron.com 8 351 359 1.0 1.0 1.0 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
3 204 lavorato@enron.com 388 3 391 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
4 318 mike.grigsby@enron.com 3702 490 4192 1.0 1.0 1.0 1.0 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
206 114 vkamins@enron.com 0 12 12 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
207 270 david.duran@enron.com 7 145 152 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
208 282 sean.crandall@enron.com 94 138 232 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
209 243 kevin.presto@enron.com 248 198 446 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
210 131 dave.fuller@enron.com 165 129 294 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN manager

211 rows × 5119 columns

# Takes a table and fills in the missing (argument, value) pairs to form a full, sufficient set
# So far this only works with binary predicates
def fill_observed_missing_possibilities(table, arguments, values):
    total_possibilities = set(itertools.product(list(table[arguments[0]]), values))
    already_observed_possibilities = set((table.loc[index][arguments[0]], table.loc[index][arguments[1]]) for index in table.index)

    missing_possibilities = total_possibilities - already_observed_possibilities
    row_list = []
    for arg_0, arg_1 in missing_possibilities:
        row_dict = {arguments[0]:arg_0, arguments[1]:arg_1, arguments[2]:0 }
        row_list.append(row_dict)
        
    return pd.concat([table, pd.DataFrame(row_list)], ignore_index=True)
# Grab necessary columns, in preparation for dumping the whole ground truth data
email_nodes_data = email_nodes[['id','title']].copy()

# convert titles to integers, so PSL can ground faster
title_map = {"other": 0, "manager": 1, "specialist": 2, "director": 3, "executive": 4}

email_nodes_data = email_nodes_data.replace({'title': title_map})
email_nodes_data['exists'] = 1.0

full_set_email_has_label_data = fill_observed_missing_possibilities(email_nodes_data, ['id', 'title', 'exists'], list(title_map.values()))
full_set_email_has_label_data
id title exists
0 283 3 1.0
1 98 2 1.0
2 183 3 1.0
3 204 4 1.0
4 318 4 1.0
... ... ... ...
1050 182 4 0.0
1051 308 0 0.0
1052 46 0 0.0
1053 202 0 0.0
1054 26 3 0.0

1055 rows × 3 columns

# Outputs all data (obs+truth)
# full_set_email_has_label_data.to_csv('EmailHasLabel_data.csv', sep ='\t', index=False, header=False, columns=['id', 'title', 'exists'])

Calculate splits for PSL predicates

# Grab the sample from the original experiment, this will allow us to calculate observations and targets.
sample_email_nodes = load_table(FILE_SAMPLE_EMAIL_NODES)
# remove the (unnecessary) second-to-last column (it comes from an ambiguous parse split)
sample_email_nodes.drop('other,manager,specialist,director,executive', axis=1, inplace=True)
resolve_column_type(sample_email_nodes)
Header:  NODE	email
# Split data into observed and targets (AKA train and test)
email_nodes_obs = email_nodes[email_nodes['id'].isin(sample_email_nodes[sample_email_nodes['title'].notna()]['id'])]
email_nodes_truth = email_nodes[email_nodes['id'].isin(sample_email_nodes[sample_email_nodes['title'].isna()]['id'])]
# Grab the necessary columns
email_has_label_obs = email_nodes_obs[['id','title']].copy()
email_has_label_truth = email_nodes_truth[['id','title']].copy()

# convert titles to integers, so PSL can ground faster
email_has_label_obs = email_has_label_obs.replace({'title': title_map})
email_has_label_truth = email_has_label_truth.replace({'title': title_map})

# add in an existence column
email_has_label_obs['exists'] = 1.0
email_has_label_truth['exists'] = 1.0

# email_has_label_obs
# Add in the non-existent observations
full_set_email_has_label_obs = fill_observed_missing_possibilities(email_has_label_obs, ['id', 'title', 'exists'], list(title_map.values()))
full_set_email_has_label_truth = fill_observed_missing_possibilities(email_has_label_truth, ['id', 'title', 'exists'], list(title_map.values()))
# Outputs splits to file
full_set_email_has_label_obs.to_csv('EmailHasLabel_obs.csv', sep ='\t', index=False, header=False)
full_set_email_has_label_truth.to_csv('EmailHasLabel_truth.csv', sep ='\t', index=False, header=False)

Process the CoRef edges

Get ground truth

# need to rename the second 'email' column due to a key collision
# use a copy for safety; sed's '2g' flag replaces from the second match onward

!cp $FILE_GROUND_TRUTH_COREF_EDGES .
!sed -i 's/email/alt_email/2g' enron.UNDIRECTED.coref.tab

coref_edges = load_table('enron.UNDIRECTED.coref.tab')
resolve_column_type(coref_edges)

coref_edges.dtypes
Header:  UNDIRECTED	coref

This is not a NODE file, so don't load this row

id            int64
email         int64
alt_email     int64
exists       object
dtype: object
coref_edges
id email alt_email exists
0 2856 265 141 NOTEXIST
1 18491 310 295 NOTEXIST
2 516 272 183 NOTEXIST
3 5131 201 19 NOTEXIST
4 12417 138 78 NOTEXIST
... ... ... ... ...
20776 15003 135 208 NOTEXIST
20777 4450 197 47 NOTEXIST
20778 20302 248 25 NOTEXIST
20779 12985 222 118 NOTEXIST
20780 19684 248 54 NOTEXIST

20781 rows × 4 columns

# Grab necessary columns, in preparation for dumping the whole ground truth data
coref_edges_data = coref_edges[['email','alt_email', 'exists']].copy()

# map the existence column to 0/1, so PSL can ground faster
exists_map = {"NOTEXIST": 0.0, "EXIST": 1.0}
coref_edges_data = coref_edges_data.replace({'exists': exists_map})

# Since it's undirected, add in the reverse edges.
coref_edges_data_sym = coref_edges_data[['alt_email', 'email', 'exists']].copy()
coref_edges_data_sym.rename(columns = {'alt_email':'email', 'email':'alt_email'}, inplace = True)

coref_edges_data = pd.concat([coref_edges_data, coref_edges_data_sym])

# Calculate the missing edges that were blocked.
missing_edges = {pair for pair in itertools.permutations(email_nodes['id'], 2)} - {pair for pair in zip(coref_edges_data['email'], coref_edges_data['alt_email'])}

# add in the missing edges
row_list = []
for email, alt_email in missing_edges:
    row_dict = {'email':email, 'alt_email':alt_email, 'exists':0 }
    row_list.append(row_dict)

full_set_coref_edges_data = pd.concat([coref_edges_data, pd.DataFrame(row_list)], ignore_index=True)
# Outputs to file
# full_set_coref_edges_data.to_csv('CoRef_data.csv', sep ='\t', index=False, header=False, columns=['email', 'alt_email', 'exists'])
# Sanity Check: These should print pairs of the same people
# for index in full_set_coref_edges_data[full_set_coref_edges_data['exists'] == 1.0].index:
#     email_id  = full_set_coref_edges_data.loc[index, 'email']
#     alt_email_id = full_set_coref_edges_data.loc[index, 'alt_email']
    
#     print(email_nodes[email_nodes['id'] == email_id]['emailaddress'])
#     print(email_nodes[email_nodes['id'] == alt_email_id]['emailaddress'])
#     print("------------------------------------------------")
    

Calculate splits for PSL predicates

# Grab the sample from the original experiment, this will allow us to calculate observations and targets.
sample_coref_edges = load_table(FILE_SAMPLE_COREF_EDGES)
resolve_column_type(sample_coref_edges)

Header:  UNDIRECTED	coref

This is not a NODE file, so don't load this row
# Split data into observed and targets (AKA train and test)
coref_edges_obs = coref_edges[coref_edges['id'].isin(sample_coref_edges[sample_coref_edges['exists'].notna()]['id'])]
coref_edges_truth = coref_edges[coref_edges['id'].isin(sample_coref_edges[sample_coref_edges['exists'].isna()]['id'])]
# Grab the necessary columns
coref_obs = coref_edges_obs[['email', 'alt_email', 'exists']].copy()
coref_truth = coref_edges_truth[['email', 'alt_email', 'exists']].copy()

# map the existence column to 0/1, so PSL can ground faster
coref_obs = coref_obs.replace({'exists': exists_map})
coref_truth = coref_truth.replace({'exists': exists_map})

# Since it's undirected, add in the reverse edges.
coref_obs_sym = coref_obs[['alt_email', 'email', 'exists']].copy()
coref_truth_sym = coref_truth[['alt_email', 'email', 'exists']].copy()

coref_obs_sym.rename(columns = {'alt_email':'email', 'email':'alt_email'}, inplace = True)
coref_truth_sym.rename(columns = {'alt_email':'email', 'email':'alt_email'}, inplace = True)

coref_obs = pd.concat([coref_obs, coref_obs_sym], ignore_index=True)
coref_truth = pd.concat([coref_truth, coref_truth_sym], ignore_index=True)

# Calculate the missing edges that were blocked. Note the last set prevents cross-contamination between splits
missing_edges = {pair for pair in itertools.permutations(email_nodes['id'], 2)} - {pair for pair in zip(coref_obs['email'], coref_obs['alt_email'])} - {pair for pair in zip(coref_truth['email'], coref_truth['alt_email'])}

# add in the missing edges
row_list = []
for email, alt_email in missing_edges:
    row_dict = {'email':email, 'alt_email':alt_email, 'exists':0 }
    row_list.append(row_dict)

full_set_coref_edges_obs = pd.concat([coref_obs, pd.DataFrame(row_list)], ignore_index=True)
# Outputs splits to file
full_set_coref_edges_obs.to_csv('CoRef_obs.csv', sep ='\t', index=False, header=False, columns=['email', 'alt_email', 'exists'])
coref_truth.to_csv('CoRef_truth.csv', sep ='\t', index=False, header=False, columns=['email', 'alt_email', 'exists'])

Process the Manager edges

Get ground truth

# Load in the observed email-submgr.
# need to rename the second 'email' column due to a key collision
# use a copy for safety
!cp $FILE_GROUND_TRUTH_MANAGES_EDGES .
!sed -i 's/\temail/\tother_email/2g' enron.UNDIRECTED.email-submgr.tab

manager_edges = load_table('enron.UNDIRECTED.email-submgr.tab')

# FIXME: can probably omit this line
manager_edges.drop('NOTEXIST,EXIST', axis=1, inplace=True)

resolve_column_type(manager_edges)

manager_edges.dtypes
Header:  UNDIRECTED	email-submgr

id                int64
w-gerald        float64
w-know          float64
w-busi          float64
w-mexicana      float64
                 ...   
w-jarnold       float64
numexchanged      int64
email             int64
other_email       int64
exists           object
Length: 5118, dtype: object
manager_edges
id w-gerald w-know w-busi w-mexicana w-transact w-want w-thing w-review w-questar ... w-coh w-agl w-kinney w-veselack w-mwhitt w-jarnold numexchanged email other_email exists
0 2693 NaN 1.0 NaN NaN 1.0 1.0 NaN 1.0 NaN ... NaN NaN NaN NaN NaN NaN 6 286 324 EXIST
1 2634 NaN 1.0 NaN NaN NaN 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 3 74 37 NOTEXIST
2 1256 NaN 1.0 1.0 NaN NaN 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 14 148 131 NOTEXIST
3 1406 NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 3 57 313 EXIST
4 3129 NaN 1.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 43 34 236 NOTEXIST
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2046 2105 NaN 1.0 NaN NaN 1.0 NaN NaN 1.0 NaN ... NaN NaN NaN NaN NaN NaN 13 67 288 NOTEXIST
2047 2374 1.0 1.0 NaN NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 237 198 212 NOTEXIST
2048 3464 NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN ... NaN NaN NaN NaN NaN NaN 1 210 160 NOTEXIST
2049 531 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 9 316 188 NOTEXIST
2050 2026 NaN 1.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 8 34 273 NOTEXIST

2051 rows × 5118 columns

# Grab necessary columns, in preparation for dumping the whole ground truth data
manager_edges_data = manager_edges[['email','other_email', 'exists']].copy()

# map the existence column to 0/1, so PSL can ground faster
manager_edges_data = manager_edges_data.replace({'exists': exists_map})

# Since it's undirected, add in the reverse edges.
manager_edges_data_sym = manager_edges_data[['other_email', 'email', 'exists']].copy()
manager_edges_data_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)

manager_edges_data = pd.concat([manager_edges_data, manager_edges_data_sym])

# Calculate the missing edges that were blocked.
missing_edges = {pair for pair in itertools.permutations(email_nodes['id'], 2)} - {pair for pair in zip(manager_edges_data['email'], manager_edges_data['other_email'])}

# add in the missing edges
row_list = []
for email, other_email in missing_edges:
    row_dict = {'email':email, 'other_email':other_email, 'exists':0 }
    row_list.append(row_dict)

full_set_manager_edges_data = pd.concat([manager_edges_data, pd.DataFrame(row_list)], ignore_index=True)

# Outputs to file
# full_set_manager_edges_data.to_csv('Manages_data.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'exists'])

Calculate splits for PSL predicates

# Grab the sample from the original experiment, this will allow us to calculate observations and targets.
sample_manager_edges = load_table(FILE_SAMPLE_MANAGES_EDGES)
resolve_column_type(sample_manager_edges)
Header:  UNDIRECTED	email-submgr
# Split data into observed and targets (AKA train and test)
manager_edges_obs = manager_edges[manager_edges['id'].isin(sample_manager_edges[sample_manager_edges['exists'].notna()]['id'])]
manager_edges_truth = manager_edges[manager_edges['id'].isin(sample_manager_edges[sample_manager_edges['exists'].isna()]['id'])]
print(len(manager_edges_obs))
print(len(manager_edges_truth))
1642
409
# Grab the necessary columns
manages_obs = manager_edges_obs[['email', 'other_email', 'exists']].copy()
manages_truth = manager_edges_truth[['email', 'other_email', 'exists']].copy()

# map the existence column to 0/1, so PSL can ground faster
manages_obs = manages_obs.replace({'exists': exists_map})
manages_truth = manages_truth.replace({'exists': exists_map})

# Since it's undirected, add in the reverse edges.
manages_obs_sym = manages_obs[['other_email', 'email', 'exists']].copy()
manages_truth_sym = manages_truth[['other_email', 'email', 'exists']].copy()

manages_obs_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)
manages_truth_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)

manages_obs = pd.concat([manages_obs, manages_obs_sym])
manages_truth = pd.concat([manages_truth, manages_truth_sym])

# Calculate the missing edges that were blocked. Note the last set prevents cross-contamination between splits
missing_edges = {pair for pair in itertools.permutations(email_nodes['id'], 2)} - {pair for pair in zip(manages_obs['email'], manages_obs['other_email'])} - {pair for pair in zip(manages_truth['email'], manages_truth['other_email'])}

# add in the missing edges
row_list = []
for email, other_email in missing_edges:
    row_dict = {'email':email, 'other_email':other_email, 'exists':0 }
    row_list.append(row_dict)

full_set_manages_obs = pd.concat([manages_obs, pd.DataFrame(row_list)], ignore_index=True)
full_set_manages_obs.to_csv('Manages_obs.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'exists'])
manages_truth.to_csv('Manages_truth.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'exists'])

Train a local classifier/model

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

Node Labeling

email_nodes_obs
id emailaddress numsent numreceived numexchanged w-gerald w-know w-busi w-mexicana w-transact ... w-bartlo w-columbiagassubject w-perron w-coh w-agl w-kinney w-veselack w-mwhitt w-jarnold title
0 283 c..koehler@enron.com 128 606 734 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
1 98 scott.goodell@enron.com 98 607 705 1.0 1.0 1.0 NaN 1.0 ... 1.0 1.0 1.0 1.0 1.0 1.0 1.0 NaN NaN specialist
2 183 p..south@enron.com 8 351 359 1.0 1.0 1.0 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
4 318 mike.grigsby@enron.com 3702 490 4192 1.0 1.0 1.0 1.0 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
5 303 t..hodge@enron.com 95 570 665 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
206 114 vkamins@enron.com 0 12 12 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
207 270 david.duran@enron.com 7 145 152 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
208 282 sean.crandall@enron.com 94 138 232 NaN 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN director
209 243 kevin.presto@enron.com 248 198 446 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN executive
210 131 dave.fuller@enron.com 165 129 294 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN manager

171 rows × 5119 columns

train_x = email_nodes_obs.drop(['id', 'emailaddress', 'title', 'numsent', 'numreceived', 'numexchanged'], axis = 1).fillna(0)
train_y = email_nodes_obs['title']

test_x = email_nodes_truth.drop(['id', 'emailaddress', 'title', 'numsent', 'numreceived', 'numexchanged'], axis = 1).fillna(0)
test_y = email_nodes_truth['title']
# classifier = svm.LinearSVC()
classifier = LogisticRegression(max_iter=300)
classifier.fit(train_x, train_y)
LogisticRegression(max_iter=300)
predictions = classifier.predict(test_x)
classifier.score(test_x, test_y)
0.475
predictions
array(['executive', 'executive', 'other', 'executive', 'director',
       'executive', 'director', 'specialist', 'executive', 'executive',
       'specialist', 'executive', 'specialist', 'other', 'manager',
       'director', 'manager', 'specialist', 'other', 'executive',
       'executive', 'director', 'other', 'director', 'director',
       'director', 'specialist', 'director', 'specialist', 'specialist',
       'director', 'director', 'specialist', 'executive', 'executive',
       'director', 'director', 'executive', 'manager', 'director'],
      dtype=object)
# title_map = {"other": 0, "manager": 1, "specialist": 2, "director": 3, "executive": 4}

classifier.classes_
array(['director', 'executive', 'manager', 'other', 'specialist'],
      dtype=object)

Use probabilities for PSL observed data.

local_EmailHasTitle_probabilities = classifier.predict_proba(test_x)
local_EmailHasTitle_obs = pd.DataFrame()
row_list = []
# build a table
for index, probabilities in enumerate(local_EmailHasTitle_probabilities):
    for class_index, probability in enumerate(probabilities):
        row_dict = {'id': email_nodes_truth.iloc[index]['id'], 'title': title_map[classifier.classes_[class_index]], 'exists': probability}
        row_list.append(row_dict)
        #print(email_nodes_truth.iloc[index]['id'], "\t", title_map[classifier.classes_[class_index]], "\t", probability)

local_EmailHasTitle_obs = pd.concat([local_EmailHasTitle_obs, pd.DataFrame(row_list)], ignore_index=True)
local_EmailHasTitle_obs.to_csv('Local_EmailHasLabel_obs.csv', sep ='\t', index=False, header=False, columns=['id', 'title', 'exists'])
manager_edges_obs
id w-gerald w-know w-busi w-mexicana w-transact w-want w-thing w-review w-questar ... w-coh w-agl w-kinney w-veselack w-mwhitt w-jarnold numexchanged email other_email exists
0 2693 NaN 1.0 NaN NaN 1.0 1.0 NaN 1.0 NaN ... NaN NaN NaN NaN NaN NaN 6 286 324 EXIST
2 1256 NaN 1.0 1.0 NaN NaN 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 14 148 131 NOTEXIST
3 1406 NaN NaN NaN NaN 1.0 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 3 57 313 EXIST
4 3129 NaN 1.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 43 34 236 NOTEXIST
6 989 1.0 1.0 1.0 NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 313 195 318 NOTEXIST
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2044 4135 NaN 1.0 NaN NaN 1.0 1.0 1.0 NaN 1.0 ... NaN NaN NaN NaN NaN NaN 31 318 46 NOTEXIST
2046 2105 NaN 1.0 NaN NaN 1.0 NaN NaN 1.0 NaN ... NaN NaN NaN NaN NaN NaN 13 67 288 NOTEXIST
2047 2374 1.0 1.0 NaN NaN 1.0 1.0 1.0 1.0 NaN ... NaN NaN NaN NaN NaN NaN 237 198 212 NOTEXIST
2048 3464 NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN ... NaN NaN NaN NaN NaN NaN 1 210 160 NOTEXIST
2049 531 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 9 316 188 NOTEXIST

1642 rows × 5118 columns

train_x = manager_edges_obs.drop(['id', 'numexchanged', 'email', 'other_email', 'exists'], axis = 1).fillna(0)
train_y = manager_edges_obs['exists']

test_x = manager_edges_truth.drop(['id', 'numexchanged', 'email', 'other_email', 'exists'], axis = 1).fillna(0)
test_y = manager_edges_truth['exists']
train_x.dtypes
w-gerald      float64
w-know        float64
w-busi        float64
w-mexicana    float64
w-transact    float64
               ...   
w-agl         float64
w-kinney      float64
w-veselack    float64
w-mwhitt      float64
w-jarnold     float64
Length: 5113, dtype: object
classifier = LogisticRegression(max_iter=300)
classifier.fit(train_x, train_y)
LogisticRegression(max_iter=300)
predictions = classifier.predict(test_x)
classifier.score(test_x, test_y)
0.8997555012224939

Use local predictions for PSL observed data.

local_Manages_probabilities = classifier.predict_proba(test_x)
local_Manages_obs = pd.DataFrame()
row_list = []
# build a table (note: this writes the hard argmax label, not the soft probability)
for index, probabilities in enumerate(local_Manages_probabilities):
    row_dict = {'email': manager_edges_truth.iloc[index]['email'], 'other_email': manager_edges_truth.iloc[index]['other_email'], 'exists': exists_map[classifier.classes_[np.argmax(probabilities)]]}
    row_list.append(row_dict)
    #print(email_nodes_truth.iloc[index]['id'], "\t", title_map[classifier.classes_[class_index]], "\t", probability)

local_Manages_obs = pd.concat([local_Manages_obs, pd.DataFrame(row_list)])
local_Manages_obs
email other_email exists
0 74 37 0.0
1 24 170 0.0
2 174 136 0.0
3 108 46 0.0
4 209 202 0.0
... ... ... ...
404 117 285 0.0
405 219 193 0.0
406 91 167 0.0
407 101 208 0.0
408 34 273 0.0

409 rows × 3 columns

# Since it's undirected, add in the reverse edges.
local_Manages_obs_sym = local_Manages_obs[['other_email', 'email', 'exists']].copy()

local_Manages_obs_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)

local_Manages_obs = pd.concat([local_Manages_obs, local_Manages_obs_sym])

local_Manages_obs.to_csv('Local_Manages_obs.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'exists'])

Entity Resolution

from strsimpy.qgram import QGram
from scipy.spatial import distance

qgram = QGram(1)

node_to_email = dict(zip(email_nodes['id'], email_nodes['emailaddress']))

# Calculate features for Training set

train_x = full_set_coref_edges_obs.copy()

train_x['address_similarity'] = 0.0
train_x['bow_cosine_similarity'] = 0.0
train_x['bow_jaccard_similarity'] = 0.0


for index in train_x.index:
    string_similarity = qgram.distance(node_to_email[train_x.iloc[index]['email']], node_to_email[train_x.iloc[index]['alt_email']])
    train_x.loc[index, 'address_similarity'] = string_similarity

    bow_cosine_similarity = distance.cosine(np.nan_to_num(list(email_nodes[email_nodes['id'] == train_x.iloc[index]['email']].iloc[0][5:-1])), np.nan_to_num(list(email_nodes[email_nodes['id'] == train_x.iloc[index]['alt_email']].iloc[0][5:-1])))
    train_x.loc[index, 'bow_cosine_similarity'] = bow_cosine_similarity

    bow_jaccard_similarity = distance.jaccard(np.nan_to_num(list(email_nodes[email_nodes['id'] == train_x.iloc[index]['email']].iloc[0][5:-1])), np.nan_to_num(list(email_nodes[email_nodes['id'] == train_x.iloc[index]['alt_email']].iloc[0][5:-1])))
    train_x.loc[index, 'bow_jaccard_similarity'] = bow_jaccard_similarity
    

train_x = train_x.drop(['email', 'alt_email', 'exists'], axis = 1)
train_y = full_set_coref_edges_obs['exists'].copy()
# Test set

test_x = coref_truth.copy()

test_x['address_similarity'] = 0.0
test_x['bow_cosine_similarity'] = 0.0
test_x['bow_jaccard_similarity'] = 0.0

for index in test_x.index:
    string_similarity = qgram.distance(node_to_email[test_x.iloc[index]['email']], node_to_email[test_x.iloc[index]['alt_email']])
    test_x.loc[index, 'address_similarity'] = string_similarity

    bow_cosine_similarity = distance.cosine(np.nan_to_num(list(email_nodes[email_nodes['id'] == test_x.iloc[index]['email']].iloc[0][5:-1])), np.nan_to_num(list(email_nodes[email_nodes['id'] == test_x.iloc[index]['alt_email']].iloc[0][5:-1])))
    test_x.loc[index, 'bow_cosine_similarity'] = bow_cosine_similarity

    bow_jaccard_similarity = distance.jaccard(np.nan_to_num(list(email_nodes[email_nodes['id'] == test_x.iloc[index]['email']].iloc[0][5:-1])), np.nan_to_num(list(email_nodes[email_nodes['id'] == test_x.iloc[index]['alt_email']].iloc[0][5:-1])))
    test_x.loc[index, 'bow_jaccard_similarity'] = bow_jaccard_similarity
    
test_x = test_x.drop(['email', 'alt_email', 'exists'], axis = 1)
test_y = coref_truth['exists'].copy()

classifier = LogisticRegression()
classifier.fit(train_x, train_y)
LogisticRegression()
classifier.score(test_x, test_y)
0.9966313763233878
predictions = classifier.predict(test_x)
f1_score(test_y, predictions)
0.5

Use probabilities for PSL observed data.

coref_truth
email alt_email exists
0 201 19 0.0
1 228 243 0.0
2 20 217 0.0
3 38 118 0.0
4 310 69 0.0
... ... ... ...
8307 228 20 0.0
8308 69 38 0.0
8309 209 107 0.0
8310 273 188 0.0
8311 241 195 0.0

8312 rows × 3 columns

local_CoRef_probabilities = classifier.predict_proba(test_x)
local_CoRef_obs = pd.DataFrame()
row_list = []
# build a table
for index, probabilities in enumerate(local_CoRef_probabilities):
    row_dict = {'email': int(coref_truth.iloc[index]['email']), 'alt_email': int(coref_truth.iloc[index]['alt_email']), 'exists': probabilities[1]}
    row_list.append(row_dict)
    #print(email_nodes_truth.iloc[index]['id'], "\t", title_map[classifier.classes_[class_index]], "\t", probability)

local_CoRef_obs = pd.concat([local_CoRef_obs, pd.DataFrame(row_list)], ignore_index=True)
local_CoRef_obs.to_csv('Local_CoRef_obs.csv', sep ='\t', index=False, header=False, columns=['email', 'alt_email', 'exists'])
# Sanity check the positive instances
set_1 = {(pair[0], pair[1]) for pair in zip(local_CoRef_obs[local_CoRef_obs['exists'] > 0.5]['email'], local_CoRef_obs[local_CoRef_obs['exists'] > 0.5]['alt_email'])}
set_2 = {(pair[0], pair[1]) for pair in zip(coref_truth[coref_truth['exists'] == 1]['email'], coref_truth[coref_truth['exists'] == 1]['alt_email'])}
set_2 & set_1
{(2, 3),
 (3, 2),
 (30, 31),
 (31, 30),
 (109, 114),
 (113, 115),
 (114, 109),
 (115, 113),
 (137, 138),
 (138, 137),
 (140, 141),
 (141, 140),
 (150, 152),
 (152, 150)}

Calculate Similarity Metrics

Entity Resolution

Email Address similarity


# qgram = QGram(1)
# print(qgram.distance('ABCD', 'ABCE'))
email_nodes[email_nodes['id'] == 268]['emailaddress'].iloc[0]
'laura.luce@enron.com'
email_pairs = {pair for pair in itertools.combinations(email_nodes['id'], 2)}

qgram = QGram(1)

sim_email = pd.DataFrame()
row_list = []

for pair in email_pairs:
    #print(pair)

    email_1 = email_nodes[email_nodes['id'] == pair[0]]['emailaddress'].iloc[0]
    email_2 = email_nodes[email_nodes['id'] == pair[1]]['emailaddress'].iloc[0]    
        
    string_similarity = qgram.distance(email_1, email_2)

    row_dict = {'email':pair[0], 'other_email':pair[1], 'qgram_sim':string_similarity}
    row_list.append(row_dict)
    
sim_email = pd.concat([sim_email, pd.DataFrame(row_list)])
# Sanity Check, this should print mostly "exists=1.0"

# for pair in zip(sim_email[sim_email['qgram_sim'] < 5]['email'], sim_email[sim_email['qgram_sim'] < 5]['other_email']):
#     print(full_set_coref_edges_data[(full_set_coref_edges_data['email'] == pair[0]) & (full_set_coref_edges_data['alt_email'] == pair[1])]['exists'])
#     print("--------")

Bag-of-words similarity

# These similarity features will be used for PSL predicates instead of a local classifier.
from scipy.spatial import distance
from matplotlib import pyplot as plt
sim_bow = pd.DataFrame()
row_list = []


for pair in email_pairs:
    entity_1 = email_nodes[email_nodes['id'] == pair[0]]
    entity_2 = email_nodes[email_nodes['id'] == pair[1]]
    
    bow_1 = entity_1.iloc[0][5:-1]
    bow_2 = entity_2.iloc[0][5:-1]
    # FIXME: Jaccard distance needs to be on sets
    row_dict = {'email':pair[0], 'other_email':pair[1], 'jaccard_sim_bow':distance.jaccard(list(bow_1), list(bow_2)), 'cosine_sim_bow':distance.cosine(np.nan_to_num(list(bow_1)), np.nan_to_num(list(bow_2)))}
    row_list.append(row_dict)
    
sim_bow = pd.concat([sim_bow, pd.DataFrame(row_list)])

sim_bow['jaccard_sim_bow'].describe()
count    22155.000000
mean         0.863022
std          0.121956
min          0.188539
25%          0.796010
50%          0.885781
75%          0.971054
max          1.000000
Name: jaccard_sim_bow, dtype: float64
sim_bow['cosine_sim_bow'].describe()
count    22155.000000
mean         0.585444
std          0.195204
min          0.060462
25%          0.442353
50%          0.547103
75%          0.734201
max          1.000000
Name: cosine_sim_bow, dtype: float64
# Since it's undirected, add in the reverse edges.
sim_bow_sym = sim_bow[['other_email', 'email', 'jaccard_sim_bow', 'cosine_sim_bow']].copy()
sim_bow_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)

sim_bow = pd.concat([sim_bow, sim_bow_sym])
# sim_bow.to_csv('Sim_Jaccard_Bow.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'jaccard_sim_bow'])
# sim_bow.to_csv('Sim_Cosine_Bow.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'cosine_sim_bow'])

Network-based similarity

# Load in the observed communication network
# need to rename the second 'email' column due to a key collision
# use a copy for safety
!cp $FILE_GROUND_TRUTH_COMMUNICATION_EDGES .
!sed -i 's/\temail/\tother_email/2g' enron.DIRECTED.sentto.tab

communication_edges = load_table('enron.DIRECTED.sentto.tab')

# FIXME: can probably omit this line
# manager_edges.drop('NOTEXIST,EXIST', axis=1, inplace=True)

resolve_column_type(communication_edges)

communication_edges.dtypes
Header:  DIRECTED	sentto

This is not a NODE file, so don't load this row

id              int64
email           int64
other_email     int64
numexchanged    int64
dtype: object
communication_edges
id email other_email numexchanged
0 2856 291 136 2
1 1937 299 1 1
2 516 316 14 3
3 1049 207 188 4
4 5131 174 318 18
... ... ... ... ...
2832 3497 117 243 4
2833 3665 198 219 129
2834 2831 25 291 1
2835 1462 202 280 10
2836 1708 271 231 2

2837 rows × 4 columns

# Add in existence
communication_edges['exists'] = 1.0

# Calculate the missing edges that were blocked.
missing_edges = {pair for pair in itertools.permutations(email_nodes['id'], 2)} - {pair for pair in zip(communication_edges['email'], communication_edges['other_email'])}
# add in the missing edges
row_list = []
for email, other_email in missing_edges:
    row_dict = {'email':email, 'other_email':other_email, 'exists':0 }
    row_list.append(row_dict)

full_set_communication_edges = pd.concat([communication_edges, pd.DataFrame(row_list)], ignore_index=True)
# full_set_communication_edges.to_csv('Communicates.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'exists'])
# prepare ground truth
coref_map = {(int(full_set_coref_edges_data.iloc[index]['email']), int(full_set_coref_edges_data.iloc[index]['alt_email'])):full_set_coref_edges_data.iloc[index]['exists'] for index in full_set_coref_edges_data.index}

sim_network = pd.DataFrame()
row_list = []


for id_1, id_2 in email_pairs:
    
    adjacent_nodes_1 = set(communication_edges[communication_edges['email'] == id_1]['other_email'])
    adjacent_nodes_2 = set(communication_edges[communication_edges['email'] == id_2]['other_email'])

    entity_1 = email_nodes[email_nodes['id'] == id_1]
    entity_2 = email_nodes[email_nodes['id'] == id_2]
    
    bow_1 = entity_1.iloc[0][5:-1]
    bow_2 = entity_2.iloc[0][5:-1]


    jaccard_sim =  len(adjacent_nodes_1 & adjacent_nodes_2) / len(adjacent_nodes_1 | adjacent_nodes_2 ) if len(adjacent_nodes_1 | adjacent_nodes_2) != 0 else 0
    dice_sim =  (2 * len(adjacent_nodes_1 & adjacent_nodes_2) ) / (len(adjacent_nodes_1) + len(adjacent_nodes_2)) if len(adjacent_nodes_1) + len(adjacent_nodes_2) != 0 else 0
    
    row_dict = {'email':id_1, 'other_email':id_2, 'jaccard_sim_network':jaccard_sim, 'dice_sim_network':dice_sim, 'jaccard_sim_bow':distance.jaccard(list(bow_1), list(bow_2)), 'cosine_sim_bow':distance.cosine(np.nan_to_num(list(bow_1)), np.nan_to_num(list(bow_2))), 'is_coref': coref_map[(id_1, id_2)]}
    row_list.append(row_dict)
    
sim_network = pd.concat([sim_network, pd.DataFrame(row_list)])
    
# Since it's undirected, add in the reverse edges.
sim_network_sym = sim_network[['other_email', 'email', 'jaccard_sim_network', 'dice_sim_network', 'jaccard_sim_bow', 'cosine_sim_bow', 'is_coref']].copy()
sim_network_sym.rename(columns = {'other_email':'email', 'email':'other_email'}, inplace = True)

sim_network = pd.concat([sim_network, sim_network_sym], ignore_index=True)
sim_network
email other_email jaccard_sim_network dice_sim_network jaccard_sim_bow cosine_sim_bow is_coref
0 132 268 0.000000 0.000000 0.789947 0.413429 0.0
1 266 268 0.000000 0.000000 0.863876 0.521341 0.0
2 185 279 0.000000 0.000000 0.937806 0.615457 0.0
3 160 222 0.022222 0.043478 0.803442 0.464823 0.0
4 220 273 0.034483 0.066667 0.752396 0.448393 0.0
... ... ... ... ... ... ... ...
44305 83 138 0.000000 0.000000 0.994524 0.844011 0.0
44306 201 229 0.000000 0.000000 0.999804 0.968901 0.0
44307 150 303 0.157895 0.272727 0.788774 0.415224 0.0
44308 47 14 0.000000 0.000000 0.983376 0.828495 0.0
44309 150 41 0.000000 0.000000 0.893018 0.527444 0.0

44310 rows × 7 columns

sim_network.to_csv('Sim_network.csv', sep ='\t', index=False)

Feature analysis

duplicates = {(pair[0], pair[1]) for pair in zip(full_set_coref_edges_data[full_set_coref_edges_data['exists'] == 1.0]['email'], full_set_coref_edges_data[full_set_coref_edges_data['exists'] == 1.0]['alt_email'])}
filtered_frame = pd.DataFrame()
row_list = []

for index in sim_network.index:
    if (sim_network.iloc[index]['email'], sim_network.iloc[index]['other_email']) in duplicates:
        row_dict = {'email': sim_network.iloc[index]['email'], 'other_email': sim_network.iloc[index]['other_email'], "jaccard_sim_network": sim_network.iloc[index]['jaccard_sim_network'], "dice_sim_network": sim_network.iloc[index]['dice_sim_network']}
        row_list.append(row_dict)
        
filtered_frame = pd.concat([filtered_frame, pd.DataFrame(row_list)], ignore_index=True)
filtered_frame["jaccard_sim_network"].describe()
count    212.000000
mean       0.073877
std        0.142050
min        0.000000
25%        0.000000
50%        0.000000
75%        0.133333
max        0.816327
Name: jaccard_sim_network, dtype: float64
from matplotlib import pyplot as plt

fig, axis = plt.subplots(figsize = (10, 5))
axis.hist(filtered_frame["jaccard_sim_network"], bins = np.arange(0, 1.05, 0.05))
# Displaying the graph
plt.show()

(figure: histogram of jaccard_sim_network over coreferent pairs)

sim_network["jaccard_sim_network"].describe()
count    44310.000000
mean         0.033539
std          0.070383
min          0.000000
25%          0.000000
50%          0.000000
75%          0.040000
max          1.000000
Name: jaccard_sim_network, dtype: float64
filtered_frame["dice_sim_network"].describe()
count    212.000000
mean       0.112360
std        0.194552
min        0.000000
25%        0.000000
50%        0.000000
75%        0.235294
max        0.898876
Name: dice_sim_network, dtype: float64
# sim_network.to_csv('Sim_Jaccard_Network.csv', sep ='\t', index=False, header=False, columns=['email', 'other_email', 'jaccard_sim_network'])
fig, axis = plt.subplots(figsize = (10, 5))
axis.hist(filtered_frame["dice_sim_network"], bins = np.arange(0, 1.05, 0.05))
# Displaying the graph
plt.show()

(figure: histogram of dice_sim_network over coreferent pairs)

# TODO: why isn't this printing mostly "exists"?
# for pair in zip(sim_network[sim_network['jaccard_sim_network'] > 0.5]['email'], sim_network[sim_network['jaccard_sim_network'] > 0.5]['other_email']):
#     print(full_set_coref_edges_data[(full_set_coref_edges_data['email'] == pair[0]) & (full_set_coref_edges_data['alt_email'] == pair[1])]['exists'])
#     print("--------")
