-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_intgraph.py
135 lines (109 loc) · 5.89 KB
/
preprocess_intgraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# List of columns that will be generated (a single csv file covering all mailing lists; the list name is stored in its own column):
# sender/receiver email - email address of the person sending/receiving the email (sender is sending a message that is replying to something previously sent by the receiver)
# message_id, uid, uid_validity - these fields uniquely identify the current message and can be used to look it up in the mailarchive
# reply_to_message_id, reply_to_uid, reply_to_uid_validity - identify the message that is being replied to
# timestamp - time of the message
# mailing_list_name - name of the mailing list
# Things that were in the previous version but slightly different
# There is no PersonID, only email, but it will be easy to map email -> person_id, when we decide how we generate the person_ids and where we keep them
# unique message identifiers (message_id, uid, and uid_validity) are included for the current and the replied to message
# - these can be used to look up the message in the mailarchive if extra metadata about them is needed
# Things missing wrt previous version:
# type: "reply_to" or "reply_self" is no longer there (we can bring this back if needed, not sure)
# mailing list type (at the group, year, month level) - e.g., "wg", "meeting", or "iab" ... is also omitted, if it is needed you need to pull this straight from the DT
# - there is some code from Prashant somewhere that does exactly this but atm is not included here
# "time since first email", and "max time" for both sender and recipient, are omitted, can be brought back if relevant
# Notes:
# spam is *not excluded*, but can be filtered by looking up the message in the mailarchive and getting its metadata (see metadata-spam in the documentation)
# automated mailing lists are also *not excluded* but can be filtered by ignoring the lists in the file automated_emails.txt (there is probably code somewhere that generates that too)
import re
import traceback
from collections import deque

import pandas as pd
from ietfdata.mailarchive2 import *
# Debug switch: when True, make_intgraph() skips every mailing list except "ietf-languages".
TEST_MODE = False
def header_message_id(e):
    """Return the first "message-id" header value of *e*, or None if it has none.

    *e* must provide ``header(name)`` returning a sequence of header values.
    """
    message_ids = e.header("message-id")
    return message_ids[0] if len(message_ids) > 0 else None
def iterate_over_thread_E(
    node_pair: "tuple[Envelope, Envelope | None]",
) -> "list[tuple[Envelope, Envelope | None]]":
    """Walk a thread breadth-first and return every edge that was traversed.

    Version of "iterate over thread" that explicitly returns the edges:
    each result is a ``(message, parent)`` pair, where ``parent`` is the
    message that ``message`` was reached from.

    Args:
        node_pair: ``(start_message, parent)`` to start from. The caller in
            this file passes ``None`` as the parent of a thread root.

    Returns:
        The ``(message, parent)`` pairs in breadth-first order, starting with
        ``node_pair`` itself. Messages without a Message-ID, and messages
        whose Message-ID was already seen, are skipped together with their
        replies.
    """
    visited_ids = set()
    edges = []
    # deque gives O(1) pops from the left; list.pop(0) is O(n) per pop.
    queue = deque([node_pair])
    while queue:
        current_node, parent_node = queue.popleft()
        curr_id = header_message_id(current_node)
        # curr_id == None seems to make the db replies() request below hang,
        # so such messages are skipped before replies() is ever called.
        if curr_id is None or curr_id in visited_ids:
            continue
        edges.append((current_node, parent_node))
        visited_ids.add(curr_id)
        queue.extend((child, current_node) for child in current_node.replies())
    return edges
# Primary pattern. Besides "local@domain.tld" it also accepts obfuscated
# separators such as " (at) " or "AT", expects a 2-3 character final label and
# requires one non-word character right after the address. It has three capture
# groups, so re.findall() yields tuples whose first item is the whole address.
ren = r'(?:\.?)([\w\-_+#~!$&\'\.]+(?<!\.)(@|[ ]?\(?[ ]?(at|AT)[ ]?\)?[ ]?)(?<!\.)[\w]+[\w\-\.]*\.[a-zA-Z-]{2,3})(?:[^\w])'
# Fallback pattern: plain "local@domain.tld". It has a single capture group, so
# re.findall() yields plain strings.
ren2 = r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'


def extract_addr(e):
    """Return the first e-mail address found in *e*, lower-cased, or "" if none.

    Apostrophes are swapped for a placeholder made of word characters before
    matching and restored afterwards. The primary pattern ``ren`` is tried
    first; ``ren2`` is only consulted when ``ren`` finds nothing.
    """
    masked = e.replace("'", "__apostrophe__")
    primary_hits = re.findall(ren, str(masked))
    if primary_hits:
        # Tuples of groups: item 0 is the complete address.
        found = primary_hits[0][0]
    else:
        fallback_hits = re.findall(ren2, str(masked))
        found = fallback_hits[0] if fallback_hits else ""
    return found.replace("__apostrophe__", "'").lower()
def process_email_header(h):
    """Turn an address header into a comma-separated string of e-mail addresses.

    Args:
        h: sequence of header values. The library usually returns a single
            entry holding comma-separated name/e-mail pairs; if it returns
            more than one entry, only the first is processed.

    Returns:
        The addresses extracted from the first entry, joined with ",".
        Returns "" when *h* is empty or no address could be extracted.
    """
    if len(h) == 0:
        return ""
    extracted = (extract_addr(fragment) for fragment in h[0].split(","))
    # Splitting on "," also cuts through display names that contain a comma
    # (e.g. "Last, First" <addr>); those fragments yield "" and are dropped so
    # they do not show up as empty entries in the result.
    return ",".join(addr for addr in extracted if addr)
def make_intgraph(ma, outfilename):
    """Write the reply edges of all mailing lists in *ma* to one CSV file.

    For every mailing list, each thread root is walked with
    iterate_over_thread_E() and one row is recorded per (message, parent)
    pair; thread roots themselves have no parent and produce no row.

    Args:
        ma: mail archive object providing ``mailing_list_names()`` and
            ``mailing_list(name)``.
        outfilename: path of the CSV file to write (without an index column).

    When TEST_MODE is set, only the "ietf-languages" list is processed.
    """
    columns = [
        "current_from_email",
        "current_to_email",
        "reply_to_from_email",
        "current_message_id",
        "current_uid",
        "current_uidvalidity",
        "reply_to_message_id",
        "reply_to_uid",
        "reply_to_uidvalidity",
        "date",
        "mailing_list_name",
    ]
    data = []
    for ml_name in ma.mailing_list_names():
        print("Working on list:" + ml_name)
        if TEST_MODE and ml_name != "ietf-languages":
            continue
        ml = ma.mailing_list(ml_name)
        total = 0  # messages visited in this list, including thread roots
        for thread_msgs in ml.threads(this_list_only=True).values():
            # Only the first entry stored for each thread is used as the root.
            thr_root = thread_msgs[0]
            for msg, parent in iterate_over_thread_E((thr_root, None)):
                total += 1
                if parent is None:
                    # Only the case for the first message in the thread.
                    continue
                data.append((
                    process_email_header(msg.header("from")),
                    process_email_header(msg.header("to")),
                    process_email_header(parent.header("from")),
                    header_message_id(msg),
                    msg.uid(),
                    msg.uidvalidity(),
                    header_message_id(parent),
                    parent.uid(),
                    parent.uidvalidity(),
                    msg.date(),
                    ml_name,
                ))
        print("Finished, total msgs: " + str(total))
    out_df = pd.DataFrame(data, columns=columns)
    out_df.to_csv(outfilename, index=False)