-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess-seg-to-db.py
50 lines (36 loc) · 1.57 KB
/
preprocess-seg-to-db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os
import pickle
import time
import traceback

from ietfdata.mailarchive2 import *

from email_segmentation import SegmentationSerializer, iterate_over_thread, header_message_id
# Wall-clock timestamp taken when the script starts.
total_start_time = time.time()

# SECURITY: the MongoDB credentials were committed to source control. They can
# now be supplied through the MONGODB_USERNAME / MONGODB_PASSWORD environment
# variables; the literals remain only as a backward-compatible fallback and
# should be rotated and removed from this file.
ma = MailArchive(
    mongodb_username=os.environ.get("MONGODB_USERNAME", "admin"),
    mongodb_password=os.environ.get("MONGODB_PASSWORD", "DzKvurBMsKtAEOQ9s9r"),
)

# When True, only the "100attendees" mailing list is processed.
TEST_MODE = False
def _load_segmentations(ml_name, directory="./segmented-texts"):
    """Load the pickled segmentation mapping for one mailing list.

    Args:
        ml_name: Mailing list name; the file read is
            ``<directory>/<ml_name>-full.pickle``.
        directory: Directory that holds the pickle files.

    Returns:
        The unpickled object. The caller uses it as a mapping keyed by
        message id.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Whatever ``pickle.load`` raises for an unreadable file.
    """
    # SECURITY: unpickling can execute arbitrary code, so these files must
    # come from a trusted source (presumably an earlier local segmentation
    # run -- confirm).
    path = os.path.join(directory, ml_name + "-full.pickle")
    # The context manager closes the handle even if unpickling fails.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


def _store_segmentations(ml, mid2seg):
    """Write the "seg" metadata for every message reachable from ``ml``'s threads.

    Args:
        ml: Mailing list object returned by ``ma.mailing_list``.
        mid2seg: Mapping from message id to the segmentation to serialise.

    Returns:
        A ``(total, missing)`` tuple: the number of messages visited and how
        many of them had no entry in ``mid2seg``.
    """
    total = missing = 0
    mlthreads = ml.threads(this_list_only=True)
    for thread_root_key in mlthreads:
        # The first entry stored under each key is the starting point for
        # the thread traversal.
        thr_root = mlthreads[thread_root_key][0]
        for msg in iterate_over_thread(thr_root):
            total += 1
            mid = header_message_id(msg)
            if mid not in mid2seg:
                missing += 1
                continue
            seg_json = SegmentationSerializer().serialize_to_json(mid2seg[mid])
            # Clear first, presumably so a rerun replaces the stored
            # segmentation instead of accumulating entries -- confirm.
            msg.clear_metadata("seg")
            msg.add_metadata("seg", "data", seg_json)
    return total, missing


if __name__ == "__main__":
    for ml_name in ma.mailing_list_names():
        if TEST_MODE and ml_name != "100attendees":
            continue
        ml = ma.mailing_list(ml_name)
        try:
            mid2seg = _load_segmentations(ml_name)
        except Exception:
            # Best effort: report the failure and move on to the next list.
            # Not a bare ``except`` so that Ctrl-C still stops the run.
            print("Error loading pickle for " + ml_name)
            traceback.print_exc()
            continue
        total, missing = _store_segmentations(ml, mid2seg)
        # Lists without any message print nothing (and would divide by zero).
        if total:
            print(
                f"Finished for mailing list {ml_name} --> "
                f"Number of missing segmentations {missing} / {total} "
                f"({missing / total:.3f})"
            )