-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
62 lines (49 loc) · 1.54 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pickle
import sys

import numpy as np
import pandas as pd

# Star imports are kept in their original order: on a name collision the
# later module wins, so reordering them could change which helper is used.
from preprocess import *
from features import *
from form_matrix import *
from predict import *
from variables import *
"""
1. ./compile.sh
2. ./train.sh trainfile.json devfile.json model_file
3. ./test.sh model_file testfile.json outputfile.txt
"""
############################## VARIABLES ###########################################
# Command-line arguments (see the usage notes above): training data,
# development data, and the path the trained model is written to.
if len(sys.argv) < 4:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("usage: train.py trainfile.json devfile.json model_file")
FILE_TRAIN = sys.argv[1]
FILE_DEV = sys.argv[2]
MODEL_FILE = sys.argv[3]


def _save_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file even if dumping fails."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# clean, stem
# Both files are loaded in training mode and merged, so the model is fitted
# on train + dev together; nothing is held out for evaluation in this script.
df_train = get_dataframe(FILE_TRAIN, train=True)
df_dev = get_dataframe(FILE_DEV, train=True)
# NOTE(review): without ignore_index=True the merged frame keeps each input's
# own row labels, which may repeat -- confirm no later step relies on a
# unique index.
df = pd.concat([df_train, df_dev], axis=0)

# count length of each review
df = count_length(df)

# features related to sentiment lexicons in reviews
df = get_sentiments(df, train=True, get_scores=get_scores)

# count different types of POS tags (optional: training is fast without this)
if TAG:
    df = run_parallel(df, count_POS_tags)

# get features from words
if use_tfidf:
    X_train_vocab, vectorizer_vocab = count_vocab_tfidf(df, params)
else:
    X_train_vocab, vectorizer_vocab = count_vocab(df, params)

# save vectorizer to pickle
_save_pickle(vectorizer_vocab, pickle_vocab_vector)

# Either stack the extra per-review features onto the vocabulary matrix or
# train on the vocabulary features alone.
if stack_features:
    X_train = get_feat_matrix(df, X_train_vocab)
else:
    X_train = X_train_vocab
Y_train = np.array(df['rating'])

# save model to pickle
if use_svr:
    model, svr = get_model2(X_train, Y_train)
    _save_pickle(model, MODEL_FILE)
    _save_pickle(svr, MODEL_FILE + '_svr')
else:
    model = get_model(X_train, Y_train)
    # NOTE(review): the single model is written to MODEL_FILE + '_svr', not
    # MODEL_FILE. Kept as-is because the loading side is not visible here --
    # confirm this is the path the prediction script actually reads.
    _save_pickle(model, MODEL_FILE + '_svr')