-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_missing_value_datasets.py
131 lines (106 loc) · 4.48 KB
/
generate_missing_value_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import argparse
import csv
import math
import numpy as np
import os
import pandas as pd
import random
def get_dataframe_as_float_from_csv(csv_filename):
"""
Returns a dataframe loaded from CSV and parsed as floats. The reason why
we need the columns to be floats is because NaN can only be inserted into
float columns.
TODO: Should we use null instead? Maybe that doesn't suffer from the NaN
problem, where the column dtype needs to be a float.
"""
data = pd.read_csv(csv_filename)
for col in data:
if data[col].dtypes == 'int64':
data[col] = data[col].astype(float)
return data
def get_df_with_missing_value_MCAR(df, sample_fraction):
"""
Inserts missing values MCAR (Missing Completely At Random) into the input
data set, using the specified sample fraction.
Note that if the input dataframe already contains NaN, then the resulting
fraction of NaN may be larger than the specified sample_fraction.
"""
if sample_fraction < 0.0 or sample_fraction > 1.0:
raise ValueError("sample fraction must be number between 0 and 1")
print('Statistics of original data set:')
print(df.describe())
# Copy into a separate dataframe. Note that if the original dataframe
# is prohibitively large, this call is doubling that size.
df_with_nan = df.copy()
# Create an array where every tuple is the row and column number.
ix = [(row, col) for row in range(df_with_nan.shape[0])
for col in range(df_with_nan.shape[1])]
# Then for each (row, col) in the random sample, insert a NaN on that cell.
for row, col in random.sample(ix, int(round(sample_fraction * len(ix)))):
df_with_nan.iat[row, col] = np.nan
return df_with_nan
def get_target_df_column_names(df):
"""
Returns an array of column names that are renamed versions of the column
names from the input dataframe.
"""
target_df_column_names = []
for column in df:
target_df_column_names.append(column + '_target')
return target_df_column_names
def generate_training_and_test_datasets(csv_filename, sample_fraction):
"""
Returns a dataframe concatenating the dataframe with inserted missing
values, and the target variables dataframe (essentially the original data).
"""
target_df = get_dataframe_as_float_from_csv(csv_filename)
df_with_nan = get_df_with_missing_value_MCAR(target_df, sample_fraction)
target_df.columns = get_target_df_column_names(target_df)
# TODO: Figure out whether we'll use this tuple column technique in any way.
#
# concatenated = pd.concat([df_with_nan, target_df], axis=1)
# concatenated['target'] = concatenated[target_df_column_names].apply(tuple, axis=1)
#
training_df = pd.concat([df_with_nan, target_df], axis=1)
# Use a 70-30 breakdown for training and test data sets, where upper 70%
# are training, and bottom 30% rows are test.
#
# FIXME do we need to shuffle the rows first?
training_set_num_rows = int(math.ceil(.7 * float(training_df.shape[0])))
test_set_num_rows = int(math.floor(.3 * float(training_df.shape[0])))
output_filename_prefix = os.path.splitext(csv_filename)
training_set_filename = output_filename_prefix[0] + "_training.csv"
test_set_filename = output_filename_prefix[0] + "_test.csv"
training_df.head(training_set_num_rows).to_csv(
training_set_filename,
index=False,
quoting=csv.QUOTE_ALL,
quotechar='"')
df_with_nan.tail(test_set_num_rows).to_csv(
test_set_filename,
index=False,
quoting=csv.QUOTE_ALL,
quotechar='"')
print('Saved training data set under \"%s\"' % training_set_filename)
print('Saved test data set under \"%s\"' % test_set_filename)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Generate missing values training and test data sets.')
parser.add_argument(
'-f',
'--input_csv_file',
help='Relative path of input CSV file containing data set with numeric'
' columns.',
required='True')
parser.add_argument(
'-m',
'--missing_value_fraction',
type=float,
help='Fraction of missing values to be inserted.',
required='True')
args = parser.parse_args()
print('Generating missing value training and test data sets from \"%s\"'
% args.input_csv_file)
generate_training_and_test_datasets(
args.input_csv_file,
args.missing_value_fraction)